1 /* regexec.c 2 */ 3 4 /* 5 * One Ring to rule them all, One Ring to find them 6 * 7 * [p.v of _The Lord of the Rings_, opening poem] 8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"] 9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"] 10 */ 11 12 /* This file contains functions for executing a regular expression. See 13 * also regcomp.c which funnily enough, contains functions for compiling 14 * a regular expression. 15 * 16 * This file is also copied at build time to ext/re/re_exec.c, where 17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT. 18 * This causes the main functions to be compiled under new names and with 19 * debugging support added, which makes "use re 'debug'" work. 20 */ 21 22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not 23 * confused with the original package (see point 3 below). Thanks, Henry! 24 */ 25 26 /* Additional note: this code is very heavily munged from Henry's version 27 * in places. In some spots I've traded clarity for efficiency, so don't 28 * blame Henry for some of the lack of readability. 29 */ 30 31 /* The names of the functions have been changed from regcomp and 32 * regexec to pregcomp and pregexec in order to avoid conflicts 33 * with the POSIX routines of the same names. 34 */ 35 36 #ifdef PERL_EXT_RE_BUILD 37 #include "re_top.h" 38 #endif 39 40 /* 41 * pregcomp and pregexec -- regsub and regerror are not used in perl 42 * 43 * Copyright (c) 1986 by University of Toronto. 44 * Written by Henry Spencer. Not derived from licensed software. 45 * 46 * Permission is granted to anyone to use this software for any 47 * purpose on any computer system, and to redistribute it freely, 48 * subject to the following restrictions: 49 * 50 * 1. The author is not responsible for the consequences of use of 51 * this software, no matter how awful, even if they arise 52 * from defects in it. 53 * 54 * 2. The origin of this software must not be misrepresented, either 55 * by explicit claim or by omission. 56 * 57 * 3. Altered versions must be plainly marked as such, and must not 58 * be misrepresented as being the original software. 59 * 60 **** Alterations to Henry's code are... 61 **** 62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 64 **** by Larry Wall and others 65 **** 66 **** You may distribute under the terms of either the GNU General Public 67 **** License or the Artistic License, as specified in the README file. 68 * 69 * Beware that some of this code is subtly aware of the way operator 70 * precedence is structured in regular expressions. Serious changes in 71 * regular-expression syntax might require a total rethink. 72 */ 73 #include "EXTERN.h" 74 #define PERL_IN_REGEXEC_C 75 #include "perl.h" 76 77 #ifdef PERL_IN_XSUB_RE 78 # include "re_comp.h" 79 #else 80 # include "regcomp.h" 81 #endif 82 83 #include "invlist_inline.h" 84 #include "unicode_constants.h" 85 86 #define B_ON_NON_UTF8_LOCALE_IS_WRONG \ 87 "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale" 88 89 static const char utf8_locale_required[] = 90 "Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale"; 91 92 #ifdef DEBUGGING 93 /* At least one required character in the target string is expressible only in 94 * UTF-8. */ 95 static const char non_utf8_target_but_utf8_required[] 96 = "Can't match, because target string needs to be in UTF-8\n"; 97 #endif 98 99 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \ 100 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%s", non_utf8_target_but_utf8_required));\ 101 goto target; \ 102 } STMT_END 103 104 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) 105 106 #ifndef STATIC 107 #define STATIC static 108 #endif 109 110 /* 111 * Forwards. 112 */ 113 114 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv)) 115 116 #define HOPc(pos,off) \ 117 (char *)(reginfo->is_utf8_target \ 118 ? reghop3((U8*)pos, off, \ 119 (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \ 120 : (U8*)(pos + off)) 121 122 /* like HOPMAYBE3 but backwards. lim must be +ve. Returns NULL on overshoot */ 123 #define HOPBACK3(pos, off, lim) \ 124 (reginfo->is_utf8_target \ 125 ? reghopmaybe3((U8*)pos, (SSize_t)0-off, (U8*)(lim)) \ 126 : (pos - off >= lim) \ 127 ? (U8*)pos - off \ 128 : NULL) 129 130 #define HOPBACKc(pos, off) ((char*)HOPBACK3(pos, off, reginfo->strbeg)) 131 132 #define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off)) 133 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim)) 134 135 /* lim must be +ve. Returns NULL on overshoot */ 136 #define HOPMAYBE3(pos,off,lim) \ 137 (reginfo->is_utf8_target \ 138 ? reghopmaybe3((U8*)pos, off, (U8*)(lim)) \ 139 : ((U8*)pos + off <= lim) \ 140 ? (U8*)pos + off \ 141 : NULL) 142 143 /* like HOP3, but limits the result to <= lim even for the non-utf8 case. 144 * off must be >=0; args should be vars rather than expressions */ 145 #define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \ 146 ? reghop3((U8*)(pos), off, (U8*)(lim)) \ 147 : (U8*)((pos + off) > lim ? lim : (pos + off))) 148 #define HOP3clim(pos,off,lim) ((char*)HOP3lim(pos,off,lim)) 149 150 #define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \ 151 ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \ 152 : (U8*)(pos + off)) 153 #define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim)) 154 155 #define PLACEHOLDER /* Something for the preprocessor to grab onto */ 156 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */ 157 158 /* for use after a quantifier and before an EXACT-like node -- japhy */ 159 /* it would be nice to rework regcomp.sym to generate this stuff. sigh 160 * 161 * NOTE that *nothing* that affects backtracking should be in here, specifically 162 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a 163 * node that is in between two EXACT like nodes when ascertaining what the required 164 * "follow" character is. This should probably be moved to regex compile time 165 * although it may be done at run time beause of the REF possibility - more 166 * investigation required. -- demerphq 167 */ 168 #define JUMPABLE(rn) ( \ 169 OP(rn) == OPEN || \ 170 (OP(rn) == CLOSE && \ 171 !EVAL_CLOSE_PAREN_IS(cur_eval,ARG(rn)) ) || \ 172 OP(rn) == EVAL || \ 173 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \ 174 OP(rn) == PLUS || OP(rn) == MINMOD || \ 175 OP(rn) == KEEPS || \ 176 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \ 177 ) 178 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT) 179 180 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF ) 181 182 /* 183 Search for mandatory following text node; for lookahead, the text must 184 follow but for lookbehind (rn->flags != 0) we skip to the next step. 185 */ 186 #define FIND_NEXT_IMPT(rn) STMT_START { \ 187 while (JUMPABLE(rn)) { \ 188 const OPCODE type = OP(rn); \ 189 if (type == SUSPEND || PL_regkind[type] == CURLY) \ 190 rn = NEXTOPER(NEXTOPER(rn)); \ 191 else if (type == PLUS) \ 192 rn = NEXTOPER(rn); \ 193 else if (type == IFMATCH) \ 194 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \ 195 else rn += NEXT_OFF(rn); \ 196 } \ 197 } STMT_END 198 199 #define SLAB_FIRST(s) (&(s)->states[0]) 200 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1]) 201 202 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo); 203 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg); 204 static regmatch_state * S_push_slab(pTHX); 205 206 #define REGCP_PAREN_ELEMS 3 207 #define REGCP_OTHER_ELEMS 3 208 #define REGCP_FRAME_ELEMS 1 209 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and 210 * are needed for the regexp context stack bookkeeping. */ 211 212 STATIC CHECKPOINT 213 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen _pDEPTH) 214 { 215 const int retval = PL_savestack_ix; 216 const int paren_elems_to_push = 217 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS; 218 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS; 219 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT; 220 I32 p; 221 DECLARE_AND_GET_RE_DEBUG_FLAGS; 222 223 PERL_ARGS_ASSERT_REGCPPUSH; 224 225 if (paren_elems_to_push < 0) 226 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u", 227 (int)paren_elems_to_push, (int)maxopenparen, 228 (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS); 229 230 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems) 231 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %" UVuf 232 " out of range (%lu-%ld)", 233 total_elems, 234 (unsigned long)maxopenparen, 235 (long)parenfloor); 236 237 SSGROW(total_elems + REGCP_FRAME_ELEMS); 238 239 DEBUG_BUFFERS_r( 240 if ((int)maxopenparen > (int)parenfloor) 241 Perl_re_exec_indentf( aTHX_ 242 "rex=0x%" UVxf " offs=0x%" UVxf ": saving capture indices:\n", 243 depth, 244 PTR2UV(rex), 245 PTR2UV(rex->offs) 246 ); 247 ); 248 for (p = parenfloor+1; p <= (I32)maxopenparen; p++) { 249 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */ 250 SSPUSHIV(rex->offs[p].end); 251 SSPUSHIV(rex->offs[p].start); 252 SSPUSHINT(rex->offs[p].start_tmp); 253 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ 254 " \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "\n", 255 depth, 256 (UV)p, 257 (IV)rex->offs[p].start, 258 (IV)rex->offs[p].start_tmp, 259 (IV)rex->offs[p].end 260 )); 261 } 262 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */ 263 SSPUSHINT(maxopenparen); 264 SSPUSHINT(rex->lastparen); 265 SSPUSHINT(rex->lastcloseparen); 266 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */ 267 268 return retval; 269 } 270 271 /* These are needed since we do not localize EVAL nodes: */ 272 #define REGCP_SET(cp) \ 273 DEBUG_STATE_r( \ 274 Perl_re_exec_indentf( aTHX_ \ 275 "Setting an EVAL scope, savestack=%" IVdf ",\n", \ 276 depth, (IV)PL_savestack_ix \ 277 ) \ 278 ); \ 279 cp = PL_savestack_ix 280 281 #define REGCP_UNWIND(cp) \ 282 DEBUG_STATE_r( \ 283 if (cp != PL_savestack_ix) \ 284 Perl_re_exec_indentf( aTHX_ \ 285 "Clearing an EVAL scope, savestack=%" \ 286 IVdf "..%" IVdf "\n", \ 287 depth, (IV)(cp), (IV)PL_savestack_ix \ 288 ) \ 289 ); \ 290 regcpblow(cp) 291 292 /* set the start and end positions of capture ix */ 293 #define CLOSE_CAPTURE(ix, s, e) \ 294 rex->offs[ix].start = s; \ 295 rex->offs[ix].end = e; \ 296 if (ix > rex->lastparen) \ 297 rex->lastparen = ix; \ 298 rex->lastcloseparen = ix; \ 299 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ \ 300 "CLOSE: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf "..%" IVdf " max: %" UVuf "\n", \ 301 depth, \ 302 PTR2UV(rex), \ 303 PTR2UV(rex->offs), \ 304 (UV)ix, \ 305 (IV)rex->offs[ix].start, \ 306 (IV)rex->offs[ix].end, \ 307 (UV)rex->lastparen \ 308 )) 309 310 #define UNWIND_PAREN(lp, lcp) \ 311 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ \ 312 "UNWIND_PAREN: rex=0x%" UVxf " offs=0x%" UVxf ": invalidate (%" UVuf "..%" UVuf "] set lcp: %" UVuf "\n", \ 313 depth, \ 314 PTR2UV(rex), \ 315 PTR2UV(rex->offs), \ 316 (UV)(lp), \ 317 (UV)(rex->lastparen), \ 318 (UV)(lcp) \ 319 )); \ 320 for (n = rex->lastparen; n > lp; n--) \ 321 rex->offs[n].end = -1; \ 322 rex->lastparen = n; \ 323 rex->lastcloseparen = lcp; 324 325 326 STATIC void 327 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p _pDEPTH) 328 { 329 UV i; 330 U32 paren; 331 DECLARE_AND_GET_RE_DEBUG_FLAGS; 332 333 PERL_ARGS_ASSERT_REGCPPOP; 334 335 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */ 336 i = SSPOPUV; 337 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */ 338 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */ 339 rex->lastcloseparen = SSPOPINT; 340 rex->lastparen = SSPOPINT; 341 *maxopenparen_p = SSPOPINT; 342 343 i -= REGCP_OTHER_ELEMS; 344 /* Now restore the parentheses context. */ 345 DEBUG_BUFFERS_r( 346 if (i || rex->lastparen + 1 <= rex->nparens) 347 Perl_re_exec_indentf( aTHX_ 348 "rex=0x%" UVxf " offs=0x%" UVxf ": restoring capture indices to:\n", 349 depth, 350 PTR2UV(rex), 351 PTR2UV(rex->offs) 352 ); 353 ); 354 paren = *maxopenparen_p; 355 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) { 356 SSize_t tmps; 357 rex->offs[paren].start_tmp = SSPOPINT; 358 rex->offs[paren].start = SSPOPIV; 359 tmps = SSPOPIV; 360 if (paren <= rex->lastparen) 361 rex->offs[paren].end = tmps; 362 DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_ 363 " \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "%s\n", 364 depth, 365 (UV)paren, 366 (IV)rex->offs[paren].start, 367 (IV)rex->offs[paren].start_tmp, 368 (IV)rex->offs[paren].end, 369 (paren > rex->lastparen ? "(skipped)" : "")); 370 ); 371 paren--; 372 } 373 #if 1 374 /* It would seem that the similar code in regtry() 375 * already takes care of this, and in fact it is in 376 * a better location to since this code can #if 0-ed out 377 * but the code in regtry() is needed or otherwise tests 378 * requiring null fields (pat.t#187 and split.t#{13,14} 379 * (as of patchlevel 7877) will fail. Then again, 380 * this code seems to be necessary or otherwise 381 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/ 382 * --jhi updated by dapm */ 383 for (i = rex->lastparen + 1; i <= rex->nparens; i++) { 384 if (i > *maxopenparen_p) 385 rex->offs[i].start = -1; 386 rex->offs[i].end = -1; 387 DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_ 388 " \\%" UVuf ": %s ..-1 undeffing\n", 389 depth, 390 (UV)i, 391 (i > *maxopenparen_p) ? "-1" : " " 392 )); 393 } 394 #endif 395 } 396 397 /* restore the parens and associated vars at savestack position ix, 398 * but without popping the stack */ 399 400 STATIC void 401 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p _pDEPTH) 402 { 403 I32 tmpix = PL_savestack_ix; 404 PERL_ARGS_ASSERT_REGCP_RESTORE; 405 406 PL_savestack_ix = ix; 407 regcppop(rex, maxopenparen_p); 408 PL_savestack_ix = tmpix; 409 } 410 411 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */ 412 413 #ifndef PERL_IN_XSUB_RE 414 415 bool 416 Perl_isFOO_lc(pTHX_ const U8 classnum, const U8 character) 417 { 418 /* Returns a boolean as to whether or not 'character' is a member of the 419 * Posix character class given by 'classnum' that should be equivalent to a 420 * value in the typedef '_char_class_number'. 421 * 422 * Ideally this could be replaced by a just an array of function pointers 423 * to the C library functions that implement the macros this calls. 424 * However, to compile, the precise function signatures are required, and 425 * these may vary from platform to platform. To avoid having to figure 426 * out what those all are on each platform, I (khw) am using this method, 427 * which adds an extra layer of function call overhead (unless the C 428 * optimizer strips it away). But we don't particularly care about 429 * performance with locales anyway. */ 430 431 switch ((_char_class_number) classnum) { 432 case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character); 433 case _CC_ENUM_ALPHA: return isALPHA_LC(character); 434 case _CC_ENUM_ASCII: return isASCII_LC(character); 435 case _CC_ENUM_BLANK: return isBLANK_LC(character); 436 case _CC_ENUM_CASED: return isLOWER_LC(character) 437 || isUPPER_LC(character); 438 case _CC_ENUM_CNTRL: return isCNTRL_LC(character); 439 case _CC_ENUM_DIGIT: return isDIGIT_LC(character); 440 case _CC_ENUM_GRAPH: return isGRAPH_LC(character); 441 case _CC_ENUM_LOWER: return isLOWER_LC(character); 442 case _CC_ENUM_PRINT: return isPRINT_LC(character); 443 case _CC_ENUM_PUNCT: return isPUNCT_LC(character); 444 case _CC_ENUM_SPACE: return isSPACE_LC(character); 445 case _CC_ENUM_UPPER: return isUPPER_LC(character); 446 case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character); 447 case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character); 448 default: /* VERTSPACE should never occur in locales */ 449 Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum); 450 } 451 452 NOT_REACHED; /* NOTREACHED */ 453 return FALSE; 454 } 455 456 #endif 457 458 PERL_STATIC_INLINE I32 459 S_foldEQ_latin1_s2_folded(const char *s1, const char *s2, I32 len) 460 { 461 /* Compare non-UTF-8 using Unicode (Latin1) semantics. s2 must already be 462 * folded. Works on all folds representable without UTF-8, except for 463 * LATIN_SMALL_LETTER_SHARP_S, and does not check for this. Nor does it 464 * check that the strings each have at least 'len' characters. 465 * 466 * There is almost an identical API function where s2 need not be folded: 467 * Perl_foldEQ_latin1() */ 468 469 const U8 *a = (const U8 *)s1; 470 const U8 *b = (const U8 *)s2; 471 472 PERL_ARGS_ASSERT_FOLDEQ_LATIN1_S2_FOLDED; 473 474 assert(len >= 0); 475 476 while (len--) { 477 assert(! isUPPER_L1(*b)); 478 if (toLOWER_L1(*a) != *b) { 479 return 0; 480 } 481 a++, b++; 482 } 483 return 1; 484 } 485 486 STATIC bool 487 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character, const U8* e) 488 { 489 /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded 490 * 'character' is a member of the Posix character class given by 'classnum' 491 * that should be equivalent to a value in the typedef 492 * '_char_class_number'. 493 * 494 * This just calls isFOO_lc on the code point for the character if it is in 495 * the range 0-255. Outside that range, all characters use Unicode 496 * rules, ignoring any locale. So use the Unicode function if this class 497 * requires an inversion list, and use the Unicode macro otherwise. */ 498 499 dVAR; 500 501 PERL_ARGS_ASSERT_ISFOO_UTF8_LC; 502 503 if (UTF8_IS_INVARIANT(*character)) { 504 return isFOO_lc(classnum, *character); 505 } 506 else if (UTF8_IS_DOWNGRADEABLE_START(*character)) { 507 return isFOO_lc(classnum, 508 EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1))); 509 } 510 511 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, e); 512 513 switch ((_char_class_number) classnum) { 514 case _CC_ENUM_SPACE: return is_XPERLSPACE_high(character); 515 case _CC_ENUM_BLANK: return is_HORIZWS_high(character); 516 case _CC_ENUM_XDIGIT: return is_XDIGIT_high(character); 517 case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character); 518 default: 519 return _invlist_contains_cp(PL_XPosix_ptrs[classnum], 520 utf8_to_uvchr_buf(character, e, NULL)); 521 } 522 523 return FALSE; /* Things like CNTRL are always below 256 */ 524 } 525 526 STATIC U8 * 527 S_find_span_end(U8 * s, const U8 * send, const U8 span_byte) 528 { 529 /* Returns the position of the first byte in the sequence between 's' and 530 * 'send-1' inclusive that isn't 'span_byte'; returns 'send' if none found. 531 * */ 532 533 PERL_ARGS_ASSERT_FIND_SPAN_END; 534 535 assert(send >= s); 536 537 if ((STRLEN) (send - s) >= PERL_WORDSIZE 538 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s) 539 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK)) 540 { 541 PERL_UINTMAX_T span_word; 542 543 /* Process per-byte until reach word boundary. XXX This loop could be 544 * eliminated if we knew that this platform had fast unaligned reads */ 545 while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) { 546 if (*s != span_byte) { 547 return s; 548 } 549 s++; 550 } 551 552 /* Create a word filled with the bytes we are spanning */ 553 span_word = PERL_COUNT_MULTIPLIER * span_byte; 554 555 /* Process per-word as long as we have at least a full word left */ 556 do { 557 558 /* Keep going if the whole word is composed of 'span_byte's */ 559 if ((* (PERL_UINTMAX_T *) s) == span_word) { 560 s += PERL_WORDSIZE; 561 continue; 562 } 563 564 /* Here, at least one byte in the word isn't 'span_byte'. */ 565 566 #ifdef EBCDIC 567 568 break; 569 570 #else 571 572 /* This xor leaves 1 bits only in those non-matching bytes */ 573 span_word ^= * (PERL_UINTMAX_T *) s; 574 575 /* Make sure the upper bit of each non-matching byte is set. This 576 * makes each such byte look like an ASCII platform variant byte */ 577 span_word |= span_word << 1; 578 span_word |= span_word << 2; 579 span_word |= span_word << 4; 580 581 /* That reduces the problem to what this function solves */ 582 return s + variant_byte_number(span_word); 583 584 #endif 585 586 } while (s + PERL_WORDSIZE <= send); 587 } 588 589 /* Process the straggler bytes beyond the final word boundary */ 590 while (s < send) { 591 if (*s != span_byte) { 592 return s; 593 } 594 s++; 595 } 596 597 return s; 598 } 599 600 STATIC U8 * 601 S_find_next_masked(U8 * s, const U8 * send, const U8 byte, const U8 mask) 602 { 603 /* Returns the position of the first byte in the sequence between 's' 604 * and 'send-1' inclusive that when ANDed with 'mask' yields 'byte'; 605 * returns 'send' if none found. It uses word-level operations instead of 606 * byte to speed up the process */ 607 608 PERL_ARGS_ASSERT_FIND_NEXT_MASKED; 609 610 assert(send >= s); 611 assert((byte & mask) == byte); 612 613 #ifndef EBCDIC 614 615 if ((STRLEN) (send - s) >= PERL_WORDSIZE 616 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s) 617 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK)) 618 { 619 PERL_UINTMAX_T word, mask_word; 620 621 while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) { 622 if (((*s) & mask) == byte) { 623 return s; 624 } 625 s++; 626 } 627 628 word = PERL_COUNT_MULTIPLIER * byte; 629 mask_word = PERL_COUNT_MULTIPLIER * mask; 630 631 do { 632 PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word; 633 634 /* If 'masked' contains bytes with the bit pattern of 'byte' within 635 * it, xoring with 'word' will leave each of the 8 bits in such 636 * bytes be 0, and no byte containing any other bit pattern will be 637 * 0. */ 638 masked ^= word; 639 640 /* This causes the most significant bit to be set to 1 for any 641 * bytes in the word that aren't completely 0 */ 642 masked |= masked << 1; 643 masked |= masked << 2; 644 masked |= masked << 4; 645 646 /* The msbits are the same as what marks a byte as variant, so we 647 * can use this mask. If all msbits are 1, the word doesn't 648 * contain 'byte' */ 649 if ((masked & PERL_VARIANTS_WORD_MASK) == PERL_VARIANTS_WORD_MASK) { 650 s += PERL_WORDSIZE; 651 continue; 652 } 653 654 /* Here, the msbit of bytes in the word that aren't 'byte' are 1, 655 * and any that are, are 0. Complement and re-AND to swap that */ 656 masked = ~ masked; 657 masked &= PERL_VARIANTS_WORD_MASK; 658 659 /* This reduces the problem to that solved by this function */ 660 s += variant_byte_number(masked); 661 return s; 662 663 } while (s + PERL_WORDSIZE <= send); 664 } 665 666 #endif 667 668 while (s < send) { 669 if (((*s) & mask) == byte) { 670 return s; 671 } 672 s++; 673 } 674 675 return s; 676 } 677 678 STATIC U8 * 679 S_find_span_end_mask(U8 * s, const U8 * send, const U8 span_byte, const U8 mask) 680 { 681 /* Returns the position of the first byte in the sequence between 's' and 682 * 'send-1' inclusive that when ANDed with 'mask' isn't 'span_byte'. 683 * 'span_byte' should have been ANDed with 'mask' in the call of this 684 * function. Returns 'send' if none found. Works like find_span_end(), 685 * except for the AND */ 686 687 PERL_ARGS_ASSERT_FIND_SPAN_END_MASK; 688 689 assert(send >= s); 690 assert((span_byte & mask) == span_byte); 691 692 if ((STRLEN) (send - s) >= PERL_WORDSIZE 693 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s) 694 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK)) 695 { 696 PERL_UINTMAX_T span_word, mask_word; 697 698 while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) { 699 if (((*s) & mask) != span_byte) { 700 return s; 701 } 702 s++; 703 } 704 705 span_word = PERL_COUNT_MULTIPLIER * span_byte; 706 mask_word = PERL_COUNT_MULTIPLIER * mask; 707 708 do { 709 PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word; 710 711 if (masked == span_word) { 712 s += PERL_WORDSIZE; 713 continue; 714 } 715 716 #ifdef EBCDIC 717 718 break; 719 720 #else 721 722 masked ^= span_word; 723 masked |= masked << 1; 724 masked |= masked << 2; 725 masked |= masked << 4; 726 return s + variant_byte_number(masked); 727 728 #endif 729 730 } while (s + PERL_WORDSIZE <= send); 731 } 732 733 while (s < send) { 734 if (((*s) & mask) != span_byte) { 735 return s; 736 } 737 s++; 738 } 739 740 return s; 741 } 742 743 /* 744 * pregexec and friends 745 */ 746 747 #ifndef PERL_IN_XSUB_RE 748 /* 749 - pregexec - match a regexp against a string 750 */ 751 I32 752 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend, 753 char *strbeg, SSize_t minend, SV *screamer, U32 nosave) 754 /* stringarg: the point in the string at which to begin matching */ 755 /* strend: pointer to null at end of string */ 756 /* strbeg: real beginning of string */ 757 /* minend: end of match must be >= minend bytes after stringarg. */ 758 /* screamer: SV being matched: only used for utf8 flag, pos() etc; string 759 * itself is accessed via the pointers above */ 760 /* nosave: For optimizations. */ 761 { 762 PERL_ARGS_ASSERT_PREGEXEC; 763 764 return 765 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL, 766 nosave ? 0 : REXEC_COPY_STR); 767 } 768 #endif 769 770 771 772 /* re_intuit_start(): 773 * 774 * Based on some optimiser hints, try to find the earliest position in the 775 * string where the regex could match. 776 * 777 * rx: the regex to match against 778 * sv: the SV being matched: only used for utf8 flag; the string 779 * itself is accessed via the pointers below. Note that on 780 * something like an overloaded SV, SvPOK(sv) may be false 781 * and the string pointers may point to something unrelated to 782 * the SV itself. 783 * strbeg: real beginning of string 784 * strpos: the point in the string at which to begin matching 785 * strend: pointer to the byte following the last char of the string 786 * flags currently unused; set to 0 787 * data: currently unused; set to NULL 788 * 789 * The basic idea of re_intuit_start() is to use some known information 790 * about the pattern, namely: 791 * 792 * a) the longest known anchored substring (i.e. one that's at a 793 * constant offset from the beginning of the pattern; but not 794 * necessarily at a fixed offset from the beginning of the 795 * string); 796 * b) the longest floating substring (i.e. one that's not at a constant 797 * offset from the beginning of the pattern); 798 * c) Whether the pattern is anchored to the string; either 799 * an absolute anchor: /^../, or anchored to \n: /^.../m, 800 * or anchored to pos(): /\G/; 801 * d) A start class: a real or synthetic character class which 802 * represents which characters are legal at the start of the pattern; 803 * 804 * to either quickly reject the match, or to find the earliest position 805 * within the string at which the pattern might match, thus avoiding 806 * running the full NFA engine at those earlier locations, only to 807 * eventually fail and retry further along. 808 * 809 * Returns NULL if the pattern can't match, or returns the address within 810 * the string which is the earliest place the match could occur. 811 * 812 * The longest of the anchored and floating substrings is called 'check' 813 * and is checked first. The other is called 'other' and is checked 814 * second. The 'other' substring may not be present. For example, 815 * 816 * /(abc|xyz)ABC\d{0,3}DEFG/ 817 * 818 * will have 819 * 820 * check substr (float) = "DEFG", offset 6..9 chars 821 * other substr (anchored) = "ABC", offset 3..3 chars 822 * stclass = [ax] 823 * 824 * Be aware that during the course of this function, sometimes 'anchored' 825 * refers to a substring being anchored relative to the start of the 826 * pattern, and sometimes to the pattern itself being anchored relative to 827 * the string. For example: 828 * 829 * /\dabc/: "abc" is anchored to the pattern; 830 * /^\dabc/: "abc" is anchored to the pattern and the string; 831 * /\d+abc/: "abc" is anchored to neither the pattern nor the string; 832 * /^\d+abc/: "abc" is anchored to neither the pattern nor the string, 833 * but the pattern is anchored to the string. 834 */ 835 836 char * 837 Perl_re_intuit_start(pTHX_ 838 REGEXP * const rx, 839 SV *sv, 840 const char * const strbeg, 841 char *strpos, 842 char *strend, 843 const U32 flags, 844 re_scream_pos_data *data) 845 { 846 struct regexp *const prog = ReANY(rx); 847 SSize_t start_shift = prog->check_offset_min; 848 /* Should be nonnegative! */ 849 SSize_t end_shift = 0; 850 /* current lowest pos in string where the regex can start matching */ 851 char *rx_origin = strpos; 852 SV *check; 853 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */ 854 U8 other_ix = 1 - prog->substrs->check_ix; 855 bool ml_anch = 0; 856 char *other_last = strpos;/* latest pos 'other' substr already checked to */ 857 char *check_at = NULL; /* check substr found at this pos */ 858 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE; 859 RXi_GET_DECL(prog,progi); 860 regmatch_info reginfo_buf; /* create some info to pass to find_byclass */ 861 regmatch_info *const reginfo = ®info_buf; 862 DECLARE_AND_GET_RE_DEBUG_FLAGS; 863 864 PERL_ARGS_ASSERT_RE_INTUIT_START; 865 PERL_UNUSED_ARG(flags); 866 PERL_UNUSED_ARG(data); 867 868 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 869 "Intuit: trying to determine minimum start position...\n")); 870 871 /* for now, assume that all substr offsets are positive. If at some point 872 * in the future someone wants to do clever things with lookbehind and 873 * -ve offsets, they'll need to fix up any code in this function 874 * which uses these offsets. See the thread beginning 875 * <20140113145929.GF27210@iabyn.com> 876 */ 877 assert(prog->substrs->data[0].min_offset >= 0); 878 assert(prog->substrs->data[0].max_offset >= 0); 879 assert(prog->substrs->data[1].min_offset >= 0); 880 assert(prog->substrs->data[1].max_offset >= 0); 881 assert(prog->substrs->data[2].min_offset >= 0); 882 assert(prog->substrs->data[2].max_offset >= 0); 883 884 /* for now, assume that if both present, that the floating substring 885 * doesn't start before the anchored substring. 886 * If you break this assumption (e.g. doing better optimisations 887 * with lookahead/behind), then you'll need to audit the code in this 888 * function carefully first 889 */ 890 assert( 891 ! ( (prog->anchored_utf8 || prog->anchored_substr) 892 && (prog->float_utf8 || prog->float_substr)) 893 || (prog->float_min_offset >= prog->anchored_offset)); 894 895 /* byte rather than char calculation for efficiency. It fails 896 * to quickly reject some cases that can't match, but will reject 897 * them later after doing full char arithmetic */ 898 if (prog->minlen > strend - strpos) { 899 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 900 " String too short...\n")); 901 goto fail; 902 } 903 904 RXp_MATCH_UTF8_set(prog, utf8_target); 905 reginfo->is_utf8_target = cBOOL(utf8_target); 906 reginfo->info_aux = NULL; 907 reginfo->strbeg = strbeg; 908 reginfo->strend = strend; 909 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx)); 910 reginfo->intuit = 1; 911 /* not actually used within intuit, but zero for safety anyway */ 912 reginfo->poscache_maxiter = 0; 913 914 if (utf8_target) { 915 if ((!prog->anchored_utf8 && prog->anchored_substr) 916 || (!prog->float_utf8 && prog->float_substr)) 917 to_utf8_substr(prog); 918 check = prog->check_utf8; 919 } else { 920 if (!prog->check_substr && prog->check_utf8) { 921 if (! to_byte_substr(prog)) { 922 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail); 923 } 924 } 925 check = prog->check_substr; 926 } 927 928 /* dump the various substring data */ 929 DEBUG_OPTIMISE_MORE_r({ 930 int i; 931 for (i=0; i<=2; i++) { 932 SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr 933 : prog->substrs->data[i].substr); 934 if (!sv) 935 continue; 936 937 Perl_re_printf( aTHX_ 938 " substrs[%d]: min=%" IVdf " max=%" IVdf " end shift=%" IVdf 939 " useful=%" IVdf " utf8=%d [%s]\n", 940 i, 941 (IV)prog->substrs->data[i].min_offset, 942 (IV)prog->substrs->data[i].max_offset, 943 (IV)prog->substrs->data[i].end_shift, 944 BmUSEFUL(sv), 945 utf8_target ? 1 : 0, 946 SvPEEK(sv)); 947 } 948 }); 949 950 if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */ 951 952 /* ml_anch: check after \n? 953 * 954 * A note about PREGf_IMPLICIT: on an un-anchored pattern beginning 955 * with /.*.../, these flags will have been added by the 956 * compiler: 957 * /.*abc/, /.*abc/m: PREGf_IMPLICIT | PREGf_ANCH_MBOL 958 * /.*abc/s: PREGf_IMPLICIT | PREGf_ANCH_SBOL 959 */ 960 ml_anch = (prog->intflags & PREGf_ANCH_MBOL) 961 && !(prog->intflags & PREGf_IMPLICIT); 962 963 if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) { 964 /* we are only allowed to match at BOS or \G */ 965 966 /* trivially reject if there's a BOS anchor and we're not at BOS. 967 * 968 * Note that we don't try to do a similar quick reject for 969 * \G, since generally the caller will have calculated strpos 970 * based on pos() and gofs, so the string is already correctly 971 * anchored by definition; and handling the exceptions would 972 * be too fiddly (e.g. REXEC_IGNOREPOS). 973 */ 974 if ( strpos != strbeg 975 && (prog->intflags & PREGf_ANCH_SBOL)) 976 { 977 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 978 " Not at start...\n")); 979 goto fail; 980 } 981 982 /* in the presence of an anchor, the anchored (relative to the 983 * start of the regex) substr must also be anchored relative 984 * to strpos. So quickly reject if substr isn't found there. 985 * This works for \G too, because the caller will already have 986 * subtracted gofs from pos, and gofs is the offset from the 987 * \G to the start of the regex. For example, in /.abc\Gdef/, 988 * where substr="abcdef", pos()=3, gofs=4, offset_min=1: 989 * caller will have set strpos=pos()-4; we look for the substr 990 * at position pos()-4+1, which lines up with the "a" */ 991 992 if (prog->check_offset_min == prog->check_offset_max) { 993 /* Substring at constant offset from beg-of-str... */ 994 SSize_t slen = SvCUR(check); 995 char *s = HOP3c(strpos, prog->check_offset_min, strend); 996 997 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 998 " Looking for check substr at fixed offset %" IVdf "...\n", 999 (IV)prog->check_offset_min)); 1000 1001 if (SvTAIL(check)) { 1002 /* In this case, the regex is anchored at the end too. 1003 * Unless it's a multiline match, the lengths must match 1004 * exactly, give or take a \n. NB: slen >= 1 since 1005 * the last char of check is \n */ 1006 if (!multiline 1007 && ( strend - s > slen 1008 || strend - s < slen - 1 1009 || (strend - s == slen && strend[-1] != '\n'))) 1010 { 1011 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1012 " String too long...\n")); 1013 goto fail_finish; 1014 } 1015 /* Now should match s[0..slen-2] */ 1016 slen--; 1017 } 1018 if (slen && (strend - s < slen 1019 || *SvPVX_const(check) != *s 1020 || (slen > 1 && (memNE(SvPVX_const(check), s, slen))))) 1021 { 1022 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1023 " String not equal...\n")); 1024 goto fail_finish; 1025 } 1026 1027 check_at = s; 1028 goto success_at_start; 1029 } 1030 } 1031 } 1032 1033 end_shift = prog->check_end_shift; 1034 1035 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */ 1036 if (end_shift < 0) 1037 Perl_croak(aTHX_ "panic: end_shift: %" IVdf " pattern:\n%s\n ", 1038 (IV)end_shift, RX_PRECOMP(rx)); 1039 #endif 1040 1041 restart: 1042 1043 /* This is the (re)entry point of the main loop in this function. 1044 * The goal of this loop is to: 1045 * 1) find the "check" substring in the region rx_origin..strend 1046 * (adjusted by start_shift / end_shift). If not found, reject 1047 * immediately. 1048 * 2) If it exists, look for the "other" substr too if defined; for 1049 * example, if the check substr maps to the anchored substr, then 1050 * check the floating substr, and vice-versa. If not found, go 1051 * back to (1) with rx_origin suitably incremented. 1052 * 3) If we find an rx_origin position that doesn't contradict 1053 * either of the substrings, then check the possible additional 1054 * constraints on rx_origin of /^.../m or a known start class. 1055 * If these fail, then depending on which constraints fail, jump 1056 * back to here, or to various other re-entry points further along 1057 * that skip some of the first steps. 1058 * 4) If we pass all those tests, update the BmUSEFUL() count on the 1059 * substring. If the start position was determined to be at the 1060 * beginning of the string - so, not rejected, but not optimised, 1061 * since we have to run regmatch from position 0 - decrement the 1062 * BmUSEFUL() count. Otherwise increment it. 1063 */ 1064 1065 1066 /* first, look for the 'check' substring */ 1067 1068 { 1069 U8* start_point; 1070 U8* end_point; 1071 1072 DEBUG_OPTIMISE_MORE_r({ 1073 Perl_re_printf( aTHX_ 1074 " At restart: rx_origin=%" IVdf " Check offset min: %" IVdf 1075 " Start shift: %" IVdf " End shift %" IVdf 1076 " Real end Shift: %" IVdf "\n", 1077 (IV)(rx_origin - strbeg), 1078 (IV)prog->check_offset_min, 1079 (IV)start_shift, 1080 (IV)end_shift, 1081 (IV)prog->check_end_shift); 1082 }); 1083 1084 end_point = HOPBACK3(strend, end_shift, rx_origin); 1085 if (!end_point) 1086 goto fail_finish; 1087 start_point = HOPMAYBE3(rx_origin, start_shift, end_point); 1088 if (!start_point) 1089 goto fail_finish; 1090 1091 1092 /* If the regex is absolutely anchored to either the start of the 1093 * string (SBOL) or to pos() (ANCH_GPOS), then 1094 * check_offset_max represents an upper bound on the string where 1095 * the substr could start. For the ANCH_GPOS case, we assume that 1096 * the caller of intuit will have already set strpos to 1097 * pos()-gofs, so in this case strpos + offset_max will still be 1098 * an upper bound on the substr. 1099 */ 1100 if (!ml_anch 1101 && prog->intflags & PREGf_ANCH 1102 && prog->check_offset_max != SSize_t_MAX) 1103 { 1104 SSize_t check_len = SvCUR(check) - !!SvTAIL(check); 1105 const char * const anchor = 1106 (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg); 1107 SSize_t targ_len = (char*)end_point - anchor; 1108 1109 if (check_len > targ_len) { 1110 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1111 "Target string too short to match required substring...\n")); 1112 goto fail_finish; 1113 } 1114 1115 /* do a bytes rather than chars comparison. It's conservative; 1116 * so it skips doing the HOP if the result can't possibly end 1117 * up earlier than the old value of end_point. 1118 */ 1119 assert(anchor + check_len <= (char *)end_point); 1120 if (prog->check_offset_max + check_len < targ_len) { 1121 end_point = HOP3lim((U8*)anchor, 1122 prog->check_offset_max, 1123 end_point - check_len 1124 ) 1125 + check_len; 1126 if (end_point < start_point) 1127 goto fail_finish; 1128 } 1129 } 1130 1131 check_at = fbm_instr( start_point, end_point, 1132 check, multiline ? FBMrf_MULTILINE : 0); 1133 1134 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1135 " doing 'check' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n", 1136 (IV)((char*)start_point - strbeg), 1137 (IV)((char*)end_point - strbeg), 1138 (IV)(check_at ? check_at - strbeg : -1) 1139 )); 1140 1141 /* Update the count-of-usability, remove useless subpatterns, 1142 unshift s. */ 1143 1144 DEBUG_EXECUTE_r({ 1145 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), 1146 SvPVX_const(check), RE_SV_DUMPLEN(check), 30); 1147 Perl_re_printf( aTHX_ " %s %s substr %s%s%s", 1148 (check_at ? "Found" : "Did not find"), 1149 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) 1150 ? "anchored" : "floating"), 1151 quoted, 1152 RE_SV_TAIL(check), 1153 (check_at ? " at offset " : "...\n") ); 1154 }); 1155 1156 if (!check_at) 1157 goto fail_finish; 1158 /* set rx_origin to the minimum position where the regex could start 1159 * matching, given the constraint of the just-matched check substring. 1160 * But don't set it lower than previously. 1161 */ 1162 1163 if (check_at - rx_origin > prog->check_offset_max) 1164 rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin); 1165 /* Finish the diagnostic message */ 1166 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1167 "%ld (rx_origin now %" IVdf ")...\n", 1168 (long)(check_at - strbeg), 1169 (IV)(rx_origin - strbeg) 1170 )); 1171 } 1172 1173 1174 /* now look for the 'other' substring if defined */ 1175 1176 if (prog->substrs->data[other_ix].utf8_substr 1177 || prog->substrs->data[other_ix].substr) 1178 { 1179 /* Take into account the "other" substring. */ 1180 char *last, *last1; 1181 char *s; 1182 SV* must; 1183 struct reg_substr_datum *other; 1184 1185 do_other_substr: 1186 other = &prog->substrs->data[other_ix]; 1187 if (!utf8_target && !other->substr) { 1188 if (!to_byte_substr(prog)) { 1189 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail); 1190 } 1191 } 1192 1193 /* if "other" is anchored: 1194 * we've previously found a floating substr starting at check_at. 1195 * This means that the regex origin must lie somewhere 1196 * between min (rx_origin): HOP3(check_at, -check_offset_max) 1197 * and max: HOP3(check_at, -check_offset_min) 1198 * (except that min will be >= strpos) 1199 * So the fixed substr must lie somewhere between 1200 * HOP3(min, anchored_offset) 1201 * HOP3(max, anchored_offset) + SvCUR(substr) 1202 */ 1203 1204 /* if "other" is floating 1205 * Calculate last1, the absolute latest point where the 1206 * floating substr could start in the string, ignoring any 1207 * constraints from the earlier fixed match. It is calculated 1208 * as follows: 1209 * 1210 * strend - prog->minlen (in chars) is the absolute latest 1211 * position within the string where the origin of the regex 1212 * could appear. The latest start point for the floating 1213 * substr is float_min_offset(*) on from the start of the 1214 * regex. last1 simply combines thee two offsets. 1215 * 1216 * (*) You might think the latest start point should be 1217 * float_max_offset from the regex origin, and technically 1218 * you'd be correct. However, consider 1219 * /a\d{2,4}bcd\w/ 1220 * Here, float min, max are 3,5 and minlen is 7. 1221 * This can match either 1222 * /a\d\dbcd\w/ 1223 * /a\d\d\dbcd\w/ 1224 * /a\d\d\d\dbcd\w/ 1225 * In the first case, the regex matches minlen chars; in the 1226 * second, minlen+1, in the third, minlen+2. 1227 * In the first case, the floating offset is 3 (which equals 1228 * float_min), in the second, 4, and in the third, 5 (which 1229 * equals float_max). In all cases, the floating string bcd 1230 * can never start more than 4 chars from the end of the 1231 * string, which equals minlen - float_min. As the substring 1232 * starts to match more than float_min from the start of the 1233 * regex, it makes the regex match more than minlen chars, 1234 * and the two cancel each other out. So we can always use 1235 * float_min - minlen, rather than float_max - minlen for the 1236 * latest position in the string. 1237 * 1238 * Note that -minlen + float_min_offset is equivalent (AFAIKT) 1239 * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift 1240 */ 1241 1242 assert(prog->minlen >= other->min_offset); 1243 last1 = HOP3c(strend, 1244 other->min_offset - prog->minlen, strbeg); 1245 1246 if (other_ix) {/* i.e. if (other-is-float) */ 1247 /* last is the latest point where the floating substr could 1248 * start, *given* any constraints from the earlier fixed 1249 * match. This constraint is that the floating string starts 1250 * <= float_max_offset chars from the regex origin (rx_origin). 1251 * If this value is less than last1, use it instead. 1252 */ 1253 assert(rx_origin <= last1); 1254 last = 1255 /* this condition handles the offset==infinity case, and 1256 * is a short-cut otherwise. Although it's comparing a 1257 * byte offset to a char length, it does so in a safe way, 1258 * since 1 char always occupies 1 or more bytes, 1259 * so if a string range is (last1 - rx_origin) bytes, 1260 * it will be less than or equal to (last1 - rx_origin) 1261 * chars; meaning it errs towards doing the accurate HOP3 1262 * rather than just using last1 as a short-cut */ 1263 (last1 - rx_origin) < other->max_offset 1264 ? last1 1265 : (char*)HOP3lim(rx_origin, other->max_offset, last1); 1266 } 1267 else { 1268 assert(strpos + start_shift <= check_at); 1269 last = HOP4c(check_at, other->min_offset - start_shift, 1270 strbeg, strend); 1271 } 1272 1273 s = HOP3c(rx_origin, other->min_offset, strend); 1274 if (s < other_last) /* These positions already checked */ 1275 s = other_last; 1276 1277 must = utf8_target ? other->utf8_substr : other->substr; 1278 assert(SvPOK(must)); 1279 { 1280 char *from = s; 1281 char *to = last + SvCUR(must) - (SvTAIL(must)!=0); 1282 1283 if (to > strend) 1284 to = strend; 1285 if (from > to) { 1286 s = NULL; 1287 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1288 " skipping 'other' fbm scan: %" IVdf " > %" IVdf "\n", 1289 (IV)(from - strbeg), 1290 (IV)(to - strbeg) 1291 )); 1292 } 1293 else { 1294 s = fbm_instr( 1295 (unsigned char*)from, 1296 (unsigned char*)to, 1297 must, 1298 multiline ? FBMrf_MULTILINE : 0 1299 ); 1300 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1301 " doing 'other' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n", 1302 (IV)(from - strbeg), 1303 (IV)(to - strbeg), 1304 (IV)(s ? s - strbeg : -1) 1305 )); 1306 } 1307 } 1308 1309 DEBUG_EXECUTE_r({ 1310 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), 1311 SvPVX_const(must), RE_SV_DUMPLEN(must), 30); 1312 Perl_re_printf( aTHX_ " %s %s substr %s%s", 1313 s ? "Found" : "Contradicts", 1314 other_ix ? "floating" : "anchored", 1315 quoted, RE_SV_TAIL(must)); 1316 }); 1317 1318 1319 if (!s) { 1320 /* last1 is latest possible substr location. If we didn't 1321 * find it before there, we never will */ 1322 if (last >= last1) { 1323 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1324 "; giving up...\n")); 1325 goto fail_finish; 1326 } 1327 1328 /* try to find the check substr again at a later 1329 * position. Maybe next time we'll find the "other" substr 1330 * in range too */ 1331 other_last = HOP3c(last, 1, strend) /* highest failure */; 1332 rx_origin = 1333 other_ix /* i.e. if other-is-float */ 1334 ? HOP3c(rx_origin, 1, strend) 1335 : HOP4c(last, 1 - other->min_offset, strbeg, strend); 1336 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1337 "; about to retry %s at offset %ld (rx_origin now %" IVdf ")...\n", 1338 (other_ix ? "floating" : "anchored"), 1339 (long)(HOP3c(check_at, 1, strend) - strbeg), 1340 (IV)(rx_origin - strbeg) 1341 )); 1342 goto restart; 1343 } 1344 else { 1345 if (other_ix) { /* if (other-is-float) */ 1346 /* other_last is set to s, not s+1, since its possible for 1347 * a floating substr to fail first time, then succeed 1348 * second time at the same floating position; e.g.: 1349 * "-AB--AABZ" =~ /\wAB\d*Z/ 1350 * The first time round, anchored and float match at 1351 * "-(AB)--AAB(Z)" then fail on the initial \w character 1352 * class. Second time round, they match at "-AB--A(AB)(Z)". 1353 */ 1354 other_last = s; 1355 } 1356 else { 1357 rx_origin = HOP3c(s, -other->min_offset, strbeg); 1358 other_last = HOP3c(s, 1, strend); 1359 } 1360 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1361 " at offset %ld (rx_origin now %" IVdf ")...\n", 1362 (long)(s - strbeg), 1363 (IV)(rx_origin - strbeg) 1364 )); 1365 1366 } 1367 } 1368 else { 1369 DEBUG_OPTIMISE_MORE_r( 1370 Perl_re_printf( aTHX_ 1371 " Check-only match: offset min:%" IVdf " max:%" IVdf 1372 " check_at:%" IVdf " rx_origin:%" IVdf " rx_origin-check_at:%" IVdf 1373 " strend:%" IVdf "\n", 1374 (IV)prog->check_offset_min, 1375 (IV)prog->check_offset_max, 1376 (IV)(check_at-strbeg), 1377 (IV)(rx_origin-strbeg), 1378 (IV)(rx_origin-check_at), 1379 (IV)(strend-strbeg) 1380 ) 1381 ); 1382 } 1383 1384 postprocess_substr_matches: 1385 1386 /* handle the extra constraint of /^.../m if present */ 1387 1388 if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') { 1389 char *s; 1390 1391 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1392 " looking for /^/m anchor")); 1393 1394 /* we have failed the constraint of a \n before rx_origin. 1395 * Find the next \n, if any, even if it's beyond the current 1396 * anchored and/or floating substrings. Whether we should be 1397 * scanning ahead for the next \n or the next substr is debatable. 1398 * On the one hand you'd expect rare substrings to appear less 1399 * often than \n's. On the other hand, searching for \n means 1400 * we're effectively flipping between check_substr and "\n" on each 1401 * iteration as the current "rarest" string candidate, which 1402 * means for example that we'll quickly reject the whole string if 1403 * hasn't got a \n, rather than trying every substr position 1404 * first 1405 */ 1406 1407 s = HOP3c(strend, - prog->minlen, strpos); 1408 if (s <= rx_origin || 1409 ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin))) 1410 { 1411 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1412 " Did not find /%s^%s/m...\n", 1413 PL_colors[0], PL_colors[1])); 1414 goto fail_finish; 1415 } 1416 1417 /* earliest possible origin is 1 char after the \n. 1418 * (since *rx_origin == '\n', it's safe to ++ here rather than 1419 * HOP(rx_origin, 1)) */ 1420 rx_origin++; 1421 1422 if (prog->substrs->check_ix == 0 /* check is anchored */ 1423 || rx_origin >= HOP3c(check_at, - prog->check_offset_min, strpos)) 1424 { 1425 /* Position contradicts check-string; either because 1426 * check was anchored (and thus has no wiggle room), 1427 * or check was float and rx_origin is above the float range */ 1428 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1429 " Found /%s^%s/m, about to restart lookup for check-string with rx_origin %ld...\n", 1430 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg))); 1431 goto restart; 1432 } 1433 1434 /* if we get here, the check substr must have been float, 1435 * is in range, and we may or may not have had an anchored 1436 * "other" substr which still contradicts */ 1437 assert(prog->substrs->check_ix); /* check is float */ 1438 1439 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) { 1440 /* whoops, the anchored "other" substr exists, so we still 1441 * contradict. On the other hand, the float "check" substr 1442 * didn't contradict, so just retry the anchored "other" 1443 * substr */ 1444 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1445 " Found /%s^%s/m, rescanning for anchored from offset %" IVdf " (rx_origin now %" IVdf ")...\n", 1446 PL_colors[0], PL_colors[1], 1447 (IV)(rx_origin - strbeg + prog->anchored_offset), 1448 (IV)(rx_origin - strbeg) 1449 )); 1450 goto do_other_substr; 1451 } 1452 1453 /* success: we don't contradict the found floating substring 1454 * (and there's no anchored substr). */ 1455 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1456 " Found /%s^%s/m with rx_origin %ld...\n", 1457 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg))); 1458 } 1459 else { 1460 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1461 " (multiline anchor test skipped)\n")); 1462 } 1463 1464 success_at_start: 1465 1466 1467 /* if we have a starting character class, then test that extra constraint. 1468 * (trie stclasses are too expensive to use here, we are better off to 1469 * leave it to regmatch itself) */ 1470 1471 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) { 1472 const U8* const str = (U8*)STRING(progi->regstclass); 1473 1474 /* XXX this value could be pre-computed */ 1475 const SSize_t cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT 1476 ? (reginfo->is_utf8_pat 1477 ? (SSize_t)utf8_distance(str + STR_LEN(progi->regstclass), str) 1478 : (SSize_t)STR_LEN(progi->regstclass)) 1479 : 1); 1480 char * endpos; 1481 char *s; 1482 /* latest pos that a matching float substr constrains rx start to */ 1483 char *rx_max_float = NULL; 1484 1485 /* if the current rx_origin is anchored, either by satisfying an 1486 * anchored substring constraint, or a /^.../m constraint, then we 1487 * can reject the current origin if the start class isn't found 1488 * at the current position. If we have a float-only match, then 1489 * rx_origin is constrained to a range; so look for the start class 1490 * in that range. if neither, then look for the start class in the 1491 * whole rest of the string */ 1492 1493 /* XXX DAPM it's not clear what the minlen test is for, and why 1494 * it's not used in the floating case. Nothing in the test suite 1495 * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>. 1496 * Here are some old comments, which may or may not be correct: 1497 * 1498 * minlen == 0 is possible if regstclass is \b or \B, 1499 * and the fixed substr is ''$. 1500 * Since minlen is already taken into account, rx_origin+1 is 1501 * before strend; accidentally, minlen >= 1 guaranties no false 1502 * positives at rx_origin + 1 even for \b or \B. But (minlen? 1 : 1503 * 0) below assumes that regstclass does not come from lookahead... 1504 * If regstclass takes bytelength more than 1: If charlength==1, OK. 1505 * This leaves EXACTF-ish only, which are dealt with in 1506 * find_byclass(). 1507 */ 1508 1509 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch) 1510 endpos = HOP3clim(rx_origin, (prog->minlen ? cl_l : 0), strend); 1511 else if (prog->float_substr || prog->float_utf8) { 1512 rx_max_float = HOP3c(check_at, -start_shift, strbeg); 1513 endpos = HOP3clim(rx_max_float, cl_l, strend); 1514 } 1515 else 1516 endpos= strend; 1517 1518 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1519 " looking for class: start_shift: %" IVdf " check_at: %" IVdf 1520 " rx_origin: %" IVdf " endpos: %" IVdf "\n", 1521 (IV)start_shift, (IV)(check_at - strbeg), 1522 (IV)(rx_origin - strbeg), (IV)(endpos - strbeg))); 1523 1524 s = find_byclass(prog, progi->regstclass, rx_origin, endpos, 1525 reginfo); 1526 if (!s) { 1527 if (endpos == strend) { 1528 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_ 1529 " Could not match STCLASS...\n") ); 1530 goto fail; 1531 } 1532 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_ 1533 " This position contradicts STCLASS...\n") ); 1534 if ((prog->intflags & PREGf_ANCH) && !ml_anch 1535 && !(prog->intflags & PREGf_IMPLICIT)) 1536 goto fail; 1537 1538 /* Contradict one of substrings */ 1539 if (prog->anchored_substr || prog->anchored_utf8) { 1540 if (prog->substrs->check_ix == 1) { /* check is float */ 1541 /* Have both, check_string is floating */ 1542 assert(rx_origin + start_shift <= check_at); 1543 if (rx_origin + start_shift != check_at) { 1544 /* not at latest position float substr could match: 1545 * Recheck anchored substring, but not floating. 1546 * The condition above is in bytes rather than 1547 * chars for efficiency. It's conservative, in 1548 * that it errs on the side of doing 'goto 1549 * do_other_substr'. In this case, at worst, 1550 * an extra anchored search may get done, but in 1551 * practice the extra fbm_instr() is likely to 1552 * get skipped anyway. */ 1553 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_ 1554 " about to retry anchored at offset %ld (rx_origin now %" IVdf ")...\n", 1555 (long)(other_last - strbeg), 1556 (IV)(rx_origin - strbeg) 1557 )); 1558 goto do_other_substr; 1559 } 1560 } 1561 } 1562 else { 1563 /* float-only */ 1564 1565 if (ml_anch) { 1566 /* In the presence of ml_anch, we might be able to 1567 * find another \n without breaking the current float 1568 * constraint. */ 1569 1570 /* strictly speaking this should be HOP3c(..., 1, ...), 1571 * but since we goto a block of code that's going to 1572 * search for the next \n if any, its safe here */ 1573 rx_origin++; 1574 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_ 1575 " about to look for /%s^%s/m starting at rx_origin %ld...\n", 1576 PL_colors[0], PL_colors[1], 1577 (long)(rx_origin - strbeg)) ); 1578 goto postprocess_substr_matches; 1579 } 1580 1581 /* strictly speaking this can never be true; but might 1582 * be if we ever allow intuit without substrings */ 1583 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) 1584 goto fail; 1585 1586 rx_origin = rx_max_float; 1587 } 1588 1589 /* at this point, any matching substrings have been 1590 * contradicted. Start again... */ 1591 1592 rx_origin = HOP3c(rx_origin, 1, strend); 1593 1594 /* uses bytes rather than char calculations for efficiency. 1595 * It's conservative: it errs on the side of doing 'goto restart', 1596 * where there is code that does a proper char-based test */ 1597 if (rx_origin + start_shift + end_shift > strend) { 1598 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_ 1599 " Could not match STCLASS...\n") ); 1600 goto fail; 1601 } 1602 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_ 1603 " about to look for %s substr starting at offset %ld (rx_origin now %" IVdf ")...\n", 1604 (prog->substrs->check_ix ? "floating" : "anchored"), 1605 (long)(rx_origin + start_shift - strbeg), 1606 (IV)(rx_origin - strbeg) 1607 )); 1608 goto restart; 1609 } 1610 1611 /* Success !!! */ 1612 1613 if (rx_origin != s) { 1614 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1615 " By STCLASS: moving %ld --> %ld\n", 1616 (long)(rx_origin - strbeg), (long)(s - strbeg)) 1617 ); 1618 } 1619 else { 1620 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1621 " Does not contradict STCLASS...\n"); 1622 ); 1623 } 1624 } 1625 1626 /* Decide whether using the substrings helped */ 1627 1628 if (rx_origin != strpos) { 1629 /* Fixed substring is found far enough so that the match 1630 cannot start at strpos. */ 1631 1632 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ " try at offset...\n")); 1633 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */ 1634 } 1635 else { 1636 /* The found rx_origin position does not prohibit matching at 1637 * strpos, so calling intuit didn't gain us anything. Decrement 1638 * the BmUSEFUL() count on the check substring, and if we reach 1639 * zero, free it. */ 1640 if (!(prog->intflags & PREGf_NAUGHTY) 1641 && (utf8_target ? ( 1642 prog->check_utf8 /* Could be deleted already */ 1643 && --BmUSEFUL(prog->check_utf8) < 0 1644 && (prog->check_utf8 == prog->float_utf8) 1645 ) : ( 1646 prog->check_substr /* Could be deleted already */ 1647 && --BmUSEFUL(prog->check_substr) < 0 1648 && (prog->check_substr == prog->float_substr) 1649 ))) 1650 { 1651 /* If flags & SOMETHING - do not do it many times on the same match */ 1652 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ " ... Disabling check substring...\n")); 1653 /* XXX Does the destruction order has to change with utf8_target? */ 1654 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr); 1655 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8); 1656 prog->check_substr = prog->check_utf8 = NULL; /* disable */ 1657 prog->float_substr = prog->float_utf8 = NULL; /* clear */ 1658 check = NULL; /* abort */ 1659 /* XXXX This is a remnant of the old implementation. It 1660 looks wasteful, since now INTUIT can use many 1661 other heuristics. */ 1662 prog->extflags &= ~RXf_USE_INTUIT; 1663 } 1664 } 1665 1666 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1667 "Intuit: %sSuccessfully guessed:%s match at offset %ld\n", 1668 PL_colors[4], PL_colors[5], (long)(rx_origin - strbeg)) ); 1669 1670 return rx_origin; 1671 1672 fail_finish: /* Substring not found */ 1673 if (prog->check_substr || prog->check_utf8) /* could be removed already */ 1674 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */ 1675 fail: 1676 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%sMatch rejected by optimizer%s\n", 1677 PL_colors[4], PL_colors[5])); 1678 return NULL; 1679 } 1680 1681 1682 #define DECL_TRIE_TYPE(scan) \ 1683 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \ 1684 trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold, \ 1685 trie_utf8l, trie_flu8, trie_flu8_latin } \ 1686 trie_type = ((scan->flags == EXACT) \ 1687 ? (utf8_target ? trie_utf8 : trie_plain) \ 1688 : (scan->flags == EXACTL) \ 1689 ? (utf8_target ? trie_utf8l : trie_plain) \ 1690 : (scan->flags == EXACTFAA) \ 1691 ? (utf8_target \ 1692 ? trie_utf8_exactfa_fold \ 1693 : trie_latin_utf8_exactfa_fold) \ 1694 : (scan->flags == EXACTFLU8 \ 1695 ? (utf8_target \ 1696 ? trie_flu8 \ 1697 : trie_flu8_latin) \ 1698 : (utf8_target \ 1699 ? trie_utf8_fold \ 1700 : trie_latin_utf8_fold))) 1701 1702 /* 'uscan' is set to foldbuf, and incremented, so below the end of uscan is 1703 * 'foldbuf+sizeof(foldbuf)' */ 1704 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uc_end, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \ 1705 STMT_START { \ 1706 STRLEN skiplen; \ 1707 U8 flags = FOLD_FLAGS_FULL; \ 1708 switch (trie_type) { \ 1709 case trie_flu8: \ 1710 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \ 1711 if (UTF8_IS_ABOVE_LATIN1(*uc)) { \ 1712 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end); \ 1713 } \ 1714 goto do_trie_utf8_fold; \ 1715 case trie_utf8_exactfa_fold: \ 1716 flags |= FOLD_FLAGS_NOMIX_ASCII; \ 1717 /* FALLTHROUGH */ \ 1718 case trie_utf8_fold: \ 1719 do_trie_utf8_fold: \ 1720 if ( foldlen>0 ) { \ 1721 uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags ); \ 1722 foldlen -= len; \ 1723 uscan += len; \ 1724 len=0; \ 1725 } else { \ 1726 uvc = _toFOLD_utf8_flags( (const U8*) uc, uc_end, foldbuf, &foldlen, \ 1727 flags); \ 1728 len = UTF8_SAFE_SKIP(uc, uc_end); \ 1729 skiplen = UVCHR_SKIP( uvc ); \ 1730 foldlen -= skiplen; \ 1731 uscan = foldbuf + skiplen; \ 1732 } \ 1733 break; \ 1734 case trie_flu8_latin: \ 1735 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \ 1736 goto do_trie_latin_utf8_fold; \ 1737 case trie_latin_utf8_exactfa_fold: \ 1738 flags |= FOLD_FLAGS_NOMIX_ASCII; \ 1739 /* FALLTHROUGH */ \ 1740 case trie_latin_utf8_fold: \ 1741 do_trie_latin_utf8_fold: \ 1742 if ( foldlen>0 ) { \ 1743 uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags ); \ 1744 foldlen -= len; \ 1745 uscan += len; \ 1746 len=0; \ 1747 } else { \ 1748 len = 1; \ 1749 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \ 1750 skiplen = UVCHR_SKIP( uvc ); \ 1751 foldlen -= skiplen; \ 1752 uscan = foldbuf + skiplen; \ 1753 } \ 1754 break; \ 1755 case trie_utf8l: \ 1756 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \ 1757 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) { \ 1758 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end); \ 1759 } \ 1760 /* FALLTHROUGH */ \ 1761 case trie_utf8: \ 1762 uvc = utf8n_to_uvchr( (const U8*) uc, uc_end - uc, &len, uniflags ); \ 1763 break; \ 1764 case trie_plain: \ 1765 uvc = (UV)*uc; \ 1766 len = 1; \ 1767 } \ 1768 if (uvc < 256) { \ 1769 charid = trie->charmap[ uvc ]; \ 1770 } \ 1771 else { \ 1772 charid = 0; \ 1773 if (widecharmap) { \ 1774 SV** const svpp = hv_fetch(widecharmap, \ 1775 (char*)&uvc, sizeof(UV), 0); \ 1776 if (svpp) \ 1777 charid = (U16)SvIV(*svpp); \ 1778 } \ 1779 } \ 1780 } STMT_END 1781 1782 #define DUMP_EXEC_POS(li,s,doutf8,depth) \ 1783 dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \ 1784 startpos, doutf8, depth) 1785 1786 #define REXEC_FBC_SCAN(UTF8, CODE) \ 1787 STMT_START { \ 1788 while (s < strend) { \ 1789 CODE \ 1790 s += ((UTF8) \ 1791 ? UTF8_SAFE_SKIP(s, reginfo->strend) \ 1792 : 1); \ 1793 } \ 1794 } STMT_END 1795 1796 #define REXEC_FBC_CLASS_SCAN(UTF8, COND) \ 1797 STMT_START { \ 1798 while (s < strend) { \ 1799 REXEC_FBC_CLASS_SCAN_GUTS(UTF8, COND) \ 1800 } \ 1801 } STMT_END 1802 1803 #define REXEC_FBC_CLASS_SCAN_GUTS(UTF8, COND) \ 1804 if (COND) { \ 1805 FBC_CHECK_AND_TRY \ 1806 s += ((UTF8) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1);\ 1807 previous_occurrence_end = s; \ 1808 } \ 1809 else { \ 1810 s += ((UTF8) ? UTF8SKIP(s) : 1); \ 1811 } 1812 1813 #define REXEC_FBC_CSCAN(CONDUTF8,COND) \ 1814 if (utf8_target) { \ 1815 REXEC_FBC_CLASS_SCAN(1, CONDUTF8); \ 1816 } \ 1817 else { \ 1818 REXEC_FBC_CLASS_SCAN(0, COND); \ 1819 } 1820 1821 /* We keep track of where the next character should start after an occurrence 1822 * of the one we're looking for. Knowing that, we can see right away if the 1823 * next occurrence is adjacent to the previous. When 'doevery' is FALSE, we 1824 * don't accept the 2nd and succeeding adjacent occurrences */ 1825 #define FBC_CHECK_AND_TRY \ 1826 if ( ( doevery \ 1827 || s != previous_occurrence_end) \ 1828 && ( reginfo->intuit \ 1829 || (s <= reginfo->strend && regtry(reginfo, &s)))) \ 1830 { \ 1831 goto got_it; \ 1832 } 1833 1834 1835 /* This differs from the above macros in that it calls a function which returns 1836 * the next occurrence of the thing being looked for in 's'; and 'strend' if 1837 * there is no such occurrence. */ 1838 #define REXEC_FBC_FIND_NEXT_SCAN(UTF8, f) \ 1839 while (s < strend) { \ 1840 s = (f); \ 1841 if (s >= strend) { \ 1842 break; \ 1843 } \ 1844 \ 1845 FBC_CHECK_AND_TRY \ 1846 s += (UTF8) ? UTF8SKIP(s) : 1; \ 1847 previous_occurrence_end = s; \ 1848 } 1849 1850 /* This differs from the above macros in that it is passed a single byte that 1851 * is known to begin the next occurrence of the thing being looked for in 's'. 1852 * It does a memchr to find the next occurrence of 'byte', before trying 'COND' 1853 * at that position. */ 1854 #define REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(byte, COND) \ 1855 while (s < strend) { \ 1856 s = (char *) memchr(s, byte, strend -s); \ 1857 if (s == NULL) { \ 1858 s = (char *) strend; \ 1859 break; \ 1860 } \ 1861 \ 1862 if (COND) { \ 1863 FBC_CHECK_AND_TRY \ 1864 s += UTF8_SAFE_SKIP(s, reginfo->strend); \ 1865 previous_occurrence_end = s; \ 1866 } \ 1867 else { \ 1868 s += UTF8SKIP(s); \ 1869 } \ 1870 } 1871 1872 /* The three macros below are slightly different versions of the same logic. 1873 * 1874 * The first is for /a and /aa when the target string is UTF-8. This can only 1875 * match ascii, but it must advance based on UTF-8. The other two handle the 1876 * non-UTF-8 and the more generic UTF-8 cases. In all three, we are looking 1877 * for the boundary (or non-boundary) between a word and non-word character. 1878 * The utf8 and non-utf8 cases have the same logic, but the details must be 1879 * different. Find the "wordness" of the character just prior to this one, and 1880 * compare it with the wordness of this one. If they differ, we have a 1881 * boundary. At the beginning of the string, pretend that the previous 1882 * character was a new-line. 1883 * 1884 * All these macros uncleanly have side-effects with each other and outside 1885 * variables. So far it's been too much trouble to clean-up 1886 * 1887 * TEST_NON_UTF8 is the macro or function to call to test if its byte input is 1888 * a word character or not. 1889 * IF_SUCCESS is code to do if it finds that we are at a boundary between 1890 * word/non-word 1891 * IF_FAIL is code to do if we aren't at a boundary between word/non-word 1892 * 1893 * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we 1894 * are looking for a boundary or for a non-boundary. If we are looking for a 1895 * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and 1896 * see if this tentative match actually works, and if so, to quit the loop 1897 * here. And vice-versa if we are looking for a non-boundary. 1898 * 1899 * 'tmp' below in the next three macros in the REXEC_FBC_SCAN and 1900 * REXEC_FBC_SCAN loops is a loop invariant, a bool giving the return of 1901 * TEST_NON_UTF8(s-1). To see this, note that that's what it is defined to be 1902 * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal 1903 * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that 1904 * complement. But in that branch we complement tmp, meaning that at the 1905 * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s), 1906 * which means at the top of the loop in the next iteration, it is 1907 * TEST_NON_UTF8(s-1) */ 1908 #define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \ 1909 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \ 1910 tmp = TEST_NON_UTF8(tmp); \ 1911 REXEC_FBC_SCAN(1, /* 1=>is-utf8; advances s while s < strend */ \ 1912 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \ 1913 tmp = !tmp; \ 1914 IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */ \ 1915 } \ 1916 else { \ 1917 IF_FAIL; \ 1918 } \ 1919 ); \ 1920 1921 /* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and 1922 * TEST_UTF8 is a macro that for the same input code points returns identically 1923 * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead */ 1924 #define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL) \ 1925 if (s == reginfo->strbeg) { \ 1926 tmp = '\n'; \ 1927 } \ 1928 else { /* Back-up to the start of the previous character */ \ 1929 U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \ 1930 tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, \ 1931 0, UTF8_ALLOW_DEFAULT); \ 1932 } \ 1933 tmp = TEST_UV(tmp); \ 1934 REXEC_FBC_SCAN(1, /* 1=>is-utf8; advances s while s < strend */ \ 1935 if (tmp == ! (TEST_UTF8((U8 *) s, (U8 *) reginfo->strend))) { \ 1936 tmp = !tmp; \ 1937 IF_SUCCESS; \ 1938 } \ 1939 else { \ 1940 IF_FAIL; \ 1941 } \ 1942 ); 1943 1944 /* Like the above two macros. UTF8_CODE is the complete code for handling 1945 * UTF-8. Common to the BOUND and NBOUND cases, set-up by the FBC_BOUND, etc 1946 * macros below */ 1947 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \ 1948 if (utf8_target) { \ 1949 UTF8_CODE \ 1950 } \ 1951 else { /* Not utf8 */ \ 1952 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \ 1953 tmp = TEST_NON_UTF8(tmp); \ 1954 REXEC_FBC_SCAN(0, /* 0=>not-utf8; advances s while s < strend */ \ 1955 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \ 1956 IF_SUCCESS; \ 1957 tmp = !tmp; \ 1958 } \ 1959 else { \ 1960 IF_FAIL; \ 1961 } \ 1962 ); \ 1963 } \ 1964 /* Here, things have been set up by the previous code so that tmp is the \ 1965 * return of TEST_NON_UTF(s-1) or TEST_UTF8(s-1) (depending on the \ 1966 * utf8ness of the target). We also have to check if this matches against \ 1967 * the EOS, which we treat as a \n (which is the same value in both UTF-8 \ 1968 * or non-UTF8, so can use the non-utf8 test condition even for a UTF-8 \ 1969 * string */ \ 1970 if (tmp == ! TEST_NON_UTF8('\n')) { \ 1971 IF_SUCCESS; \ 1972 } \ 1973 else { \ 1974 IF_FAIL; \ 1975 } 1976 1977 /* This is the macro to use when we want to see if something that looks like it 1978 * could match, actually does, and if so exits the loop. It needs to be used 1979 * only for bounds checking macros, as it allows for matching beyond the end of 1980 * string (which should be zero length without having to look at the string 1981 * contents) */ 1982 #define REXEC_FBC_TRYIT \ 1983 if (reginfo->intuit || (s <= reginfo->strend && regtry(reginfo, &s))) \ 1984 goto got_it 1985 1986 /* The only difference between the BOUND and NBOUND cases is that 1987 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in 1988 * NBOUND. This is accomplished by passing it as either the if or else clause, 1989 * with the other one being empty (PLACEHOLDER is defined as empty). 1990 * 1991 * The TEST_FOO parameters are for operating on different forms of input, but 1992 * all should be ones that return identically for the same underlying code 1993 * points */ 1994 #define FBC_BOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \ 1995 FBC_BOUND_COMMON( \ 1996 FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \ 1997 TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER) 1998 1999 #define FBC_BOUND_A(TEST_NON_UTF8) \ 2000 FBC_BOUND_COMMON( \ 2001 FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \ 2002 TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER) 2003 2004 #define FBC_NBOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \ 2005 FBC_BOUND_COMMON( \ 2006 FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \ 2007 TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT) 2008 2009 #define FBC_NBOUND_A(TEST_NON_UTF8) \ 2010 FBC_BOUND_COMMON( \ 2011 FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \ 2012 TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT) 2013 2014 #ifdef DEBUGGING 2015 static IV 2016 S_get_break_val_cp_checked(SV* const invlist, const UV cp_in) { 2017 IV cp_out = _invlist_search(invlist, cp_in); 2018 assert(cp_out >= 0); 2019 return cp_out; 2020 } 2021 # define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \ 2022 invmap[S_get_break_val_cp_checked(invlist, cp)] 2023 #else 2024 # define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \ 2025 invmap[_invlist_search(invlist, cp)] 2026 #endif 2027 2028 /* Takes a pointer to an inversion list, a pointer to its corresponding 2029 * inversion map, and a code point, and returns the code point's value 2030 * according to the two arrays. It assumes that all code points have a value. 2031 * This is used as the base macro for macros for particular properties */ 2032 #define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp) \ 2033 _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) 2034 2035 /* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead 2036 * of a code point, returning the value for the first code point in the string. 2037 * And it takes the particular macro name that finds the desired value given a 2038 * code point. Merely convert the UTF-8 to code point and call the cp macro */ 2039 #define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend) \ 2040 (__ASSERT_(pos < strend) \ 2041 /* Note assumes is valid UTF-8 */ \ 2042 (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL)))) 2043 2044 /* Returns the GCB value for the input code point */ 2045 #define getGCB_VAL_CP(cp) \ 2046 _generic_GET_BREAK_VAL_CP( \ 2047 PL_GCB_invlist, \ 2048 _Perl_GCB_invmap, \ 2049 (cp)) 2050 2051 /* Returns the GCB value for the first code point in the UTF-8 encoded string 2052 * bounded by pos and strend */ 2053 #define getGCB_VAL_UTF8(pos, strend) \ 2054 _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend) 2055 2056 /* Returns the LB value for the input code point */ 2057 #define getLB_VAL_CP(cp) \ 2058 _generic_GET_BREAK_VAL_CP( \ 2059 PL_LB_invlist, \ 2060 _Perl_LB_invmap, \ 2061 (cp)) 2062 2063 /* Returns the LB value for the first code point in the UTF-8 encoded string 2064 * bounded by pos and strend */ 2065 #define getLB_VAL_UTF8(pos, strend) \ 2066 _generic_GET_BREAK_VAL_UTF8(getLB_VAL_CP, pos, strend) 2067 2068 2069 /* Returns the SB value for the input code point */ 2070 #define getSB_VAL_CP(cp) \ 2071 _generic_GET_BREAK_VAL_CP( \ 2072 PL_SB_invlist, \ 2073 _Perl_SB_invmap, \ 2074 (cp)) 2075 2076 /* Returns the SB value for the first code point in the UTF-8 encoded string 2077 * bounded by pos and strend */ 2078 #define getSB_VAL_UTF8(pos, strend) \ 2079 _generic_GET_BREAK_VAL_UTF8(getSB_VAL_CP, pos, strend) 2080 2081 /* Returns the WB value for the input code point */ 2082 #define getWB_VAL_CP(cp) \ 2083 _generic_GET_BREAK_VAL_CP( \ 2084 PL_WB_invlist, \ 2085 _Perl_WB_invmap, \ 2086 (cp)) 2087 2088 /* Returns the WB value for the first code point in the UTF-8 encoded string 2089 * bounded by pos and strend */ 2090 #define getWB_VAL_UTF8(pos, strend) \ 2091 _generic_GET_BREAK_VAL_UTF8(getWB_VAL_CP, pos, strend) 2092 2093 /* We know what class REx starts with. Try to find this position... */ 2094 /* if reginfo->intuit, its a dryrun */ 2095 /* annoyingly all the vars in this routine have different names from their counterparts 2096 in regmatch. /grrr */ 2097 STATIC char * 2098 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, 2099 const char *strend, regmatch_info *reginfo) 2100 { 2101 dVAR; 2102 2103 /* TRUE if x+ need not match at just the 1st pos of run of x's */ 2104 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0; 2105 2106 char *pat_string; /* The pattern's exactish string */ 2107 char *pat_end; /* ptr to end char of pat_string */ 2108 re_fold_t folder; /* Function for computing non-utf8 folds */ 2109 const U8 *fold_array; /* array for folding ords < 256 */ 2110 STRLEN ln; 2111 STRLEN lnc; 2112 U8 c1; 2113 U8 c2; 2114 char *e = NULL; 2115 2116 /* In some cases we accept only the first occurence of 'x' in a sequence of 2117 * them. This variable points to just beyond the end of the previous 2118 * occurrence of 'x', hence we can tell if we are in a sequence. (Having 2119 * it point to beyond the 'x' allows us to work for UTF-8 without having to 2120 * hop back.) */ 2121 char * previous_occurrence_end = 0; 2122 2123 I32 tmp; /* Scratch variable */ 2124 const bool utf8_target = reginfo->is_utf8_target; 2125 UV utf8_fold_flags = 0; 2126 const bool is_utf8_pat = reginfo->is_utf8_pat; 2127 bool to_complement = FALSE; /* Invert the result? Taking the xor of this 2128 with a result inverts that result, as 0^1 = 2129 1 and 1^1 = 0 */ 2130 _char_class_number classnum; 2131 2132 RXi_GET_DECL(prog,progi); 2133 2134 PERL_ARGS_ASSERT_FIND_BYCLASS; 2135 2136 /* We know what class it must start with. */ 2137 switch (OP(c)) { 2138 case ANYOFPOSIXL: 2139 case ANYOFL: 2140 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; 2141 2142 if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(c)) && ! IN_UTF8_CTYPE_LOCALE) { 2143 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required); 2144 } 2145 2146 /* FALLTHROUGH */ 2147 case ANYOFD: 2148 case ANYOF: 2149 if (utf8_target) { 2150 REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */ 2151 reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target)); 2152 } 2153 else if (ANYOF_FLAGS(c) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP) { 2154 /* We know that s is in the bitmap range since the target isn't 2155 * UTF-8, so what happens for out-of-range values is not relevant, 2156 * so exclude that from the flags */ 2157 REXEC_FBC_CLASS_SCAN(0, reginclass(prog,c, (U8*)s, (U8*)s+1, 0)); 2158 } 2159 else { 2160 REXEC_FBC_CLASS_SCAN(0, ANYOF_BITMAP_TEST(c, *((U8*)s))); 2161 } 2162 break; 2163 2164 case ANYOFM: /* ARG() is the base byte; FLAGS() the mask byte */ 2165 /* UTF-8ness doesn't matter because only matches UTF-8 invariants, so 2166 * use 0 */ 2167 REXEC_FBC_FIND_NEXT_SCAN(0, 2168 (char *) find_next_masked((U8 *) s, (U8 *) strend, 2169 (U8) ARG(c), FLAGS(c))); 2170 break; 2171 2172 case NANYOFM: /* UTF-8ness does matter because can match UTF-8 variants. 2173 */ 2174 REXEC_FBC_FIND_NEXT_SCAN(utf8_target, 2175 (char *) find_span_end_mask((U8 *) s, (U8 *) strend, 2176 (U8) ARG(c), FLAGS(c))); 2177 break; 2178 2179 case ANYOFH: 2180 if (utf8_target) { /* Can't possibly match a non-UTF-8 target */ 2181 REXEC_FBC_CLASS_SCAN(TRUE, 2182 ( (U8) NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c) 2183 && reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target))); 2184 } 2185 break; 2186 2187 case ANYOFHb: 2188 if (utf8_target) { /* Can't possibly match a non-UTF-8 target */ 2189 2190 /* We know what the first byte of any matched string should be */ 2191 U8 first_byte = FLAGS(c); 2192 2193 REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte, 2194 reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target)); 2195 } 2196 break; 2197 2198 case ANYOFHr: 2199 if (utf8_target) { /* Can't possibly match a non-UTF-8 target */ 2200 REXEC_FBC_CLASS_SCAN(TRUE, 2201 ( inRANGE(NATIVE_UTF8_TO_I8(*s), 2202 LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)), 2203 HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c))) 2204 && reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target))); 2205 } 2206 break; 2207 2208 case ANYOFHs: 2209 if (utf8_target) { /* Can't possibly match a non-UTF-8 target */ 2210 REXEC_FBC_CLASS_SCAN(TRUE, 2211 ( strend -s >= FLAGS(c) 2212 && memEQ(s, ((struct regnode_anyofhs *) c)->string, FLAGS(c)) 2213 && reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target))); 2214 } 2215 break; 2216 2217 case ANYOFR: 2218 if (utf8_target) { 2219 REXEC_FBC_CLASS_SCAN(TRUE, 2220 ( NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c) 2221 && withinCOUNT(utf8_to_uvchr_buf((U8 *) s, 2222 (U8 *) strend, 2223 NULL), 2224 ANYOFRbase(c), ANYOFRdelta(c)))); 2225 } 2226 else { 2227 REXEC_FBC_CLASS_SCAN(0, withinCOUNT((U8) *s, 2228 ANYOFRbase(c), ANYOFRdelta(c))); 2229 } 2230 break; 2231 2232 case ANYOFRb: 2233 if (utf8_target) { 2234 2235 /* We know what the first byte of any matched string should be */ 2236 U8 first_byte = FLAGS(c); 2237 2238 REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte, 2239 withinCOUNT(utf8_to_uvchr_buf((U8 *) s, 2240 (U8 *) strend, 2241 NULL), 2242 ANYOFRbase(c), ANYOFRdelta(c))); 2243 } 2244 else { 2245 REXEC_FBC_CLASS_SCAN(0, withinCOUNT((U8) *s, 2246 ANYOFRbase(c), ANYOFRdelta(c))); 2247 } 2248 break; 2249 2250 case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8 patterns */ 2251 assert(! is_utf8_pat); 2252 /* FALLTHROUGH */ 2253 case EXACTFAA: 2254 if (is_utf8_pat) { 2255 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII 2256 |FOLDEQ_S2_ALREADY_FOLDED|FOLDEQ_S2_FOLDS_SANE; 2257 goto do_exactf_utf8; 2258 } 2259 else if (utf8_target) { 2260 2261 /* Here, and elsewhere in this file, the reason we can't consider a 2262 * non-UTF-8 pattern already folded in the presence of a UTF-8 2263 * target is because any MICRO SIGN in the pattern won't be folded. 2264 * Since the fold of the MICRO SIGN requires UTF-8 to represent, we 2265 * can consider a non-UTF-8 pattern folded when matching a 2266 * non-UTF-8 target */ 2267 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII; 2268 goto do_exactf_utf8; 2269 } 2270 2271 /* Latin1 folds are not affected by /a, except it excludes the sharp s, 2272 * which these functions don't handle anyway */ 2273 fold_array = PL_fold_latin1; 2274 folder = foldEQ_latin1_s2_folded; 2275 goto do_exactf_non_utf8; 2276 2277 case EXACTF: /* This node only generated for non-utf8 patterns */ 2278 assert(! is_utf8_pat); 2279 if (utf8_target) { 2280 goto do_exactf_utf8; 2281 } 2282 fold_array = PL_fold; 2283 folder = foldEQ; 2284 goto do_exactf_non_utf8; 2285 2286 case EXACTFL: 2287 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; 2288 if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) { 2289 utf8_fold_flags = FOLDEQ_LOCALE; 2290 goto do_exactf_utf8; 2291 } 2292 fold_array = PL_fold_locale; 2293 folder = foldEQ_locale; 2294 goto do_exactf_non_utf8; 2295 2296 case EXACTFUP: /* Problematic even though pattern isn't UTF-8. Use 2297 full functionality normally not done except for 2298 UTF-8 */ 2299 assert(! is_utf8_pat); 2300 goto do_exactf_utf8; 2301 2302 case EXACTFLU8: 2303 if (! utf8_target) { /* All code points in this node require 2304 UTF-8 to express. */ 2305 break; 2306 } 2307 utf8_fold_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED 2308 | FOLDEQ_S2_FOLDS_SANE; 2309 goto do_exactf_utf8; 2310 2311 case EXACTFU_REQ8: 2312 if (! utf8_target) { 2313 break; 2314 } 2315 assert(is_utf8_pat); 2316 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED; 2317 goto do_exactf_utf8; 2318 2319 case EXACTFU: 2320 if (is_utf8_pat || utf8_target) { 2321 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED; 2322 goto do_exactf_utf8; 2323 } 2324 2325 /* Any 'ss' in the pattern should have been replaced by regcomp, 2326 * so we don't have to worry here about this single special case 2327 * in the Latin1 range */ 2328 fold_array = PL_fold_latin1; 2329 folder = foldEQ_latin1_s2_folded; 2330 2331 /* FALLTHROUGH */ 2332 2333 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there 2334 are no glitches with fold-length differences 2335 between the target string and pattern */ 2336 2337 /* The idea in the non-utf8 EXACTF* cases is to first find the 2338 * first character of the EXACTF* node and then, if necessary, 2339 * case-insensitively compare the full text of the node. c1 is the 2340 * first character. c2 is its fold. This logic will not work for 2341 * Unicode semantics and the german sharp ss, which hence should 2342 * not be compiled into a node that gets here. */ 2343 pat_string = STRINGs(c); 2344 ln = STR_LENs(c); /* length to match in octets/bytes */ 2345 2346 /* We know that we have to match at least 'ln' bytes (which is the 2347 * same as characters, since not utf8). If we have to match 3 2348 * characters, and there are only 2 availabe, we know without 2349 * trying that it will fail; so don't start a match past the 2350 * required minimum number from the far end */ 2351 e = HOP3c(strend, -((SSize_t)ln), s); 2352 if (e < s) 2353 break; 2354 2355 c1 = *pat_string; 2356 c2 = fold_array[c1]; 2357 if (c1 == c2) { /* If char and fold are the same */ 2358 while (s <= e) { 2359 s = (char *) memchr(s, c1, e + 1 - s); 2360 if (s == NULL) { 2361 break; 2362 } 2363 2364 /* Check that the rest of the node matches */ 2365 if ( (ln == 1 || folder(s + 1, pat_string + 1, ln - 1)) 2366 && (reginfo->intuit || regtry(reginfo, &s)) ) 2367 { 2368 goto got_it; 2369 } 2370 s++; 2371 } 2372 } 2373 else { 2374 U8 bits_differing = c1 ^ c2; 2375 2376 /* If the folds differ in one bit position only, we can mask to 2377 * match either of them, and can use this faster find method. Both 2378 * ASCII and EBCDIC tend to have their case folds differ in only 2379 * one position, so this is very likely */ 2380 if (LIKELY(PL_bitcount[bits_differing] == 1)) { 2381 bits_differing = ~ bits_differing; 2382 while (s <= e) { 2383 s = (char *) find_next_masked((U8 *) s, (U8 *) e + 1, 2384 (c1 & bits_differing), bits_differing); 2385 if (s > e) { 2386 break; 2387 } 2388 2389 if ( (ln == 1 || folder(s + 1, pat_string + 1, ln - 1)) 2390 && (reginfo->intuit || regtry(reginfo, &s)) ) 2391 { 2392 goto got_it; 2393 } 2394 s++; 2395 } 2396 } 2397 else { /* Otherwise, stuck with looking byte-at-a-time. This 2398 should actually happen only in EXACTFL nodes */ 2399 while (s <= e) { 2400 if ( (*(U8*)s == c1 || *(U8*)s == c2) 2401 && (ln == 1 || folder(s + 1, pat_string + 1, ln - 1)) 2402 && (reginfo->intuit || regtry(reginfo, &s)) ) 2403 { 2404 goto got_it; 2405 } 2406 s++; 2407 } 2408 } 2409 } 2410 break; 2411 2412 do_exactf_utf8: 2413 { 2414 unsigned expansion; 2415 2416 /* If one of the operands is in utf8, we can't use the simpler folding 2417 * above, due to the fact that many different characters can have the 2418 * same fold, or portion of a fold, or different- length fold */ 2419 pat_string = STRINGs(c); 2420 ln = STR_LENs(c); /* length to match in octets/bytes */ 2421 pat_end = pat_string + ln; 2422 lnc = is_utf8_pat /* length to match in characters */ 2423 ? utf8_length((U8 *) pat_string, (U8 *) pat_end) 2424 : ln; 2425 2426 /* We have 'lnc' characters to match in the pattern, but because of 2427 * multi-character folding, each character in the target can match 2428 * up to 3 characters (Unicode guarantees it will never exceed 2429 * this) if it is utf8-encoded; and up to 2 if not (based on the 2430 * fact that the Latin 1 folds are already determined, and the 2431 * only multi-char fold in that range is the sharp-s folding to 2432 * 'ss'. Thus, a pattern character can match as little as 1/3 of a 2433 * string character. Adjust lnc accordingly, rounding up, so that 2434 * if we need to match at least 4+1/3 chars, that really is 5. */ 2435 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2; 2436 lnc = (lnc + expansion - 1) / expansion; 2437 2438 /* As in the non-UTF8 case, if we have to match 3 characters, and 2439 * only 2 are left, it's guaranteed to fail, so don't start a 2440 * match that would require us to go beyond the end of the string 2441 */ 2442 e = HOP3c(strend, -((SSize_t)lnc), s); 2443 2444 /* XXX Note that we could recalculate e to stop the loop earlier, 2445 * as the worst case expansion above will rarely be met, and as we 2446 * go along we would usually find that e moves further to the left. 2447 * This would happen only after we reached the point in the loop 2448 * where if there were no expansion we should fail. Unclear if 2449 * worth the expense */ 2450 2451 while (s <= e) { 2452 char *my_strend= (char *)strend; 2453 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target, 2454 pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags) 2455 && (reginfo->intuit || regtry(reginfo, &s)) ) 2456 { 2457 goto got_it; 2458 } 2459 s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1; 2460 } 2461 break; 2462 } 2463 2464 case BOUNDL: 2465 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; 2466 if (FLAGS(c) != TRADITIONAL_BOUND) { 2467 if (! IN_UTF8_CTYPE_LOCALE) { 2468 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), 2469 B_ON_NON_UTF8_LOCALE_IS_WRONG); 2470 } 2471 goto do_boundu; 2472 } 2473 2474 FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8_safe); 2475 break; 2476 2477 case NBOUNDL: 2478 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; 2479 if (FLAGS(c) != TRADITIONAL_BOUND) { 2480 if (! IN_UTF8_CTYPE_LOCALE) { 2481 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), 2482 B_ON_NON_UTF8_LOCALE_IS_WRONG); 2483 } 2484 goto do_nboundu; 2485 } 2486 2487 FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8_safe); 2488 break; 2489 2490 case BOUND: /* regcomp.c makes sure that this only has the traditional \b 2491 meaning */ 2492 assert(FLAGS(c) == TRADITIONAL_BOUND); 2493 2494 FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8_safe); 2495 break; 2496 2497 case BOUNDA: /* regcomp.c makes sure that this only has the traditional \b 2498 meaning */ 2499 assert(FLAGS(c) == TRADITIONAL_BOUND); 2500 2501 FBC_BOUND_A(isWORDCHAR_A); 2502 break; 2503 2504 case NBOUND: /* regcomp.c makes sure that this only has the traditional \b 2505 meaning */ 2506 assert(FLAGS(c) == TRADITIONAL_BOUND); 2507 2508 FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8_safe); 2509 break; 2510 2511 case NBOUNDA: /* regcomp.c makes sure that this only has the traditional \b 2512 meaning */ 2513 assert(FLAGS(c) == TRADITIONAL_BOUND); 2514 2515 FBC_NBOUND_A(isWORDCHAR_A); 2516 break; 2517 2518 case NBOUNDU: 2519 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) { 2520 FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe); 2521 break; 2522 } 2523 2524 do_nboundu: 2525 2526 to_complement = 1; 2527 /* FALLTHROUGH */ 2528 2529 case BOUNDU: 2530 do_boundu: 2531 switch((bound_type) FLAGS(c)) { 2532 case TRADITIONAL_BOUND: 2533 FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe); 2534 break; 2535 case GCB_BOUND: 2536 if (s == reginfo->strbeg) { 2537 if (reginfo->intuit || regtry(reginfo, &s)) 2538 { 2539 goto got_it; 2540 } 2541 2542 /* Didn't match. Try at the next position (if there is one) */ 2543 s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1; 2544 if (UNLIKELY(s >= reginfo->strend)) { 2545 break; 2546 } 2547 } 2548 2549 if (utf8_target) { 2550 GCB_enum before = getGCB_VAL_UTF8( 2551 reghop3((U8*)s, -1, 2552 (U8*)(reginfo->strbeg)), 2553 (U8*) reginfo->strend); 2554 while (s < strend) { 2555 GCB_enum after = getGCB_VAL_UTF8((U8*) s, 2556 (U8*) reginfo->strend); 2557 if ( (to_complement ^ isGCB(before, 2558 after, 2559 (U8*) reginfo->strbeg, 2560 (U8*) s, 2561 utf8_target)) 2562 && (reginfo->intuit || regtry(reginfo, &s))) 2563 { 2564 goto got_it; 2565 } 2566 before = after; 2567 s += UTF8_SAFE_SKIP(s, reginfo->strend); 2568 } 2569 } 2570 else { /* Not utf8. Everything is a GCB except between CR and 2571 LF */ 2572 while (s < strend) { 2573 if ((to_complement ^ ( UCHARAT(s - 1) != '\r' 2574 || UCHARAT(s) != '\n')) 2575 && (reginfo->intuit || regtry(reginfo, &s))) 2576 { 2577 goto got_it; 2578 } 2579 s++; 2580 } 2581 } 2582 2583 /* And, since this is a bound, it can match after the final 2584 * character in the string */ 2585 if ( reginfo->intuit 2586 || (s <= reginfo->strend && regtry(reginfo, &s))) 2587 { 2588 goto got_it; 2589 } 2590 break; 2591 2592 case LB_BOUND: 2593 if (s == reginfo->strbeg) { 2594 if (reginfo->intuit || regtry(reginfo, &s)) { 2595 goto got_it; 2596 } 2597 s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1; 2598 if (UNLIKELY(s >= reginfo->strend)) { 2599 break; 2600 } 2601 } 2602 2603 if (utf8_target) { 2604 LB_enum before = getLB_VAL_UTF8(reghop3((U8*)s, 2605 -1, 2606 (U8*)(reginfo->strbeg)), 2607 (U8*) reginfo->strend); 2608 while (s < strend) { 2609 LB_enum after = getLB_VAL_UTF8((U8*) s, (U8*) reginfo->strend); 2610 if (to_complement ^ isLB(before, 2611 after, 2612 (U8*) reginfo->strbeg, 2613 (U8*) s, 2614 (U8*) reginfo->strend, 2615 utf8_target) 2616 && (reginfo->intuit || regtry(reginfo, &s))) 2617 { 2618 goto got_it; 2619 } 2620 before = after; 2621 s += UTF8_SAFE_SKIP(s, reginfo->strend); 2622 } 2623 } 2624 else { /* Not utf8. */ 2625 LB_enum before = getLB_VAL_CP((U8) *(s -1)); 2626 while (s < strend) { 2627 LB_enum after = getLB_VAL_CP((U8) *s); 2628 if (to_complement ^ isLB(before, 2629 after, 2630 (U8*) reginfo->strbeg, 2631 (U8*) s, 2632 (U8*) reginfo->strend, 2633 utf8_target) 2634 && (reginfo->intuit || regtry(reginfo, &s))) 2635 { 2636 goto got_it; 2637 } 2638 before = after; 2639 s++; 2640 } 2641 } 2642 2643 if ( reginfo->intuit 2644 || (s <= reginfo->strend && regtry(reginfo, &s))) 2645 { 2646 goto got_it; 2647 } 2648 2649 break; 2650 2651 case SB_BOUND: 2652 if (s == reginfo->strbeg) { 2653 if (reginfo->intuit || regtry(reginfo, &s)) { 2654 goto got_it; 2655 } 2656 s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1; 2657 if (UNLIKELY(s >= reginfo->strend)) { 2658 break; 2659 } 2660 } 2661 2662 if (utf8_target) { 2663 SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s, 2664 -1, 2665 (U8*)(reginfo->strbeg)), 2666 (U8*) reginfo->strend); 2667 while (s < strend) { 2668 SB_enum after = getSB_VAL_UTF8((U8*) s, 2669 (U8*) reginfo->strend); 2670 if ((to_complement ^ isSB(before, 2671 after, 2672 (U8*) reginfo->strbeg, 2673 (U8*) s, 2674 (U8*) reginfo->strend, 2675 utf8_target)) 2676 && (reginfo->intuit || regtry(reginfo, &s))) 2677 { 2678 goto got_it; 2679 } 2680 before = after; 2681 s += UTF8_SAFE_SKIP(s, reginfo->strend); 2682 } 2683 } 2684 else { /* Not utf8. */ 2685 SB_enum before = getSB_VAL_CP((U8) *(s -1)); 2686 while (s < strend) { 2687 SB_enum after = getSB_VAL_CP((U8) *s); 2688 if ((to_complement ^ isSB(before, 2689 after, 2690 (U8*) reginfo->strbeg, 2691 (U8*) s, 2692 (U8*) reginfo->strend, 2693 utf8_target)) 2694 && (reginfo->intuit || regtry(reginfo, &s))) 2695 { 2696 goto got_it; 2697 } 2698 before = after; 2699 s++; 2700 } 2701 } 2702 2703 /* Here are at the final position in the target string. The SB 2704 * value is always true here, so matches, depending on other 2705 * constraints */ 2706 if ( reginfo->intuit 2707 || (s <= reginfo->strend && regtry(reginfo, &s))) 2708 { 2709 goto got_it; 2710 } 2711 2712 break; 2713 2714 case WB_BOUND: 2715 if (s == reginfo->strbeg) { 2716 if (reginfo->intuit || regtry(reginfo, &s)) { 2717 goto got_it; 2718 } 2719 s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1; 2720 if (UNLIKELY(s >= reginfo->strend)) { 2721 break; 2722 } 2723 } 2724 2725 if (utf8_target) { 2726 /* We are at a boundary between char_sub_0 and char_sub_1. 2727 * We also keep track of the value for char_sub_-1 as we 2728 * loop through the line. Context may be needed to make a 2729 * determination, and if so, this can save having to 2730 * recalculate it */ 2731 WB_enum previous = WB_UNKNOWN; 2732 WB_enum before = getWB_VAL_UTF8( 2733 reghop3((U8*)s, 2734 -1, 2735 (U8*)(reginfo->strbeg)), 2736 (U8*) reginfo->strend); 2737 while (s < strend) { 2738 WB_enum after = getWB_VAL_UTF8((U8*) s, 2739 (U8*) reginfo->strend); 2740 if ((to_complement ^ isWB(previous, 2741 before, 2742 after, 2743 (U8*) reginfo->strbeg, 2744 (U8*) s, 2745 (U8*) reginfo->strend, 2746 utf8_target)) 2747 && (reginfo->intuit || regtry(reginfo, &s))) 2748 { 2749 goto got_it; 2750 } 2751 previous = before; 2752 before = after; 2753 s += UTF8_SAFE_SKIP(s, reginfo->strend); 2754 } 2755 } 2756 else { /* Not utf8. */ 2757 WB_enum previous = WB_UNKNOWN; 2758 WB_enum before = getWB_VAL_CP((U8) *(s -1)); 2759 while (s < strend) { 2760 WB_enum after = getWB_VAL_CP((U8) *s); 2761 if ((to_complement ^ isWB(previous, 2762 before, 2763 after, 2764 (U8*) reginfo->strbeg, 2765 (U8*) s, 2766 (U8*) reginfo->strend, 2767 utf8_target)) 2768 && (reginfo->intuit || regtry(reginfo, &s))) 2769 { 2770 goto got_it; 2771 } 2772 previous = before; 2773 before = after; 2774 s++; 2775 } 2776 } 2777 2778 if ( reginfo->intuit 2779 || (s <= reginfo->strend && regtry(reginfo, &s))) 2780 { 2781 goto got_it; 2782 } 2783 } 2784 break; 2785 2786 case LNBREAK: 2787 REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend), 2788 is_LNBREAK_latin1_safe(s, strend) 2789 ); 2790 break; 2791 2792 /* The argument to all the POSIX node types is the class number to pass to 2793 * _generic_isCC() to build a mask for searching in PL_charclass[] */ 2794 2795 case NPOSIXL: 2796 to_complement = 1; 2797 /* FALLTHROUGH */ 2798 2799 case POSIXL: 2800 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; 2801 REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s, (U8 *) strend)), 2802 to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s))); 2803 break; 2804 2805 case NPOSIXD: 2806 to_complement = 1; 2807 /* FALLTHROUGH */ 2808 2809 case POSIXD: 2810 if (utf8_target) { 2811 goto posix_utf8; 2812 } 2813 goto posixa; 2814 2815 case NPOSIXA: 2816 if (utf8_target) { 2817 /* The complement of something that matches only ASCII matches all 2818 * non-ASCII, plus everything in ASCII that isn't in the class. */ 2819 REXEC_FBC_CLASS_SCAN(1, ! isASCII_utf8_safe(s, strend) 2820 || ! _generic_isCC_A(*s, FLAGS(c))); 2821 break; 2822 } 2823 2824 to_complement = 1; 2825 goto posixa; 2826 2827 case POSIXA: 2828 /* Don't need to worry about utf8, as it can match only a single 2829 * byte invariant character. But we do anyway for performance reasons, 2830 * as otherwise we would have to examine all the continuation 2831 * characters */ 2832 if (utf8_target) { 2833 REXEC_FBC_CLASS_SCAN(1, _generic_isCC_A(*s, FLAGS(c))); 2834 break; 2835 } 2836 2837 posixa: 2838 REXEC_FBC_CLASS_SCAN(0, /* 0=>not-utf8 */ 2839 to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c)))); 2840 break; 2841 2842 case NPOSIXU: 2843 to_complement = 1; 2844 /* FALLTHROUGH */ 2845 2846 case POSIXU: 2847 if (! utf8_target) { 2848 REXEC_FBC_CLASS_SCAN(0, /* 0=>not-utf8 */ 2849 to_complement ^ cBOOL(_generic_isCC(*s, 2850 FLAGS(c)))); 2851 } 2852 else { 2853 2854 posix_utf8: 2855 classnum = (_char_class_number) FLAGS(c); 2856 switch (classnum) { 2857 default: 2858 REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */ 2859 to_complement ^ cBOOL(_invlist_contains_cp( 2860 PL_XPosix_ptrs[classnum], 2861 utf8_to_uvchr_buf((U8 *) s, 2862 (U8 *) strend, 2863 NULL)))); 2864 break; 2865 case _CC_ENUM_SPACE: 2866 REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */ 2867 to_complement ^ cBOOL(isSPACE_utf8_safe(s, strend))); 2868 break; 2869 2870 case _CC_ENUM_BLANK: 2871 REXEC_FBC_CLASS_SCAN(1, 2872 to_complement ^ cBOOL(isBLANK_utf8_safe(s, strend))); 2873 break; 2874 2875 case _CC_ENUM_XDIGIT: 2876 REXEC_FBC_CLASS_SCAN(1, 2877 to_complement ^ cBOOL(isXDIGIT_utf8_safe(s, strend))); 2878 break; 2879 2880 case _CC_ENUM_VERTSPACE: 2881 REXEC_FBC_CLASS_SCAN(1, 2882 to_complement ^ cBOOL(isVERTWS_utf8_safe(s, strend))); 2883 break; 2884 2885 case _CC_ENUM_CNTRL: 2886 REXEC_FBC_CLASS_SCAN(1, 2887 to_complement ^ cBOOL(isCNTRL_utf8_safe(s, strend))); 2888 break; 2889 } 2890 } 2891 break; 2892 2893 case AHOCORASICKC: 2894 case AHOCORASICK: 2895 { 2896 DECL_TRIE_TYPE(c); 2897 /* what trie are we using right now */ 2898 reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ]; 2899 reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ]; 2900 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]); 2901 2902 const char *last_start = strend - trie->minlen; 2903 #ifdef DEBUGGING 2904 const char *real_start = s; 2905 #endif 2906 STRLEN maxlen = trie->maxlen; 2907 SV *sv_points; 2908 U8 **points; /* map of where we were in the input string 2909 when reading a given char. For ASCII this 2910 is unnecessary overhead as the relationship 2911 is always 1:1, but for Unicode, especially 2912 case folded Unicode this is not true. */ 2913 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; 2914 U8 *bitmap=NULL; 2915 2916 2917 DECLARE_AND_GET_RE_DEBUG_FLAGS; 2918 2919 /* We can't just allocate points here. We need to wrap it in 2920 * an SV so it gets freed properly if there is a croak while 2921 * running the match */ 2922 ENTER; 2923 SAVETMPS; 2924 sv_points=newSV(maxlen * sizeof(U8 *)); 2925 SvCUR_set(sv_points, 2926 maxlen * sizeof(U8 *)); 2927 SvPOK_on(sv_points); 2928 sv_2mortal(sv_points); 2929 points=(U8**)SvPV_nolen(sv_points ); 2930 if ( trie_type != trie_utf8_fold 2931 && (trie->bitmap || OP(c)==AHOCORASICKC) ) 2932 { 2933 if (trie->bitmap) 2934 bitmap=(U8*)trie->bitmap; 2935 else 2936 bitmap=(U8*)ANYOF_BITMAP(c); 2937 } 2938 /* this is the Aho-Corasick algorithm modified a touch 2939 to include special handling for long "unknown char" sequences. 2940 The basic idea being that we use AC as long as we are dealing 2941 with a possible matching char, when we encounter an unknown char 2942 (and we have not encountered an accepting state) we scan forward 2943 until we find a legal starting char. 2944 AC matching is basically that of trie matching, except that when 2945 we encounter a failing transition, we fall back to the current 2946 states "fail state", and try the current char again, a process 2947 we repeat until we reach the root state, state 1, or a legal 2948 transition. If we fail on the root state then we can either 2949 terminate if we have reached an accepting state previously, or 2950 restart the entire process from the beginning if we have not. 2951 2952 */ 2953 while (s <= last_start) { 2954 const U32 uniflags = UTF8_ALLOW_DEFAULT; 2955 U8 *uc = (U8*)s; 2956 U16 charid = 0; 2957 U32 base = 1; 2958 U32 state = 1; 2959 UV uvc = 0; 2960 STRLEN len = 0; 2961 STRLEN foldlen = 0; 2962 U8 *uscan = (U8*)NULL; 2963 U8 *leftmost = NULL; 2964 #ifdef DEBUGGING 2965 U32 accepted_word= 0; 2966 #endif 2967 U32 pointpos = 0; 2968 2969 while ( state && uc <= (U8*)strend ) { 2970 int failed=0; 2971 U32 word = aho->states[ state ].wordnum; 2972 2973 if( state==1 ) { 2974 if ( bitmap ) { 2975 DEBUG_TRIE_EXECUTE_r( 2976 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) { 2977 dump_exec_pos( (char *)uc, c, strend, real_start, 2978 (char *)uc, utf8_target, 0 ); 2979 Perl_re_printf( aTHX_ 2980 " Scanning for legal start char...\n"); 2981 } 2982 ); 2983 if (utf8_target) { 2984 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) { 2985 uc += UTF8SKIP(uc); 2986 } 2987 } else { 2988 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) { 2989 uc++; 2990 } 2991 } 2992 s= (char *)uc; 2993 } 2994 if (uc >(U8*)last_start) break; 2995 } 2996 2997 if ( word ) { 2998 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ]; 2999 if (!leftmost || lpos < leftmost) { 3000 DEBUG_r(accepted_word=word); 3001 leftmost= lpos; 3002 } 3003 if (base==0) break; 3004 3005 } 3006 points[pointpos++ % maxlen]= uc; 3007 if (foldlen || uc < (U8*)strend) { 3008 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, 3009 (U8 *) strend, uscan, len, uvc, 3010 charid, foldlen, foldbuf, 3011 uniflags); 3012 DEBUG_TRIE_EXECUTE_r({ 3013 dump_exec_pos( (char *)uc, c, strend, 3014 real_start, s, utf8_target, 0); 3015 Perl_re_printf( aTHX_ 3016 " Charid:%3u CP:%4" UVxf " ", 3017 charid, uvc); 3018 }); 3019 } 3020 else { 3021 len = 0; 3022 charid = 0; 3023 } 3024 3025 3026 do { 3027 #ifdef DEBUGGING 3028 word = aho->states[ state ].wordnum; 3029 #endif 3030 base = aho->states[ state ].trans.base; 3031 3032 DEBUG_TRIE_EXECUTE_r({ 3033 if (failed) 3034 dump_exec_pos( (char *)uc, c, strend, real_start, 3035 s, utf8_target, 0 ); 3036 Perl_re_printf( aTHX_ 3037 "%sState: %4" UVxf ", word=%" UVxf, 3038 failed ? " Fail transition to " : "", 3039 (UV)state, (UV)word); 3040 }); 3041 if ( base ) { 3042 U32 tmp; 3043 I32 offset; 3044 if (charid && 3045 ( ((offset = base + charid 3046 - 1 - trie->uniquecharcount)) >= 0) 3047 && ((U32)offset < trie->lasttrans) 3048 && trie->trans[offset].check == state 3049 && (tmp=trie->trans[offset].next)) 3050 { 3051 DEBUG_TRIE_EXECUTE_r( 3052 Perl_re_printf( aTHX_ " - legal\n")); 3053 state = tmp; 3054 break; 3055 } 3056 else { 3057 DEBUG_TRIE_EXECUTE_r( 3058 Perl_re_printf( aTHX_ " - fail\n")); 3059 failed = 1; 3060 state = aho->fail[state]; 3061 } 3062 } 3063 else { 3064 /* we must be accepting here */ 3065 DEBUG_TRIE_EXECUTE_r( 3066 Perl_re_printf( aTHX_ " - accepting\n")); 3067 failed = 1; 3068 break; 3069 } 3070 } while(state); 3071 uc += len; 3072 if (failed) { 3073 if (leftmost) 3074 break; 3075 if (!state) state = 1; 3076 } 3077 } 3078 if ( aho->states[ state ].wordnum ) { 3079 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ]; 3080 if (!leftmost || lpos < leftmost) { 3081 DEBUG_r(accepted_word=aho->states[ state ].wordnum); 3082 leftmost = lpos; 3083 } 3084 } 3085 if (leftmost) { 3086 s = (char*)leftmost; 3087 DEBUG_TRIE_EXECUTE_r({ 3088 Perl_re_printf( aTHX_ "Matches word #%" UVxf " at position %" IVdf ". Trying full pattern...\n", 3089 (UV)accepted_word, (IV)(s - real_start) 3090 ); 3091 }); 3092 if (reginfo->intuit || regtry(reginfo, &s)) { 3093 FREETMPS; 3094 LEAVE; 3095 goto got_it; 3096 } 3097 if (s < reginfo->strend) { 3098 s = HOPc(s,1); 3099 } 3100 DEBUG_TRIE_EXECUTE_r({ 3101 Perl_re_printf( aTHX_ "Pattern failed. Looking for new start point...\n"); 3102 }); 3103 } else { 3104 DEBUG_TRIE_EXECUTE_r( 3105 Perl_re_printf( aTHX_ "No match.\n")); 3106 break; 3107 } 3108 } 3109 FREETMPS; 3110 LEAVE; 3111 } 3112 break; 3113 default: 3114 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c)); 3115 } 3116 return 0; 3117 got_it: 3118 return s; 3119 } 3120 3121 /* set RX_SAVED_COPY, RX_SUBBEG etc. 3122 * flags have same meanings as with regexec_flags() */ 3123 3124 static void 3125 S_reg_set_capture_string(pTHX_ REGEXP * const rx, 3126 char *strbeg, 3127 char *strend, 3128 SV *sv, 3129 U32 flags, 3130 bool utf8_target) 3131 { 3132 struct regexp *const prog = ReANY(rx); 3133 3134 if (flags & REXEC_COPY_STR) { 3135 #ifdef PERL_ANY_COW 3136 if (SvCANCOW(sv)) { 3137 DEBUG_C(Perl_re_printf( aTHX_ 3138 "Copy on write: regexp capture, type %d\n", 3139 (int) SvTYPE(sv))); 3140 /* Create a new COW SV to share the match string and store 3141 * in saved_copy, unless the current COW SV in saved_copy 3142 * is valid and suitable for our purpose */ 3143 if (( prog->saved_copy 3144 && SvIsCOW(prog->saved_copy) 3145 && SvPOKp(prog->saved_copy) 3146 && SvIsCOW(sv) 3147 && SvPOKp(sv) 3148 && SvPVX(sv) == SvPVX(prog->saved_copy))) 3149 { 3150 /* just reuse saved_copy SV */ 3151 if (RXp_MATCH_COPIED(prog)) { 3152 Safefree(prog->subbeg); 3153 RXp_MATCH_COPIED_off(prog); 3154 } 3155 } 3156 else { 3157 /* create new COW SV to share string */ 3158 RXp_MATCH_COPY_FREE(prog); 3159 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv); 3160 } 3161 prog->subbeg = (char *)SvPVX_const(prog->saved_copy); 3162 assert (SvPOKp(prog->saved_copy)); 3163 prog->sublen = strend - strbeg; 3164 prog->suboffset = 0; 3165 prog->subcoffset = 0; 3166 } else 3167 #endif 3168 { 3169 SSize_t min = 0; 3170 SSize_t max = strend - strbeg; 3171 SSize_t sublen; 3172 3173 if ( (flags & REXEC_COPY_SKIP_POST) 3174 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */ 3175 && !(PL_sawampersand & SAWAMPERSAND_RIGHT) 3176 ) { /* don't copy $' part of string */ 3177 U32 n = 0; 3178 max = -1; 3179 /* calculate the right-most part of the string covered 3180 * by a capture. Due to lookahead, this may be to 3181 * the right of $&, so we have to scan all captures */ 3182 while (n <= prog->lastparen) { 3183 if (prog->offs[n].end > max) 3184 max = prog->offs[n].end; 3185 n++; 3186 } 3187 if (max == -1) 3188 max = (PL_sawampersand & SAWAMPERSAND_LEFT) 3189 ? prog->offs[0].start 3190 : 0; 3191 assert(max >= 0 && max <= strend - strbeg); 3192 } 3193 3194 if ( (flags & REXEC_COPY_SKIP_PRE) 3195 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */ 3196 && !(PL_sawampersand & SAWAMPERSAND_LEFT) 3197 ) { /* don't copy $` part of string */ 3198 U32 n = 0; 3199 min = max; 3200 /* calculate the left-most part of the string covered 3201 * by a capture. Due to lookbehind, this may be to 3202 * the left of $&, so we have to scan all captures */ 3203 while (min && n <= prog->lastparen) { 3204 if ( prog->offs[n].start != -1 3205 && prog->offs[n].start < min) 3206 { 3207 min = prog->offs[n].start; 3208 } 3209 n++; 3210 } 3211 if ((PL_sawampersand & SAWAMPERSAND_RIGHT) 3212 && min > prog->offs[0].end 3213 ) 3214 min = prog->offs[0].end; 3215 3216 } 3217 3218 assert(min >= 0 && min <= max && min <= strend - strbeg); 3219 sublen = max - min; 3220 3221 if (RXp_MATCH_COPIED(prog)) { 3222 if (sublen > prog->sublen) 3223 prog->subbeg = 3224 (char*)saferealloc(prog->subbeg, sublen+1); 3225 } 3226 else 3227 prog->subbeg = (char*)safemalloc(sublen+1); 3228 Copy(strbeg + min, prog->subbeg, sublen, char); 3229 prog->subbeg[sublen] = '\0'; 3230 prog->suboffset = min; 3231 prog->sublen = sublen; 3232 RXp_MATCH_COPIED_on(prog); 3233 } 3234 prog->subcoffset = prog->suboffset; 3235 if (prog->suboffset && utf8_target) { 3236 /* Convert byte offset to chars. 3237 * XXX ideally should only compute this if @-/@+ 3238 * has been seen, a la PL_sawampersand ??? */ 3239 3240 /* If there's a direct correspondence between the 3241 * string which we're matching and the original SV, 3242 * then we can use the utf8 len cache associated with 3243 * the SV. In particular, it means that under //g, 3244 * sv_pos_b2u() will use the previously cached 3245 * position to speed up working out the new length of 3246 * subcoffset, rather than counting from the start of 3247 * the string each time. This stops 3248 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g; 3249 * from going quadratic */ 3250 if (SvPOKp(sv) && SvPVX(sv) == strbeg) 3251 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset, 3252 SV_GMAGIC|SV_CONST_RETURN); 3253 else 3254 prog->subcoffset = utf8_length((U8*)strbeg, 3255 (U8*)(strbeg+prog->suboffset)); 3256 } 3257 } 3258 else { 3259 RXp_MATCH_COPY_FREE(prog); 3260 prog->subbeg = strbeg; 3261 prog->suboffset = 0; 3262 prog->subcoffset = 0; 3263 prog->sublen = strend - strbeg; 3264 } 3265 } 3266 3267 3268 3269 3270 /* 3271 - regexec_flags - match a regexp against a string 3272 */ 3273 I32 3274 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, 3275 char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags) 3276 /* stringarg: the point in the string at which to begin matching */ 3277 /* strend: pointer to null at end of string */ 3278 /* strbeg: real beginning of string */ 3279 /* minend: end of match must be >= minend bytes after stringarg. */ 3280 /* sv: SV being matched: only used for utf8 flag, pos() etc; string 3281 * itself is accessed via the pointers above */ 3282 /* data: May be used for some additional optimizations. 3283 Currently unused. */ 3284 /* flags: For optimizations. See REXEC_* in regexp.h */ 3285 3286 { 3287 struct regexp *const prog = ReANY(rx); 3288 char *s; 3289 regnode *c; 3290 char *startpos; 3291 SSize_t minlen; /* must match at least this many chars */ 3292 SSize_t dontbother = 0; /* how many characters not to try at end */ 3293 const bool utf8_target = cBOOL(DO_UTF8(sv)); 3294 I32 multiline; 3295 RXi_GET_DECL(prog,progi); 3296 regmatch_info reginfo_buf; /* create some info to pass to regtry etc */ 3297 regmatch_info *const reginfo = ®info_buf; 3298 regexp_paren_pair *swap = NULL; 3299 I32 oldsave; 3300 DECLARE_AND_GET_RE_DEBUG_FLAGS; 3301 3302 PERL_ARGS_ASSERT_REGEXEC_FLAGS; 3303 PERL_UNUSED_ARG(data); 3304 3305 /* Be paranoid... */ 3306 if (prog == NULL) { 3307 Perl_croak(aTHX_ "NULL regexp parameter"); 3308 } 3309 3310 DEBUG_EXECUTE_r( 3311 debug_start_match(rx, utf8_target, stringarg, strend, 3312 "Matching"); 3313 ); 3314 3315 startpos = stringarg; 3316 3317 /* set these early as they may be used by the HOP macros below */ 3318 reginfo->strbeg = strbeg; 3319 reginfo->strend = strend; 3320 reginfo->is_utf8_target = cBOOL(utf8_target); 3321 3322 if (prog->intflags & PREGf_GPOS_SEEN) { 3323 MAGIC *mg; 3324 3325 /* set reginfo->ganch, the position where \G can match */ 3326 3327 reginfo->ganch = 3328 (flags & REXEC_IGNOREPOS) 3329 ? stringarg /* use start pos rather than pos() */ 3330 : ((mg = mg_find_mglob(sv)) && mg->mg_len >= 0) 3331 /* Defined pos(): */ 3332 ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg) 3333 : strbeg; /* pos() not defined; use start of string */ 3334 3335 DEBUG_GPOS_r(Perl_re_printf( aTHX_ 3336 "GPOS ganch set to strbeg[%" IVdf "]\n", (IV)(reginfo->ganch - strbeg))); 3337 3338 /* in the presence of \G, we may need to start looking earlier in 3339 * the string than the suggested start point of stringarg: 3340 * if prog->gofs is set, then that's a known, fixed minimum 3341 * offset, such as 3342 * /..\G/: gofs = 2 3343 * /ab|c\G/: gofs = 1 3344 * or if the minimum offset isn't known, then we have to go back 3345 * to the start of the string, e.g. /w+\G/ 3346 */ 3347 3348 if (prog->intflags & PREGf_ANCH_GPOS) { 3349 if (prog->gofs) { 3350 startpos = HOPBACKc(reginfo->ganch, prog->gofs); 3351 if (!startpos || 3352 ((flags & REXEC_FAIL_ON_UNDERFLOW) && startpos < stringarg)) 3353 { 3354 DEBUG_GPOS_r(Perl_re_printf( aTHX_ 3355 "fail: ganch-gofs before earliest possible start\n")); 3356 return 0; 3357 } 3358 } 3359 else 3360 startpos = reginfo->ganch; 3361 } 3362 else if (prog->gofs) { 3363 startpos = HOPBACKc(startpos, prog->gofs); 3364 if (!startpos) 3365 startpos = strbeg; 3366 } 3367 else if (prog->intflags & PREGf_GPOS_FLOAT) 3368 startpos = strbeg; 3369 } 3370 3371 minlen = prog->minlen; 3372 if ((startpos + minlen) > strend || startpos < strbeg) { 3373 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 3374 "Regex match can't succeed, so not even tried\n")); 3375 return 0; 3376 } 3377 3378 /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave), 3379 * which will call destuctors to reset PL_regmatch_state, free higher 3380 * PL_regmatch_slabs, and clean up regmatch_info_aux and 3381 * regmatch_info_aux_eval */ 3382 3383 oldsave = PL_savestack_ix; 3384 3385 s = startpos; 3386 3387 if ((prog->extflags & RXf_USE_INTUIT) 3388 && !(flags & REXEC_CHECKED)) 3389 { 3390 s = re_intuit_start(rx, sv, strbeg, startpos, strend, 3391 flags, NULL); 3392 if (!s) 3393 return 0; 3394 3395 if (prog->extflags & RXf_CHECK_ALL) { 3396 /* we can match based purely on the result of INTUIT. 3397 * Set up captures etc just for $& and $-[0] 3398 * (an intuit-only match wont have $1,$2,..) */ 3399 assert(!prog->nparens); 3400 3401 /* s/// doesn't like it if $& is earlier than where we asked it to 3402 * start searching (which can happen on something like /.\G/) */ 3403 if ( (flags & REXEC_FAIL_ON_UNDERFLOW) 3404 && (s < stringarg)) 3405 { 3406 /* this should only be possible under \G */ 3407 assert(prog->intflags & PREGf_GPOS_SEEN); 3408 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 3409 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n")); 3410 goto phooey; 3411 } 3412 3413 /* match via INTUIT shouldn't have any captures. 3414 * Let @-, @+, $^N know */ 3415 prog->lastparen = prog->lastcloseparen = 0; 3416 RXp_MATCH_UTF8_set(prog, utf8_target); 3417 prog->offs[0].start = s - strbeg; 3418 prog->offs[0].end = utf8_target 3419 ? (char*)utf8_hop_forward((U8*)s, prog->minlenret, (U8 *) strend) - strbeg 3420 : s - strbeg + prog->minlenret; 3421 if ( !(flags & REXEC_NOT_FIRST) ) 3422 S_reg_set_capture_string(aTHX_ rx, 3423 strbeg, strend, 3424 sv, flags, utf8_target); 3425 3426 return 1; 3427 } 3428 } 3429 3430 multiline = prog->extflags & RXf_PMf_MULTILINE; 3431 3432 if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) { 3433 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 3434 "String too short [regexec_flags]...\n")); 3435 goto phooey; 3436 } 3437 3438 /* Check validity of program. */ 3439 if (UCHARAT(progi->program) != REG_MAGIC) { 3440 Perl_croak(aTHX_ "corrupted regexp program"); 3441 } 3442 3443 RXp_MATCH_TAINTED_off(prog); 3444 RXp_MATCH_UTF8_set(prog, utf8_target); 3445 3446 reginfo->prog = rx; /* Yes, sorry that this is confusing. */ 3447 reginfo->intuit = 0; 3448 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx)); 3449 reginfo->warned = FALSE; 3450 reginfo->sv = sv; 3451 reginfo->poscache_maxiter = 0; /* not yet started a countdown */ 3452 /* see how far we have to get to not match where we matched before */ 3453 reginfo->till = stringarg + minend; 3454 3455 if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) { 3456 /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after 3457 S_cleanup_regmatch_info_aux has executed (registered by 3458 SAVEDESTRUCTOR_X below). S_cleanup_regmatch_info_aux modifies 3459 magic belonging to this SV. 3460 Not newSVsv, either, as it does not COW. 3461 */ 3462 reginfo->sv = newSV(0); 3463 SvSetSV_nosteal(reginfo->sv, sv); 3464 SAVEFREESV(reginfo->sv); 3465 } 3466 3467 /* reserve next 2 or 3 slots in PL_regmatch_state: 3468 * slot N+0: may currently be in use: skip it 3469 * slot N+1: use for regmatch_info_aux struct 3470 * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s 3471 * slot N+3: ready for use by regmatch() 3472 */ 3473 3474 { 3475 regmatch_state *old_regmatch_state; 3476 regmatch_slab *old_regmatch_slab; 3477 int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1; 3478 3479 /* on first ever match, allocate first slab */ 3480 if (!PL_regmatch_slab) { 3481 Newx(PL_regmatch_slab, 1, regmatch_slab); 3482 PL_regmatch_slab->prev = NULL; 3483 PL_regmatch_slab->next = NULL; 3484 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab); 3485 } 3486 3487 old_regmatch_state = PL_regmatch_state; 3488 old_regmatch_slab = PL_regmatch_slab; 3489 3490 for (i=0; i <= max; i++) { 3491 if (i == 1) 3492 reginfo->info_aux = &(PL_regmatch_state->u.info_aux); 3493 else if (i ==2) 3494 reginfo->info_aux_eval = 3495 reginfo->info_aux->info_aux_eval = 3496 &(PL_regmatch_state->u.info_aux_eval); 3497 3498 if (++PL_regmatch_state > SLAB_LAST(PL_regmatch_slab)) 3499 PL_regmatch_state = S_push_slab(aTHX); 3500 } 3501 3502 /* note initial PL_regmatch_state position; at end of match we'll 3503 * pop back to there and free any higher slabs */ 3504 3505 reginfo->info_aux->old_regmatch_state = old_regmatch_state; 3506 reginfo->info_aux->old_regmatch_slab = old_regmatch_slab; 3507 reginfo->info_aux->poscache = NULL; 3508 3509 SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux); 3510 3511 if ((prog->extflags & RXf_EVAL_SEEN)) 3512 S_setup_eval_state(aTHX_ reginfo); 3513 else 3514 reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL; 3515 } 3516 3517 /* If there is a "must appear" string, look for it. */ 3518 3519 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) { 3520 /* We have to be careful. If the previous successful match 3521 was from this regex we don't want a subsequent partially 3522 successful match to clobber the old results. 3523 So when we detect this possibility we add a swap buffer 3524 to the re, and switch the buffer each match. If we fail, 3525 we switch it back; otherwise we leave it swapped. 3526 */ 3527 swap = prog->offs; 3528 /* avoid leak if we die, or clean up anyway if match completes */ 3529 SAVEFREEPV(swap); 3530 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair); 3531 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ 3532 "rex=0x%" UVxf " saving offs: orig=0x%" UVxf " new=0x%" UVxf "\n", 3533 0, 3534 PTR2UV(prog), 3535 PTR2UV(swap), 3536 PTR2UV(prog->offs) 3537 )); 3538 } 3539 3540 if (prog->recurse_locinput) 3541 Zero(prog->recurse_locinput,prog->nparens + 1, char *); 3542 3543 /* Simplest case: anchored match need be tried only once, or with 3544 * MBOL, only at the beginning of each line. 3545 * 3546 * Note that /.*.../ sets PREGf_IMPLICIT|MBOL, while /.*.../s sets 3547 * PREGf_IMPLICIT|SBOL. The idea is that with /.*.../s, if it doesn't 3548 * match at the start of the string then it won't match anywhere else 3549 * either; while with /.*.../, if it doesn't match at the beginning, 3550 * the earliest it could match is at the start of the next line */ 3551 3552 if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) { 3553 char *end; 3554 3555 if (regtry(reginfo, &s)) 3556 goto got_it; 3557 3558 if (!(prog->intflags & PREGf_ANCH_MBOL)) 3559 goto phooey; 3560 3561 /* didn't match at start, try at other newline positions */ 3562 3563 if (minlen) 3564 dontbother = minlen - 1; 3565 end = HOP3c(strend, -dontbother, strbeg) - 1; 3566 3567 /* skip to next newline */ 3568 3569 while (s <= end) { /* note it could be possible to match at the end of the string */ 3570 /* NB: newlines are the same in unicode as they are in latin */ 3571 if (*s++ != '\n') 3572 continue; 3573 if (prog->check_substr || prog->check_utf8) { 3574 /* note that with PREGf_IMPLICIT, intuit can only fail 3575 * or return the start position, so it's of limited utility. 3576 * Nevertheless, I made the decision that the potential for 3577 * quick fail was still worth it - DAPM */ 3578 s = re_intuit_start(rx, sv, strbeg, s, strend, flags, NULL); 3579 if (!s) 3580 goto phooey; 3581 } 3582 if (regtry(reginfo, &s)) 3583 goto got_it; 3584 } 3585 goto phooey; 3586 } /* end anchored search */ 3587 3588 if (prog->intflags & PREGf_ANCH_GPOS) 3589 { 3590 /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */ 3591 assert(prog->intflags & PREGf_GPOS_SEEN); 3592 /* For anchored \G, the only position it can match from is 3593 * (ganch-gofs); we already set startpos to this above; if intuit 3594 * moved us on from there, we can't possibly succeed */ 3595 assert(startpos == HOPBACKc(reginfo->ganch, prog->gofs)); 3596 if (s == startpos && regtry(reginfo, &s)) 3597 goto got_it; 3598 goto phooey; 3599 } 3600 3601 /* Messy cases: unanchored match. */ 3602 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) { 3603 /* we have /x+whatever/ */ 3604 /* it must be a one character string (XXXX Except is_utf8_pat?) */ 3605 char ch; 3606 #ifdef DEBUGGING 3607 int did_match = 0; 3608 #endif 3609 if (utf8_target) { 3610 if (! prog->anchored_utf8) { 3611 to_utf8_substr(prog); 3612 } 3613 ch = SvPVX_const(prog->anchored_utf8)[0]; 3614 REXEC_FBC_SCAN(0, /* 0=>not-utf8 */ 3615 if (*s == ch) { 3616 DEBUG_EXECUTE_r( did_match = 1 ); 3617 if (regtry(reginfo, &s)) goto got_it; 3618 s += UTF8_SAFE_SKIP(s, strend); 3619 while (s < strend && *s == ch) 3620 s += UTF8SKIP(s); 3621 } 3622 ); 3623 3624 } 3625 else { 3626 if (! prog->anchored_substr) { 3627 if (! to_byte_substr(prog)) { 3628 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey); 3629 } 3630 } 3631 ch = SvPVX_const(prog->anchored_substr)[0]; 3632 REXEC_FBC_SCAN(0, /* 0=>not-utf8 */ 3633 if (*s == ch) { 3634 DEBUG_EXECUTE_r( did_match = 1 ); 3635 if (regtry(reginfo, &s)) goto got_it; 3636 s++; 3637 while (s < strend && *s == ch) 3638 s++; 3639 } 3640 ); 3641 } 3642 DEBUG_EXECUTE_r(if (!did_match) 3643 Perl_re_printf( aTHX_ 3644 "Did not find anchored character...\n") 3645 ); 3646 } 3647 else if (prog->anchored_substr != NULL 3648 || prog->anchored_utf8 != NULL 3649 || ((prog->float_substr != NULL || prog->float_utf8 != NULL) 3650 && prog->float_max_offset < strend - s)) { 3651 SV *must; 3652 SSize_t back_max; 3653 SSize_t back_min; 3654 char *last; 3655 char *last1; /* Last position checked before */ 3656 #ifdef DEBUGGING 3657 int did_match = 0; 3658 #endif 3659 if (prog->anchored_substr || prog->anchored_utf8) { 3660 if (utf8_target) { 3661 if (! prog->anchored_utf8) { 3662 to_utf8_substr(prog); 3663 } 3664 must = prog->anchored_utf8; 3665 } 3666 else { 3667 if (! prog->anchored_substr) { 3668 if (! to_byte_substr(prog)) { 3669 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey); 3670 } 3671 } 3672 must = prog->anchored_substr; 3673 } 3674 back_max = back_min = prog->anchored_offset; 3675 } else { 3676 if (utf8_target) { 3677 if (! prog->float_utf8) { 3678 to_utf8_substr(prog); 3679 } 3680 must = prog->float_utf8; 3681 } 3682 else { 3683 if (! prog->float_substr) { 3684 if (! to_byte_substr(prog)) { 3685 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey); 3686 } 3687 } 3688 must = prog->float_substr; 3689 } 3690 back_max = prog->float_max_offset; 3691 back_min = prog->float_min_offset; 3692 } 3693 3694 if (back_min<0) { 3695 last = strend; 3696 } else { 3697 last = HOP3c(strend, /* Cannot start after this */ 3698 -(SSize_t)(CHR_SVLEN(must) 3699 - (SvTAIL(must) != 0) + back_min), strbeg); 3700 } 3701 if (s > reginfo->strbeg) 3702 last1 = HOPc(s, -1); 3703 else 3704 last1 = s - 1; /* bogus */ 3705 3706 /* XXXX check_substr already used to find "s", can optimize if 3707 check_substr==must. */ 3708 dontbother = 0; 3709 strend = HOPc(strend, -dontbother); 3710 while ( (s <= last) && 3711 (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg, strend), 3712 (unsigned char*)strend, must, 3713 multiline ? FBMrf_MULTILINE : 0)) ) { 3714 DEBUG_EXECUTE_r( did_match = 1 ); 3715 if (HOPc(s, -back_max) > last1) { 3716 last1 = HOPc(s, -back_min); 3717 s = HOPc(s, -back_max); 3718 } 3719 else { 3720 char * const t = (last1 >= reginfo->strbeg) 3721 ? HOPc(last1, 1) : last1 + 1; 3722 3723 last1 = HOPc(s, -back_min); 3724 s = t; 3725 } 3726 if (utf8_target) { 3727 while (s <= last1) { 3728 if (regtry(reginfo, &s)) 3729 goto got_it; 3730 if (s >= last1) { 3731 s++; /* to break out of outer loop */ 3732 break; 3733 } 3734 s += UTF8SKIP(s); 3735 } 3736 } 3737 else { 3738 while (s <= last1) { 3739 if (regtry(reginfo, &s)) 3740 goto got_it; 3741 s++; 3742 } 3743 } 3744 } 3745 DEBUG_EXECUTE_r(if (!did_match) { 3746 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), 3747 SvPVX_const(must), RE_SV_DUMPLEN(must), 30); 3748 Perl_re_printf( aTHX_ "Did not find %s substr %s%s...\n", 3749 ((must == prog->anchored_substr || must == prog->anchored_utf8) 3750 ? "anchored" : "floating"), 3751 quoted, RE_SV_TAIL(must)); 3752 }); 3753 goto phooey; 3754 } 3755 else if ( (c = progi->regstclass) ) { 3756 if (minlen) { 3757 const OPCODE op = OP(progi->regstclass); 3758 /* don't bother with what can't match */ 3759 if (PL_regkind[op] != EXACT && PL_regkind[op] != TRIE) 3760 strend = HOPc(strend, -(minlen - 1)); 3761 } 3762 DEBUG_EXECUTE_r({ 3763 SV * const prop = sv_newmortal(); 3764 regprop(prog, prop, c, reginfo, NULL); 3765 { 3766 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1), 3767 s,strend-s,PL_dump_re_max_len); 3768 Perl_re_printf( aTHX_ 3769 "Matching stclass %.*s against %s (%d bytes)\n", 3770 (int)SvCUR(prop), SvPVX_const(prop), 3771 quoted, (int)(strend - s)); 3772 } 3773 }); 3774 if (find_byclass(prog, c, s, strend, reginfo)) 3775 goto got_it; 3776 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "Contradicts stclass... [regexec_flags]\n")); 3777 } 3778 else { 3779 dontbother = 0; 3780 if (prog->float_substr != NULL || prog->float_utf8 != NULL) { 3781 /* Trim the end. */ 3782 char *last= NULL; 3783 SV* float_real; 3784 STRLEN len; 3785 const char *little; 3786 3787 if (utf8_target) { 3788 if (! prog->float_utf8) { 3789 to_utf8_substr(prog); 3790 } 3791 float_real = prog->float_utf8; 3792 } 3793 else { 3794 if (! prog->float_substr) { 3795 if (! to_byte_substr(prog)) { 3796 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey); 3797 } 3798 } 3799 float_real = prog->float_substr; 3800 } 3801 3802 little = SvPV_const(float_real, len); 3803 if (SvTAIL(float_real)) { 3804 /* This means that float_real contains an artificial \n on 3805 * the end due to the presence of something like this: 3806 * /foo$/ where we can match both "foo" and "foo\n" at the 3807 * end of the string. So we have to compare the end of the 3808 * string first against the float_real without the \n and 3809 * then against the full float_real with the string. We 3810 * have to watch out for cases where the string might be 3811 * smaller than the float_real or the float_real without 3812 * the \n. */ 3813 char *checkpos= strend - len; 3814 DEBUG_OPTIMISE_r( 3815 Perl_re_printf( aTHX_ 3816 "%sChecking for float_real.%s\n", 3817 PL_colors[4], PL_colors[5])); 3818 if (checkpos + 1 < strbeg) { 3819 /* can't match, even if we remove the trailing \n 3820 * string is too short to match */ 3821 DEBUG_EXECUTE_r( 3822 Perl_re_printf( aTHX_ 3823 "%sString shorter than required trailing substring, cannot match.%s\n", 3824 PL_colors[4], PL_colors[5])); 3825 goto phooey; 3826 } else if (memEQ(checkpos + 1, little, len - 1)) { 3827 /* can match, the end of the string matches without the 3828 * "\n" */ 3829 last = checkpos + 1; 3830 } else if (checkpos < strbeg) { 3831 /* cant match, string is too short when the "\n" is 3832 * included */ 3833 DEBUG_EXECUTE_r( 3834 Perl_re_printf( aTHX_ 3835 "%sString does not contain required trailing substring, cannot match.%s\n", 3836 PL_colors[4], PL_colors[5])); 3837 goto phooey; 3838 } else if (!multiline) { 3839 /* non multiline match, so compare with the "\n" at the 3840 * end of the string */ 3841 if (memEQ(checkpos, little, len)) { 3842 last= checkpos; 3843 } else { 3844 DEBUG_EXECUTE_r( 3845 Perl_re_printf( aTHX_ 3846 "%sString does not contain required trailing substring, cannot match.%s\n", 3847 PL_colors[4], PL_colors[5])); 3848 goto phooey; 3849 } 3850 } else { 3851 /* multiline match, so we have to search for a place 3852 * where the full string is located */ 3853 goto find_last; 3854 } 3855 } else { 3856 find_last: 3857 if (len) 3858 last = rninstr(s, strend, little, little + len); 3859 else 3860 last = strend; /* matching "$" */ 3861 } 3862 if (!last) { 3863 /* at one point this block contained a comment which was 3864 * probably incorrect, which said that this was a "should not 3865 * happen" case. Even if it was true when it was written I am 3866 * pretty sure it is not anymore, so I have removed the comment 3867 * and replaced it with this one. Yves */ 3868 DEBUG_EXECUTE_r( 3869 Perl_re_printf( aTHX_ 3870 "%sString does not contain required substring, cannot match.%s\n", 3871 PL_colors[4], PL_colors[5] 3872 )); 3873 goto phooey; 3874 } 3875 dontbother = strend - last + prog->float_min_offset; 3876 } 3877 if (minlen && (dontbother < minlen)) 3878 dontbother = minlen - 1; 3879 strend -= dontbother; /* this one's always in bytes! */ 3880 /* We don't know much -- general case. */ 3881 if (utf8_target) { 3882 for (;;) { 3883 if (regtry(reginfo, &s)) 3884 goto got_it; 3885 if (s >= strend) 3886 break; 3887 s += UTF8SKIP(s); 3888 }; 3889 } 3890 else { 3891 do { 3892 if (regtry(reginfo, &s)) 3893 goto got_it; 3894 } while (s++ < strend); 3895 } 3896 } 3897 3898 /* Failure. */ 3899 goto phooey; 3900 3901 got_it: 3902 /* s/// doesn't like it if $& is earlier than where we asked it to 3903 * start searching (which can happen on something like /.\G/) */ 3904 if ( (flags & REXEC_FAIL_ON_UNDERFLOW) 3905 && (prog->offs[0].start < stringarg - strbeg)) 3906 { 3907 /* this should only be possible under \G */ 3908 assert(prog->intflags & PREGf_GPOS_SEEN); 3909 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 3910 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n")); 3911 goto phooey; 3912 } 3913 3914 /* clean up; this will trigger destructors that will free all slabs 3915 * above the current one, and cleanup the regmatch_info_aux 3916 * and regmatch_info_aux_eval sructs */ 3917 3918 LEAVE_SCOPE(oldsave); 3919 3920 if (RXp_PAREN_NAMES(prog)) 3921 (void)hv_iterinit(RXp_PAREN_NAMES(prog)); 3922 3923 /* make sure $`, $&, $', and $digit will work later */ 3924 if ( !(flags & REXEC_NOT_FIRST) ) 3925 S_reg_set_capture_string(aTHX_ rx, 3926 strbeg, reginfo->strend, 3927 sv, flags, utf8_target); 3928 3929 return 1; 3930 3931 phooey: 3932 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%sMatch failed%s\n", 3933 PL_colors[4], PL_colors[5])); 3934 3935 if (swap) { 3936 /* we failed :-( roll it back. 3937 * Since the swap buffer will be freed on scope exit which follows 3938 * shortly, restore the old captures by copying 'swap's original 3939 * data to the new offs buffer 3940 */ 3941 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ 3942 "rex=0x%" UVxf " rolling back offs: 0x%" UVxf " will be freed; restoring data to =0x%" UVxf "\n", 3943 0, 3944 PTR2UV(prog), 3945 PTR2UV(prog->offs), 3946 PTR2UV(swap) 3947 )); 3948 3949 Copy(swap, prog->offs, prog->nparens + 1, regexp_paren_pair); 3950 } 3951 3952 /* clean up; this will trigger destructors that will free all slabs 3953 * above the current one, and cleanup the regmatch_info_aux 3954 * and regmatch_info_aux_eval sructs */ 3955 3956 LEAVE_SCOPE(oldsave); 3957 3958 return 0; 3959 } 3960 3961 3962 /* Set which rex is pointed to by PL_reg_curpm, handling ref counting. 3963 * Do inc before dec, in case old and new rex are the same */ 3964 #define SET_reg_curpm(Re2) \ 3965 if (reginfo->info_aux_eval) { \ 3966 (void)ReREFCNT_inc(Re2); \ 3967 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \ 3968 PM_SETRE((PL_reg_curpm), (Re2)); \ 3969 } 3970 3971 3972 /* 3973 - regtry - try match at specific point 3974 */ 3975 STATIC bool /* 0 failure, 1 success */ 3976 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp) 3977 { 3978 CHECKPOINT lastcp; 3979 REGEXP *const rx = reginfo->prog; 3980 regexp *const prog = ReANY(rx); 3981 SSize_t result; 3982 #ifdef DEBUGGING 3983 U32 depth = 0; /* used by REGCP_SET */ 3984 #endif 3985 RXi_GET_DECL(prog,progi); 3986 DECLARE_AND_GET_RE_DEBUG_FLAGS; 3987 3988 PERL_ARGS_ASSERT_REGTRY; 3989 3990 reginfo->cutpoint=NULL; 3991 3992 prog->offs[0].start = *startposp - reginfo->strbeg; 3993 prog->lastparen = 0; 3994 prog->lastcloseparen = 0; 3995 3996 /* XXXX What this code is doing here?!!! There should be no need 3997 to do this again and again, prog->lastparen should take care of 3998 this! --ilya*/ 3999 4000 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code. 4001 * Actually, the code in regcppop() (which Ilya may be meaning by 4002 * prog->lastparen), is not needed at all by the test suite 4003 * (op/regexp, op/pat, op/split), but that code is needed otherwise 4004 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/ 4005 * Meanwhile, this code *is* needed for the 4006 * above-mentioned test suite tests to succeed. The common theme 4007 * on those tests seems to be returning null fields from matches. 4008 * --jhi updated by dapm */ 4009 4010 /* After encountering a variant of the issue mentioned above I think 4011 * the point Ilya was making is that if we properly unwind whenever 4012 * we set lastparen to a smaller value then we should not need to do 4013 * this every time, only when needed. So if we have tests that fail if 4014 * we remove this, then it suggests somewhere else we are improperly 4015 * unwinding the lastparen/paren buffers. See UNWIND_PARENS() and 4016 * places it is called, and related regcp() routines. - Yves */ 4017 #if 1 4018 if (prog->nparens) { 4019 regexp_paren_pair *pp = prog->offs; 4020 I32 i; 4021 for (i = prog->nparens; i > (I32)prog->lastparen; i--) { 4022 ++pp; 4023 pp->start = -1; 4024 pp->end = -1; 4025 } 4026 } 4027 #endif 4028 REGCP_SET(lastcp); 4029 result = regmatch(reginfo, *startposp, progi->program + 1); 4030 if (result != -1) { 4031 prog->offs[0].end = result; 4032 return 1; 4033 } 4034 if (reginfo->cutpoint) 4035 *startposp= reginfo->cutpoint; 4036 REGCP_UNWIND(lastcp); 4037 return 0; 4038 } 4039 4040 /* this is used to determine how far from the left messages like 4041 'failed...' are printed in regexec.c. It should be set such that 4042 messages are inline with the regop output that created them. 4043 */ 4044 #define REPORT_CODE_OFF 29 4045 #define INDENT_CHARS(depth) ((int)(depth) % 20) 4046 #ifdef DEBUGGING 4047 int 4048 Perl_re_exec_indentf(pTHX_ const char *fmt, U32 depth, ...) 4049 { 4050 va_list ap; 4051 int result; 4052 PerlIO *f= Perl_debug_log; 4053 PERL_ARGS_ASSERT_RE_EXEC_INDENTF; 4054 va_start(ap, depth); 4055 PerlIO_printf(f, "%*s|%4" UVuf "| %*s", REPORT_CODE_OFF, "", (UV)depth, INDENT_CHARS(depth), "" ); 4056 result = PerlIO_vprintf(f, fmt, ap); 4057 va_end(ap); 4058 return result; 4059 } 4060 #endif /* DEBUGGING */ 4061 4062 /* grab a new slab and return the first slot in it */ 4063 4064 STATIC regmatch_state * 4065 S_push_slab(pTHX) 4066 { 4067 regmatch_slab *s = PL_regmatch_slab->next; 4068 if (!s) { 4069 Newx(s, 1, regmatch_slab); 4070 s->prev = PL_regmatch_slab; 4071 s->next = NULL; 4072 PL_regmatch_slab->next = s; 4073 } 4074 PL_regmatch_slab = s; 4075 return SLAB_FIRST(s); 4076 } 4077 4078 #ifdef DEBUGGING 4079 4080 STATIC void 4081 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target, 4082 const char *start, const char *end, const char *blurb) 4083 { 4084 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0; 4085 4086 PERL_ARGS_ASSERT_DEBUG_START_MATCH; 4087 4088 if (!PL_colorset) 4089 reginitcolors(); 4090 { 4091 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0), 4092 RX_PRECOMP_const(prog), RX_PRELEN(prog), PL_dump_re_max_len); 4093 4094 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1), 4095 start, end - start, PL_dump_re_max_len); 4096 4097 Perl_re_printf( aTHX_ 4098 "%s%s REx%s %s against %s\n", 4099 PL_colors[4], blurb, PL_colors[5], s0, s1); 4100 4101 if (utf8_target||utf8_pat) 4102 Perl_re_printf( aTHX_ "UTF-8 %s%s%s...\n", 4103 utf8_pat ? "pattern" : "", 4104 utf8_pat && utf8_target ? " and " : "", 4105 utf8_target ? "string" : "" 4106 ); 4107 } 4108 } 4109 4110 STATIC void 4111 S_dump_exec_pos(pTHX_ const char *locinput, 4112 const regnode *scan, 4113 const char *loc_regeol, 4114 const char *loc_bostr, 4115 const char *loc_reg_starttry, 4116 const bool utf8_target, 4117 const U32 depth 4118 ) 4119 { 4120 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4]; 4121 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */ 4122 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput); 4123 /* The part of the string before starttry has one color 4124 (pref0_len chars), between starttry and current 4125 position another one (pref_len - pref0_len chars), 4126 after the current position the third one. 4127 We assume that pref0_len <= pref_len, otherwise we 4128 decrease pref0_len. */ 4129 int pref_len = (locinput - loc_bostr) > (5 + taill) - l 4130 ? (5 + taill) - l : locinput - loc_bostr; 4131 int pref0_len; 4132 4133 PERL_ARGS_ASSERT_DUMP_EXEC_POS; 4134 4135 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len))) 4136 pref_len++; 4137 pref0_len = pref_len - (locinput - loc_reg_starttry); 4138 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput) 4139 l = ( loc_regeol - locinput > (5 + taill) - pref_len 4140 ? (5 + taill) - pref_len : loc_regeol - locinput); 4141 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l))) 4142 l--; 4143 if (pref0_len < 0) 4144 pref0_len = 0; 4145 if (pref0_len > pref_len) 4146 pref0_len = pref_len; 4147 { 4148 const int is_uni = utf8_target ? 1 : 0; 4149 4150 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0), 4151 (locinput - pref_len),pref0_len, PL_dump_re_max_len, 4, 5); 4152 4153 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1), 4154 (locinput - pref_len + pref0_len), 4155 pref_len - pref0_len, PL_dump_re_max_len, 2, 3); 4156 4157 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2), 4158 locinput, loc_regeol - locinput, 10, 0, 1); 4159 4160 const STRLEN tlen=len0+len1+len2; 4161 Perl_re_printf( aTHX_ 4162 "%4" IVdf " <%.*s%.*s%s%.*s>%*s|%4u| ", 4163 (IV)(locinput - loc_bostr), 4164 len0, s0, 4165 len1, s1, 4166 (docolor ? "" : "> <"), 4167 len2, s2, 4168 (int)(tlen > 19 ? 0 : 19 - tlen), 4169 "", 4170 depth); 4171 } 4172 } 4173 4174 #endif 4175 4176 /* reg_check_named_buff_matched() 4177 * Checks to see if a named buffer has matched. The data array of 4178 * buffer numbers corresponding to the buffer is expected to reside 4179 * in the regexp->data->data array in the slot stored in the ARG() of 4180 * node involved. Note that this routine doesn't actually care about the 4181 * name, that information is not preserved from compilation to execution. 4182 * Returns the index of the leftmost defined buffer with the given name 4183 * or 0 if non of the buffers matched. 4184 */ 4185 STATIC I32 4186 S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan) 4187 { 4188 I32 n; 4189 RXi_GET_DECL(rex,rexi); 4190 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); 4191 I32 *nums=(I32*)SvPVX(sv_dat); 4192 4193 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED; 4194 4195 for ( n=0; n<SvIVX(sv_dat); n++ ) { 4196 if ((I32)rex->lastparen >= nums[n] && 4197 rex->offs[nums[n]].end != -1) 4198 { 4199 return nums[n]; 4200 } 4201 } 4202 return 0; 4203 } 4204 4205 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */ 4206 #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */ 4207 #define CHRTEST_NOT_A_CP_1 -999 4208 #define CHRTEST_NOT_A_CP_2 -998 4209 4210 static bool 4211 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, 4212 U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo) 4213 { 4214 /* This function determines if there are zero, one, two, or more characters 4215 * that match the first character of the passed-in EXACTish node 4216 * <text_node>, and if there are one or two, it returns them in the 4217 * passed-in pointers. 4218 * 4219 * If it determines that no possible character in the target string can 4220 * match, it returns FALSE; otherwise TRUE. (The FALSE situation occurs if 4221 * the first character in <text_node> requires UTF-8 to represent, and the 4222 * target string isn't in UTF-8.) 4223 * 4224 * If there are more than two characters that could match the beginning of 4225 * <text_node>, or if more context is required to determine a match or not, 4226 * it sets both *<c1p> and *<c2p> to CHRTEST_VOID. 4227 * 4228 * The motiviation behind this function is to allow the caller to set up 4229 * tight loops for matching. If <text_node> is of type EXACT, there is 4230 * only one possible character that can match its first character, and so 4231 * the situation is quite simple. But things get much more complicated if 4232 * folding is involved. It may be that the first character of an EXACTFish 4233 * node doesn't participate in any possible fold, e.g., punctuation, so it 4234 * can be matched only by itself. The vast majority of characters that are 4235 * in folds match just two things, their lower and upper-case equivalents. 4236 * But not all are like that; some have multiple possible matches, or match 4237 * sequences of more than one character. This function sorts all that out. 4238 * 4239 * Consider the patterns A*B or A*?B where A and B are arbitrary. In a 4240 * loop of trying to match A*, we know we can't exit where the thing 4241 * following it isn't a B. And something can't be a B unless it is the 4242 * beginning of B. By putting a quick test for that beginning in a tight 4243 * loop, we can rule out things that can't possibly be B without having to 4244 * break out of the loop, thus avoiding work. Similarly, if A is a single 4245 * character, we can make a tight loop matching A*, using the outputs of 4246 * this function. 4247 * 4248 * If the target string to match isn't in UTF-8, and there aren't 4249 * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to 4250 * the one or two possible octets (which are characters in this situation) 4251 * that can match. In all cases, if there is only one character that can 4252 * match, *<c1p> and *<c2p> will be identical. 4253 * 4254 * If the target string is in UTF-8, the buffers pointed to by <c1_utf8> 4255 * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that 4256 * can match the beginning of <text_node>. They should be declared with at 4257 * least length UTF8_MAXBYTES+1. (If the target string isn't in UTF-8, it is 4258 * undefined what these contain.) If one or both of the buffers are 4259 * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the 4260 * corresponding invariant. If variant, the corresponding *<c1p> and/or 4261 * *<c2p> will be set to a negative number(s) that shouldn't match any code 4262 * point (unless inappropriately coerced to unsigned). *<c1p> will equal 4263 * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */ 4264 4265 const bool utf8_target = reginfo->is_utf8_target; 4266 4267 UV c1 = (UV)CHRTEST_NOT_A_CP_1; 4268 UV c2 = (UV)CHRTEST_NOT_A_CP_2; 4269 bool use_chrtest_void = FALSE; 4270 const bool is_utf8_pat = reginfo->is_utf8_pat; 4271 4272 /* Used when we have both utf8 input and utf8 output, to avoid converting 4273 * to/from code points */ 4274 bool utf8_has_been_setup = FALSE; 4275 4276 dVAR; 4277 4278 U8 *pat = (U8*)STRING(text_node); 4279 U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' }; 4280 4281 if ( OP(text_node) == EXACT 4282 || OP(text_node) == LEXACT 4283 || OP(text_node) == EXACT_REQ8 4284 || OP(text_node) == LEXACT_REQ8 4285 || OP(text_node) == EXACTL) 4286 { 4287 4288 /* In an exact node, only one thing can be matched, that first 4289 * character. If both the pat and the target are UTF-8, we can just 4290 * copy the input to the output, avoiding finding the code point of 4291 * that character */ 4292 if (!is_utf8_pat) { 4293 assert( OP(text_node) != EXACT_REQ8 4294 && OP(text_node) != LEXACT_REQ8); 4295 c2 = c1 = *pat; 4296 } 4297 else if (utf8_target) { 4298 Copy(pat, c1_utf8, UTF8SKIP(pat), U8); 4299 Copy(pat, c2_utf8, UTF8SKIP(pat), U8); 4300 utf8_has_been_setup = TRUE; 4301 } 4302 else if ( OP(text_node) == EXACT_REQ8 4303 || OP(text_node) == LEXACT_REQ8) 4304 { 4305 return FALSE; /* Can only match UTF-8 target */ 4306 } 4307 else { 4308 c2 = c1 = valid_utf8_to_uvchr(pat, NULL); 4309 } 4310 } 4311 else { /* an EXACTFish node */ 4312 U8 *pat_end = pat + STR_LENs(text_node); 4313 4314 /* An EXACTFL node has at least some characters unfolded, because what 4315 * they match is not known until now. So, now is the time to fold 4316 * the first few of them, as many as are needed to determine 'c1' and 4317 * 'c2' later in the routine. If the pattern isn't UTF-8, we only need 4318 * to fold if in a UTF-8 locale, and then only the Sharp S; everything 4319 * else is 1-1 and isn't assumed to be folded. In a UTF-8 pattern, we 4320 * need to fold as many characters as a single character can fold to, 4321 * so that later we can check if the first ones are such a multi-char 4322 * fold. But, in such a pattern only locale-problematic characters 4323 * aren't folded, so we can skip this completely if the first character 4324 * in the node isn't one of the tricky ones */ 4325 if (OP(text_node) == EXACTFL) { 4326 4327 if (! is_utf8_pat) { 4328 if (IN_UTF8_CTYPE_LOCALE && *pat == LATIN_SMALL_LETTER_SHARP_S) 4329 { 4330 folded[0] = folded[1] = 's'; 4331 pat = folded; 4332 pat_end = folded + 2; 4333 } 4334 } 4335 else if (is_PROBLEMATIC_LOCALE_FOLDEDS_START_utf8(pat)) { 4336 U8 *s = pat; 4337 U8 *d = folded; 4338 int i; 4339 4340 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < pat_end; i++) { 4341 if (isASCII(*s) && LIKELY(! PL_in_utf8_turkic_locale)) { 4342 *(d++) = (U8) toFOLD_LC(*s); 4343 s++; 4344 } 4345 else { 4346 STRLEN len; 4347 _toFOLD_utf8_flags(s, 4348 pat_end, 4349 d, 4350 &len, 4351 FOLD_FLAGS_FULL | FOLD_FLAGS_LOCALE); 4352 d += len; 4353 s += UTF8SKIP(s); 4354 } 4355 } 4356 4357 pat = folded; 4358 pat_end = d; 4359 } 4360 } 4361 4362 if ( ( is_utf8_pat && is_MULTI_CHAR_FOLD_utf8_safe(pat, pat_end)) 4363 || (!is_utf8_pat && is_MULTI_CHAR_FOLD_latin1_safe(pat, pat_end))) 4364 { 4365 /* Multi-character folds require more context to sort out. Also 4366 * PL_utf8_foldclosures used below doesn't handle them, so have to 4367 * be handled outside this routine */ 4368 use_chrtest_void = TRUE; 4369 } 4370 else { /* an EXACTFish node which doesn't begin with a multi-char fold */ 4371 c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat; 4372 4373 if ( UNLIKELY(PL_in_utf8_turkic_locale) 4374 && OP(text_node) == EXACTFL 4375 && UNLIKELY( c1 == 'i' || c1 == 'I' 4376 || c1 == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE 4377 || c1 == LATIN_SMALL_LETTER_DOTLESS_I)) 4378 { /* Hard-coded Turkish locale rules for these 4 characters 4379 override normal rules */ 4380 if (c1 == 'i') { 4381 c2 = LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE; 4382 } 4383 else if (c1 == 'I') { 4384 c2 = LATIN_SMALL_LETTER_DOTLESS_I; 4385 } 4386 else if (c1 == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) { 4387 c2 = 'i'; 4388 } 4389 else if (c1 == LATIN_SMALL_LETTER_DOTLESS_I) { 4390 c2 = 'I'; 4391 } 4392 } 4393 else if (c1 > 255) { 4394 const U32 * remaining_folds; 4395 U32 first_fold; 4396 4397 /* Look up what code points (besides c1) fold to c1; e.g., 4398 * [ 'K', KELVIN_SIGN ] both fold to 'k'. */ 4399 Size_t folds_count = _inverse_folds(c1, &first_fold, 4400 &remaining_folds); 4401 if (folds_count == 0) { 4402 c2 = c1; /* there is only a single character that could 4403 match */ 4404 } 4405 else if (folds_count != 1) { 4406 /* If there aren't exactly two folds to this (itself and 4407 * another), it is outside the scope of this function */ 4408 use_chrtest_void = TRUE; 4409 } 4410 else { /* There are two. We already have one, get the other */ 4411 c2 = first_fold; 4412 4413 /* Folds that cross the 255/256 boundary are forbidden if 4414 * EXACTFL (and isnt a UTF8 locale), or EXACTFAA and one is 4415 * ASCIII. The only other match to c1 is c2, and since c1 4416 * is above 255, c2 better be as well under these 4417 * circumstances. If it isn't, it means the only legal 4418 * match of c1 is itself. */ 4419 if ( c2 < 256 4420 && ( ( OP(text_node) == EXACTFL 4421 && ! IN_UTF8_CTYPE_LOCALE) 4422 || (( OP(text_node) == EXACTFAA 4423 || OP(text_node) == EXACTFAA_NO_TRIE) 4424 && (isASCII(c1) || isASCII(c2))))) 4425 { 4426 c2 = c1; 4427 } 4428 } 4429 } 4430 else /* Here, c1 is <= 255 */ 4431 if ( utf8_target 4432 && HAS_NONLATIN1_FOLD_CLOSURE(c1) 4433 && ( ! (OP(text_node) == EXACTFL && ! IN_UTF8_CTYPE_LOCALE)) 4434 && ( ( OP(text_node) != EXACTFAA 4435 && OP(text_node) != EXACTFAA_NO_TRIE) 4436 || ! isASCII(c1))) 4437 { 4438 /* Here, there could be something above Latin1 in the target 4439 * which folds to this character in the pattern. All such 4440 * cases except LATIN SMALL LETTER Y WITH DIAERESIS have more 4441 * than two characters involved in their folds, so are outside 4442 * the scope of this function */ 4443 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) { 4444 c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS; 4445 } 4446 else { 4447 use_chrtest_void = TRUE; 4448 } 4449 } 4450 else { /* Here nothing above Latin1 can fold to the pattern 4451 character */ 4452 switch (OP(text_node)) { 4453 4454 case EXACTFL: /* /l rules */ 4455 c2 = PL_fold_locale[c1]; 4456 break; 4457 4458 case EXACTF: /* This node only generated for non-utf8 4459 patterns */ 4460 assert(! is_utf8_pat); 4461 if (! utf8_target) { /* /d rules */ 4462 c2 = PL_fold[c1]; 4463 break; 4464 } 4465 /* FALLTHROUGH */ 4466 /* /u rules for all these. This happens to work for 4467 * EXACTFAA as nothing in Latin1 folds to ASCII */ 4468 case EXACTFAA_NO_TRIE: /* This node only generated for 4469 non-utf8 patterns */ 4470 assert(! is_utf8_pat); 4471 /* FALLTHROUGH */ 4472 case EXACTFAA: 4473 case EXACTFUP: 4474 case EXACTFU: 4475 c2 = PL_fold_latin1[c1]; 4476 break; 4477 case EXACTFU_REQ8: 4478 return FALSE; 4479 NOT_REACHED; /* NOTREACHED */ 4480 4481 default: 4482 Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node)); 4483 NOT_REACHED; /* NOTREACHED */ 4484 } 4485 } 4486 } 4487 } 4488 4489 /* Here have figured things out. Set up the returns */ 4490 if (use_chrtest_void) { 4491 *c2p = *c1p = CHRTEST_VOID; 4492 } 4493 else if (utf8_target) { 4494 if (! utf8_has_been_setup) { /* Don't have the utf8; must get it */ 4495 uvchr_to_utf8(c1_utf8, c1); 4496 uvchr_to_utf8(c2_utf8, c2); 4497 } 4498 4499 /* Invariants are stored in both the utf8 and byte outputs; Use 4500 * negative numbers otherwise for the byte ones. Make sure that the 4501 * byte ones are the same iff the utf8 ones are the same */ 4502 *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1; 4503 *c2p = (UTF8_IS_INVARIANT(*c2_utf8)) 4504 ? *c2_utf8 4505 : (c1 == c2) 4506 ? CHRTEST_NOT_A_CP_1 4507 : CHRTEST_NOT_A_CP_2; 4508 } 4509 else if (c1 > 255) { 4510 if (c2 > 255) { /* both possibilities are above what a non-utf8 string 4511 can represent */ 4512 return FALSE; 4513 } 4514 4515 *c1p = *c2p = c2; /* c2 is the only representable value */ 4516 } 4517 else { /* c1 is representable; see about c2 */ 4518 *c1p = c1; 4519 *c2p = (c2 < 256) ? c2 : c1; 4520 } 4521 4522 return TRUE; 4523 } 4524 4525 STATIC bool 4526 S_isGCB(pTHX_ const GCB_enum before, const GCB_enum after, const U8 * const strbeg, const U8 * const curpos, const bool utf8_target) 4527 { 4528 /* returns a boolean indicating if there is a Grapheme Cluster Boundary 4529 * between the inputs. See https://www.unicode.org/reports/tr29/. */ 4530 4531 PERL_ARGS_ASSERT_ISGCB; 4532 4533 switch (GCB_table[before][after]) { 4534 case GCB_BREAKABLE: 4535 return TRUE; 4536 4537 case GCB_NOBREAK: 4538 return FALSE; 4539 4540 case GCB_RI_then_RI: 4541 { 4542 int RI_count = 1; 4543 U8 * temp_pos = (U8 *) curpos; 4544 4545 /* Do not break within emoji flag sequences. That is, do not 4546 * break between regional indicator (RI) symbols if there is an 4547 * odd number of RI characters before the break point. 4548 * GB12 sot (RI RI)* RI × RI 4549 * GB13 [^RI] (RI RI)* RI × RI */ 4550 4551 while (backup_one_GCB(strbeg, 4552 &temp_pos, 4553 utf8_target) == GCB_Regional_Indicator) 4554 { 4555 RI_count++; 4556 } 4557 4558 return RI_count % 2 != 1; 4559 } 4560 4561 case GCB_EX_then_EM: 4562 4563 /* GB10 ( E_Base | E_Base_GAZ ) Extend* × E_Modifier */ 4564 { 4565 U8 * temp_pos = (U8 *) curpos; 4566 GCB_enum prev; 4567 4568 do { 4569 prev = backup_one_GCB(strbeg, &temp_pos, utf8_target); 4570 } 4571 while (prev == GCB_Extend); 4572 4573 return prev != GCB_E_Base && prev != GCB_E_Base_GAZ; 4574 } 4575 4576 case GCB_Maybe_Emoji_NonBreak: 4577 4578 { 4579 4580 /* Do not break within emoji modifier sequences or emoji zwj sequences. 4581 GB11 \p{Extended_Pictographic} Extend* ZWJ × \p{Extended_Pictographic} 4582 */ 4583 U8 * temp_pos = (U8 *) curpos; 4584 GCB_enum prev; 4585 4586 do { 4587 prev = backup_one_GCB(strbeg, &temp_pos, utf8_target); 4588 } 4589 while (prev == GCB_Extend); 4590 4591 return prev != GCB_ExtPict_XX; 4592 } 4593 4594 default: 4595 break; 4596 } 4597 4598 #ifdef DEBUGGING 4599 Perl_re_printf( aTHX_ "Unhandled GCB pair: GCB_table[%d, %d] = %d\n", 4600 before, after, GCB_table[before][after]); 4601 assert(0); 4602 #endif 4603 return TRUE; 4604 } 4605 4606 STATIC GCB_enum 4607 S_backup_one_GCB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target) 4608 { 4609 dVAR; 4610 GCB_enum gcb; 4611 4612 PERL_ARGS_ASSERT_BACKUP_ONE_GCB; 4613 4614 if (*curpos < strbeg) { 4615 return GCB_EDGE; 4616 } 4617 4618 if (utf8_target) { 4619 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg); 4620 U8 * prev_prev_char_pos; 4621 4622 if (! prev_char_pos) { 4623 return GCB_EDGE; 4624 } 4625 4626 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) { 4627 gcb = getGCB_VAL_UTF8(prev_prev_char_pos, prev_char_pos); 4628 *curpos = prev_char_pos; 4629 prev_char_pos = prev_prev_char_pos; 4630 } 4631 else { 4632 *curpos = (U8 *) strbeg; 4633 return GCB_EDGE; 4634 } 4635 } 4636 else { 4637 if (*curpos - 2 < strbeg) { 4638 *curpos = (U8 *) strbeg; 4639 return GCB_EDGE; 4640 } 4641 (*curpos)--; 4642 gcb = getGCB_VAL_CP(*(*curpos - 1)); 4643 } 4644 4645 return gcb; 4646 } 4647 4648 /* Combining marks attach to most classes that precede them, but this defines 4649 * the exceptions (from TR14) */ 4650 #define LB_CM_ATTACHES_TO(prev) ( ! ( prev == LB_EDGE \ 4651 || prev == LB_Mandatory_Break \ 4652 || prev == LB_Carriage_Return \ 4653 || prev == LB_Line_Feed \ 4654 || prev == LB_Next_Line \ 4655 || prev == LB_Space \ 4656 || prev == LB_ZWSpace)) 4657 4658 STATIC bool 4659 S_isLB(pTHX_ LB_enum before, 4660 LB_enum after, 4661 const U8 * const strbeg, 4662 const U8 * const curpos, 4663 const U8 * const strend, 4664 const bool utf8_target) 4665 { 4666 U8 * temp_pos = (U8 *) curpos; 4667 LB_enum prev = before; 4668 4669 /* Is the boundary between 'before' and 'after' line-breakable? 4670 * Most of this is just a table lookup of a generated table from Unicode 4671 * rules. But some rules require context to decide, and so have to be 4672 * implemented in code */ 4673 4674 PERL_ARGS_ASSERT_ISLB; 4675 4676 /* Rule numbers in the comments below are as of Unicode 9.0 */ 4677 4678 redo: 4679 before = prev; 4680 switch (LB_table[before][after]) { 4681 case LB_BREAKABLE: 4682 return TRUE; 4683 4684 case LB_NOBREAK: 4685 case LB_NOBREAK_EVEN_WITH_SP_BETWEEN: 4686 return FALSE; 4687 4688 case LB_SP_foo + LB_BREAKABLE: 4689 case LB_SP_foo + LB_NOBREAK: 4690 case LB_SP_foo + LB_NOBREAK_EVEN_WITH_SP_BETWEEN: 4691 4692 /* When we have something following a SP, we have to look at the 4693 * context in order to know what to do. 4694 * 4695 * SP SP should not reach here because LB7: Do not break before 4696 * spaces. (For two spaces in a row there is nothing that 4697 * overrides that) */ 4698 assert(after != LB_Space); 4699 4700 /* Here we have a space followed by a non-space. Mostly this is a 4701 * case of LB18: "Break after spaces". But there are complications 4702 * as the handling of spaces is somewhat tricky. They are in a 4703 * number of rules, which have to be applied in priority order, but 4704 * something earlier in the string can cause a rule to be skipped 4705 * and a lower priority rule invoked. A prime example is LB7 which 4706 * says don't break before a space. But rule LB8 (lower priority) 4707 * says that the first break opportunity after a ZW is after any 4708 * span of spaces immediately after it. If a ZW comes before a SP 4709 * in the input, rule LB8 applies, and not LB7. Other such rules 4710 * involve combining marks which are rules 9 and 10, but they may 4711 * override higher priority rules if they come earlier in the 4712 * string. Since we're doing random access into the middle of the 4713 * string, we have to look for rules that should get applied based 4714 * on both string position and priority. Combining marks do not 4715 * attach to either ZW nor SP, so we don't have to consider them 4716 * until later. 4717 * 4718 * To check for LB8, we have to find the first non-space character 4719 * before this span of spaces */ 4720 do { 4721 prev = backup_one_LB(strbeg, &temp_pos, utf8_target); 4722 } 4723 while (prev == LB_Space); 4724 4725 /* LB8 Break before any character following a zero-width space, 4726 * even if one or more spaces intervene. 4727 * ZW SP* ÷ 4728 * So if we have a ZW just before this span, and to get here this 4729 * is the final space in the span. */ 4730 if (prev == LB_ZWSpace) { 4731 return TRUE; 4732 } 4733 4734 /* Here, not ZW SP+. There are several rules that have higher 4735 * priority than LB18 and can be resolved now, as they don't depend 4736 * on anything earlier in the string (except ZW, which we have 4737 * already handled). One of these rules is LB11 Do not break 4738 * before Word joiner, but we have specially encoded that in the 4739 * lookup table so it is caught by the single test below which 4740 * catches the other ones. */ 4741 if (LB_table[LB_Space][after] - LB_SP_foo 4742 == LB_NOBREAK_EVEN_WITH_SP_BETWEEN) 4743 { 4744 return FALSE; 4745 } 4746 4747 /* If we get here, we have to XXX consider combining marks. */ 4748 if (prev == LB_Combining_Mark) { 4749 4750 /* What happens with these depends on the character they 4751 * follow. */ 4752 do { 4753 prev = backup_one_LB(strbeg, &temp_pos, utf8_target); 4754 } 4755 while (prev == LB_Combining_Mark); 4756 4757 /* Most times these attach to and inherit the characteristics 4758 * of that character, but not always, and when not, they are to 4759 * be treated as AL by rule LB10. */ 4760 if (! LB_CM_ATTACHES_TO(prev)) { 4761 prev = LB_Alphabetic; 4762 } 4763 } 4764 4765 /* Here, we have the character preceding the span of spaces all set 4766 * up. We follow LB18: "Break after spaces" unless the table shows 4767 * that is overriden */ 4768 return LB_table[prev][after] != LB_NOBREAK_EVEN_WITH_SP_BETWEEN; 4769 4770 case LB_CM_ZWJ_foo: 4771 4772 /* We don't know how to treat the CM except by looking at the first 4773 * non-CM character preceding it. ZWJ is treated as CM */ 4774 do { 4775 prev = backup_one_LB(strbeg, &temp_pos, utf8_target); 4776 } 4777 while (prev == LB_Combining_Mark || prev == LB_ZWJ); 4778 4779 /* Here, 'prev' is that first earlier non-CM character. If the CM 4780 * attatches to it, then it inherits the behavior of 'prev'. If it 4781 * doesn't attach, it is to be treated as an AL */ 4782 if (! LB_CM_ATTACHES_TO(prev)) { 4783 prev = LB_Alphabetic; 4784 } 4785 4786 goto redo; 4787 4788 case LB_HY_or_BA_then_foo + LB_BREAKABLE: 4789 case LB_HY_or_BA_then_foo + LB_NOBREAK: 4790 4791 /* LB21a Don't break after Hebrew + Hyphen. 4792 * HL (HY | BA) × */ 4793 4794 if (backup_one_LB(strbeg, &temp_pos, utf8_target) 4795 == LB_Hebrew_Letter) 4796 { 4797 return FALSE; 4798 } 4799 4800 return LB_table[prev][after] - LB_HY_or_BA_then_foo == LB_BREAKABLE; 4801 4802 case LB_PR_or_PO_then_OP_or_HY + LB_BREAKABLE: 4803 case LB_PR_or_PO_then_OP_or_HY + LB_NOBREAK: 4804 4805 /* LB25a (PR | PO) × ( OP | HY )? NU */ 4806 if (advance_one_LB(&temp_pos, strend, utf8_target) == LB_Numeric) { 4807 return FALSE; 4808 } 4809 4810 return LB_table[prev][after] - LB_PR_or_PO_then_OP_or_HY 4811 == LB_BREAKABLE; 4812 4813 case LB_SY_or_IS_then_various + LB_BREAKABLE: 4814 case LB_SY_or_IS_then_various + LB_NOBREAK: 4815 { 4816 /* LB25d NU (SY | IS)* × (NU | SY | IS | CL | CP ) */ 4817 4818 LB_enum temp = prev; 4819 do { 4820 temp = backup_one_LB(strbeg, &temp_pos, utf8_target); 4821 } 4822 while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric); 4823 if (temp == LB_Numeric) { 4824 return FALSE; 4825 } 4826 4827 return LB_table[prev][after] - LB_SY_or_IS_then_various 4828 == LB_BREAKABLE; 4829 } 4830 4831 case LB_various_then_PO_or_PR + LB_BREAKABLE: 4832 case LB_various_then_PO_or_PR + LB_NOBREAK: 4833 { 4834 /* LB25e NU (SY | IS)* (CL | CP)? × (PO | PR) */ 4835 4836 LB_enum temp = prev; 4837 if (temp == LB_Close_Punctuation || temp == LB_Close_Parenthesis) 4838 { 4839 temp = backup_one_LB(strbeg, &temp_pos, utf8_target); 4840 } 4841 while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric) { 4842 temp = backup_one_LB(strbeg, &temp_pos, utf8_target); 4843 } 4844 if (temp == LB_Numeric) { 4845 return FALSE; 4846 } 4847 return LB_various_then_PO_or_PR; 4848 } 4849 4850 case LB_RI_then_RI + LB_NOBREAK: 4851 case LB_RI_then_RI + LB_BREAKABLE: 4852 { 4853 int RI_count = 1; 4854 4855 /* LB30a Break between two regional indicator symbols if and 4856 * only if there are an even number of regional indicators 4857 * preceding the position of the break. 4858 * 4859 * sot (RI RI)* RI × RI 4860 * [^RI] (RI RI)* RI × RI */ 4861 4862 while (backup_one_LB(strbeg, 4863 &temp_pos, 4864 utf8_target) == LB_Regional_Indicator) 4865 { 4866 RI_count++; 4867 } 4868 4869 return RI_count % 2 == 0; 4870 } 4871 4872 default: 4873 break; 4874 } 4875 4876 #ifdef DEBUGGING 4877 Perl_re_printf( aTHX_ "Unhandled LB pair: LB_table[%d, %d] = %d\n", 4878 before, after, LB_table[before][after]); 4879 assert(0); 4880 #endif 4881 return TRUE; 4882 } 4883 4884 STATIC LB_enum 4885 S_advance_one_LB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target) 4886 { 4887 dVAR; 4888 4889 LB_enum lb; 4890 4891 PERL_ARGS_ASSERT_ADVANCE_ONE_LB; 4892 4893 if (*curpos >= strend) { 4894 return LB_EDGE; 4895 } 4896 4897 if (utf8_target) { 4898 *curpos += UTF8SKIP(*curpos); 4899 if (*curpos >= strend) { 4900 return LB_EDGE; 4901 } 4902 lb = getLB_VAL_UTF8(*curpos, strend); 4903 } 4904 else { 4905 (*curpos)++; 4906 if (*curpos >= strend) { 4907 return LB_EDGE; 4908 } 4909 lb = getLB_VAL_CP(**curpos); 4910 } 4911 4912 return lb; 4913 } 4914 4915 STATIC LB_enum 4916 S_backup_one_LB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target) 4917 { 4918 dVAR; 4919 LB_enum lb; 4920 4921 PERL_ARGS_ASSERT_BACKUP_ONE_LB; 4922 4923 if (*curpos < strbeg) { 4924 return LB_EDGE; 4925 } 4926 4927 if (utf8_target) { 4928 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg); 4929 U8 * prev_prev_char_pos; 4930 4931 if (! prev_char_pos) { 4932 return LB_EDGE; 4933 } 4934 4935 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) { 4936 lb = getLB_VAL_UTF8(prev_prev_char_pos, prev_char_pos); 4937 *curpos = prev_char_pos; 4938 prev_char_pos = prev_prev_char_pos; 4939 } 4940 else { 4941 *curpos = (U8 *) strbeg; 4942 return LB_EDGE; 4943 } 4944 } 4945 else { 4946 if (*curpos - 2 < strbeg) { 4947 *curpos = (U8 *) strbeg; 4948 return LB_EDGE; 4949 } 4950 (*curpos)--; 4951 lb = getLB_VAL_CP(*(*curpos - 1)); 4952 } 4953 4954 return lb; 4955 } 4956 4957 STATIC bool 4958 S_isSB(pTHX_ SB_enum before, 4959 SB_enum after, 4960 const U8 * const strbeg, 4961 const U8 * const curpos, 4962 const U8 * const strend, 4963 const bool utf8_target) 4964 { 4965 /* returns a boolean indicating if there is a Sentence Boundary Break 4966 * between the inputs. See https://www.unicode.org/reports/tr29/ */ 4967 4968 U8 * lpos = (U8 *) curpos; 4969 bool has_para_sep = FALSE; 4970 bool has_sp = FALSE; 4971 4972 PERL_ARGS_ASSERT_ISSB; 4973 4974 /* Break at the start and end of text. 4975 SB1. sot ÷ 4976 SB2. ÷ eot 4977 But unstated in Unicode is don't break if the text is empty */ 4978 if (before == SB_EDGE || after == SB_EDGE) { 4979 return before != after; 4980 } 4981 4982 /* SB 3: Do not break within CRLF. */ 4983 if (before == SB_CR && after == SB_LF) { 4984 return FALSE; 4985 } 4986 4987 /* Break after paragraph separators. CR and LF are considered 4988 * so because Unicode views text as like word processing text where there 4989 * are no newlines except between paragraphs, and the word processor takes 4990 * care of wrapping without there being hard line-breaks in the text *./ 4991 SB4. Sep | CR | LF ÷ */ 4992 if (before == SB_Sep || before == SB_CR || before == SB_LF) { 4993 return TRUE; 4994 } 4995 4996 /* Ignore Format and Extend characters, except after sot, Sep, CR, or LF. 4997 * (See Section 6.2, Replacing Ignore Rules.) 4998 SB5. X (Extend | Format)* → X */ 4999 if (after == SB_Extend || after == SB_Format) { 5000 5001 /* Implied is that the these characters attach to everything 5002 * immediately prior to them except for those separator-type 5003 * characters. And the rules earlier have already handled the case 5004 * when one of those immediately precedes the extend char */ 5005 return FALSE; 5006 } 5007 5008 if (before == SB_Extend || before == SB_Format) { 5009 U8 * temp_pos = lpos; 5010 const SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target); 5011 if ( backup != SB_EDGE 5012 && backup != SB_Sep 5013 && backup != SB_CR 5014 && backup != SB_LF) 5015 { 5016 before = backup; 5017 lpos = temp_pos; 5018 } 5019 5020 /* Here, both 'before' and 'backup' are these types; implied is that we 5021 * don't break between them */ 5022 if (backup == SB_Extend || backup == SB_Format) { 5023 return FALSE; 5024 } 5025 } 5026 5027 /* Do not break after ambiguous terminators like period, if they are 5028 * immediately followed by a number or lowercase letter, if they are 5029 * between uppercase letters, if the first following letter (optionally 5030 * after certain punctuation) is lowercase, or if they are followed by 5031 * "continuation" punctuation such as comma, colon, or semicolon. For 5032 * example, a period may be an abbreviation or numeric period, and thus may 5033 * not mark the end of a sentence. 5034 5035 * SB6. ATerm × Numeric */ 5036 if (before == SB_ATerm && after == SB_Numeric) { 5037 return FALSE; 5038 } 5039 5040 /* SB7. (Upper | Lower) ATerm × Upper */ 5041 if (before == SB_ATerm && after == SB_Upper) { 5042 U8 * temp_pos = lpos; 5043 SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target); 5044 if (backup == SB_Upper || backup == SB_Lower) { 5045 return FALSE; 5046 } 5047 } 5048 5049 /* The remaining rules that aren't the final one, all require an STerm or 5050 * an ATerm after having backed up over some Close* Sp*, and in one case an 5051 * optional Paragraph separator, although one rule doesn't have any Sp's in it. 5052 * So do that backup now, setting flags if either Sp or a paragraph 5053 * separator are found */ 5054 5055 if (before == SB_Sep || before == SB_CR || before == SB_LF) { 5056 has_para_sep = TRUE; 5057 before = backup_one_SB(strbeg, &lpos, utf8_target); 5058 } 5059 5060 if (before == SB_Sp) { 5061 has_sp = TRUE; 5062 do { 5063 before = backup_one_SB(strbeg, &lpos, utf8_target); 5064 } 5065 while (before == SB_Sp); 5066 } 5067 5068 while (before == SB_Close) { 5069 before = backup_one_SB(strbeg, &lpos, utf8_target); 5070 } 5071 5072 /* The next few rules apply only when the backed-up-to is an ATerm, and in 5073 * most cases an STerm */ 5074 if (before == SB_STerm || before == SB_ATerm) { 5075 5076 /* So, here the lhs matches 5077 * (STerm | ATerm) Close* Sp* (Sep | CR | LF)? 5078 * and we have set flags if we found an Sp, or the optional Sep,CR,LF. 5079 * The rules that apply here are: 5080 * 5081 * SB8 ATerm Close* Sp* × ( ¬(OLetter | Upper | Lower | Sep | CR 5082 | LF | STerm | ATerm) )* Lower 5083 SB8a (STerm | ATerm) Close* Sp* × (SContinue | STerm | ATerm) 5084 SB9 (STerm | ATerm) Close* × (Close | Sp | Sep | CR | LF) 5085 SB10 (STerm | ATerm) Close* Sp* × (Sp | Sep | CR | LF) 5086 SB11 (STerm | ATerm) Close* Sp* (Sep | CR | LF)? ÷ 5087 */ 5088 5089 /* And all but SB11 forbid having seen a paragraph separator */ 5090 if (! has_para_sep) { 5091 if (before == SB_ATerm) { /* SB8 */ 5092 U8 * rpos = (U8 *) curpos; 5093 SB_enum later = after; 5094 5095 while ( later != SB_OLetter 5096 && later != SB_Upper 5097 && later != SB_Lower 5098 && later != SB_Sep 5099 && later != SB_CR 5100 && later != SB_LF 5101 && later != SB_STerm 5102 && later != SB_ATerm 5103 && later != SB_EDGE) 5104 { 5105 later = advance_one_SB(&rpos, strend, utf8_target); 5106 } 5107 if (later == SB_Lower) { 5108 return FALSE; 5109 } 5110 } 5111 5112 if ( after == SB_SContinue /* SB8a */ 5113 || after == SB_STerm 5114 || after == SB_ATerm) 5115 { 5116 return FALSE; 5117 } 5118 5119 if (! has_sp) { /* SB9 applies only if there was no Sp* */ 5120 if ( after == SB_Close 5121 || after == SB_Sp 5122 || after == SB_Sep 5123 || after == SB_CR 5124 || after == SB_LF) 5125 { 5126 return FALSE; 5127 } 5128 } 5129 5130 /* SB10. This and SB9 could probably be combined some way, but khw 5131 * has decided to follow the Unicode rule book precisely for 5132 * simplified maintenance */ 5133 if ( after == SB_Sp 5134 || after == SB_Sep 5135 || after == SB_CR 5136 || after == SB_LF) 5137 { 5138 return FALSE; 5139 } 5140 } 5141 5142 /* SB11. */ 5143 return TRUE; 5144 } 5145 5146 /* Otherwise, do not break. 5147 SB12. Any × Any */ 5148 5149 return FALSE; 5150 } 5151 5152 STATIC SB_enum 5153 S_advance_one_SB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target) 5154 { 5155 dVAR; 5156 SB_enum sb; 5157 5158 PERL_ARGS_ASSERT_ADVANCE_ONE_SB; 5159 5160 if (*curpos >= strend) { 5161 return SB_EDGE; 5162 } 5163 5164 if (utf8_target) { 5165 do { 5166 *curpos += UTF8SKIP(*curpos); 5167 if (*curpos >= strend) { 5168 return SB_EDGE; 5169 } 5170 sb = getSB_VAL_UTF8(*curpos, strend); 5171 } while (sb == SB_Extend || sb == SB_Format); 5172 } 5173 else { 5174 do { 5175 (*curpos)++; 5176 if (*curpos >= strend) { 5177 return SB_EDGE; 5178 } 5179 sb = getSB_VAL_CP(**curpos); 5180 } while (sb == SB_Extend || sb == SB_Format); 5181 } 5182 5183 return sb; 5184 } 5185 5186 STATIC SB_enum 5187 S_backup_one_SB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target) 5188 { 5189 dVAR; 5190 SB_enum sb; 5191 5192 PERL_ARGS_ASSERT_BACKUP_ONE_SB; 5193 5194 if (*curpos < strbeg) { 5195 return SB_EDGE; 5196 } 5197 5198 if (utf8_target) { 5199 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg); 5200 if (! prev_char_pos) { 5201 return SB_EDGE; 5202 } 5203 5204 /* Back up over Extend and Format. curpos is always just to the right 5205 * of the characater whose value we are getting */ 5206 do { 5207 U8 * prev_prev_char_pos; 5208 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, 5209 strbeg))) 5210 { 5211 sb = getSB_VAL_UTF8(prev_prev_char_pos, prev_char_pos); 5212 *curpos = prev_char_pos; 5213 prev_char_pos = prev_prev_char_pos; 5214 } 5215 else { 5216 *curpos = (U8 *) strbeg; 5217 return SB_EDGE; 5218 } 5219 } while (sb == SB_Extend || sb == SB_Format); 5220 } 5221 else { 5222 do { 5223 if (*curpos - 2 < strbeg) { 5224 *curpos = (U8 *) strbeg; 5225 return SB_EDGE; 5226 } 5227 (*curpos)--; 5228 sb = getSB_VAL_CP(*(*curpos - 1)); 5229 } while (sb == SB_Extend || sb == SB_Format); 5230 } 5231 5232 return sb; 5233 } 5234 5235 STATIC bool 5236 S_isWB(pTHX_ WB_enum previous, 5237 WB_enum before, 5238 WB_enum after, 5239 const U8 * const strbeg, 5240 const U8 * const curpos, 5241 const U8 * const strend, 5242 const bool utf8_target) 5243 { 5244 /* Return a boolean as to if the boundary between 'before' and 'after' is 5245 * a Unicode word break, using their published algorithm, but tailored for 5246 * Perl by treating spans of white space as one unit. Context may be 5247 * needed to make this determination. If the value for the character 5248 * before 'before' is known, it is passed as 'previous'; otherwise that 5249 * should be set to WB_UNKNOWN. The other input parameters give the 5250 * boundaries and current position in the matching of the string. That 5251 * is, 'curpos' marks the position where the character whose wb value is 5252 * 'after' begins. See http://www.unicode.org/reports/tr29/ */ 5253 5254 U8 * before_pos = (U8 *) curpos; 5255 U8 * after_pos = (U8 *) curpos; 5256 WB_enum prev = before; 5257 WB_enum next; 5258 5259 PERL_ARGS_ASSERT_ISWB; 5260 5261 /* Rule numbers in the comments below are as of Unicode 9.0 */ 5262 5263 redo: 5264 before = prev; 5265 switch (WB_table[before][after]) { 5266 case WB_BREAKABLE: 5267 return TRUE; 5268 5269 case WB_NOBREAK: 5270 return FALSE; 5271 5272 case WB_hs_then_hs: /* 2 horizontal spaces in a row */ 5273 next = advance_one_WB(&after_pos, strend, utf8_target, 5274 FALSE /* Don't skip Extend nor Format */ ); 5275 /* A space immediately preceeding an Extend or Format is attached 5276 * to by them, and hence gets separated from previous spaces. 5277 * Otherwise don't break between horizontal white space */ 5278 return next == WB_Extend || next == WB_Format; 5279 5280 /* WB4 Ignore Format and Extend characters, except when they appear at 5281 * the beginning of a region of text. This code currently isn't 5282 * general purpose, but it works as the rules are currently and likely 5283 * to be laid out. The reason it works is that when 'they appear at 5284 * the beginning of a region of text', the rule is to break before 5285 * them, just like any other character. Therefore, the default rule 5286 * applies and we don't have to look in more depth. Should this ever 5287 * change, we would have to have 2 'case' statements, like in the rules 5288 * below, and backup a single character (not spacing over the extend 5289 * ones) and then see if that is one of the region-end characters and 5290 * go from there */ 5291 case WB_Ex_or_FO_or_ZWJ_then_foo: 5292 prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target); 5293 goto redo; 5294 5295 case WB_DQ_then_HL + WB_BREAKABLE: 5296 case WB_DQ_then_HL + WB_NOBREAK: 5297 5298 /* WB7c Hebrew_Letter Double_Quote × Hebrew_Letter */ 5299 5300 if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target) 5301 == WB_Hebrew_Letter) 5302 { 5303 return FALSE; 5304 } 5305 5306 return WB_table[before][after] - WB_DQ_then_HL == WB_BREAKABLE; 5307 5308 case WB_HL_then_DQ + WB_BREAKABLE: 5309 case WB_HL_then_DQ + WB_NOBREAK: 5310 5311 /* WB7b Hebrew_Letter × Double_Quote Hebrew_Letter */ 5312 5313 if (advance_one_WB(&after_pos, strend, utf8_target, 5314 TRUE /* Do skip Extend and Format */ ) 5315 == WB_Hebrew_Letter) 5316 { 5317 return FALSE; 5318 } 5319 5320 return WB_table[before][after] - WB_HL_then_DQ == WB_BREAKABLE; 5321 5322 case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_NOBREAK: 5323 case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_BREAKABLE: 5324 5325 /* WB6 (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet 5326 * | Single_Quote) (ALetter | Hebrew_Letter) */ 5327 5328 next = advance_one_WB(&after_pos, strend, utf8_target, 5329 TRUE /* Do skip Extend and Format */ ); 5330 5331 if (next == WB_ALetter || next == WB_Hebrew_Letter) 5332 { 5333 return FALSE; 5334 } 5335 5336 return WB_table[before][after] 5337 - WB_LE_or_HL_then_MB_or_ML_or_SQ == WB_BREAKABLE; 5338 5339 case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_NOBREAK: 5340 case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_BREAKABLE: 5341 5342 /* WB7 (ALetter | Hebrew_Letter) (MidLetter | MidNumLet 5343 * | Single_Quote) × (ALetter | Hebrew_Letter) */ 5344 5345 prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target); 5346 if (prev == WB_ALetter || prev == WB_Hebrew_Letter) 5347 { 5348 return FALSE; 5349 } 5350 5351 return WB_table[before][after] 5352 - WB_MB_or_ML_or_SQ_then_LE_or_HL == WB_BREAKABLE; 5353 5354 case WB_MB_or_MN_or_SQ_then_NU + WB_NOBREAK: 5355 case WB_MB_or_MN_or_SQ_then_NU + WB_BREAKABLE: 5356 5357 /* WB11 Numeric (MidNum | (MidNumLet | Single_Quote)) × Numeric 5358 * */ 5359 5360 if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target) 5361 == WB_Numeric) 5362 { 5363 return FALSE; 5364 } 5365 5366 return WB_table[before][after] 5367 - WB_MB_or_MN_or_SQ_then_NU == WB_BREAKABLE; 5368 5369 case WB_NU_then_MB_or_MN_or_SQ + WB_NOBREAK: 5370 case WB_NU_then_MB_or_MN_or_SQ + WB_BREAKABLE: 5371 5372 /* WB12 Numeric × (MidNum | MidNumLet | Single_Quote) Numeric */ 5373 5374 if (advance_one_WB(&after_pos, strend, utf8_target, 5375 TRUE /* Do skip Extend and Format */ ) 5376 == WB_Numeric) 5377 { 5378 return FALSE; 5379 } 5380 5381 return WB_table[before][after] 5382 - WB_NU_then_MB_or_MN_or_SQ == WB_BREAKABLE; 5383 5384 case WB_RI_then_RI + WB_NOBREAK: 5385 case WB_RI_then_RI + WB_BREAKABLE: 5386 { 5387 int RI_count = 1; 5388 5389 /* Do not break within emoji flag sequences. That is, do not 5390 * break between regional indicator (RI) symbols if there is an 5391 * odd number of RI characters before the potential break 5392 * point. 5393 * 5394 * WB15 sot (RI RI)* RI × RI 5395 * WB16 [^RI] (RI RI)* RI × RI */ 5396 5397 while (backup_one_WB(&previous, 5398 strbeg, 5399 &before_pos, 5400 utf8_target) == WB_Regional_Indicator) 5401 { 5402 RI_count++; 5403 } 5404 5405 return RI_count % 2 != 1; 5406 } 5407 5408 default: 5409 break; 5410 } 5411 5412 #ifdef DEBUGGING 5413 Perl_re_printf( aTHX_ "Unhandled WB pair: WB_table[%d, %d] = %d\n", 5414 before, after, WB_table[before][after]); 5415 assert(0); 5416 #endif 5417 return TRUE; 5418 } 5419 5420 STATIC WB_enum 5421 S_advance_one_WB(pTHX_ U8 ** curpos, 5422 const U8 * const strend, 5423 const bool utf8_target, 5424 const bool skip_Extend_Format) 5425 { 5426 dVAR; 5427 WB_enum wb; 5428 5429 PERL_ARGS_ASSERT_ADVANCE_ONE_WB; 5430 5431 if (*curpos >= strend) { 5432 return WB_EDGE; 5433 } 5434 5435 if (utf8_target) { 5436 5437 /* Advance over Extend and Format */ 5438 do { 5439 *curpos += UTF8SKIP(*curpos); 5440 if (*curpos >= strend) { 5441 return WB_EDGE; 5442 } 5443 wb = getWB_VAL_UTF8(*curpos, strend); 5444 } while ( skip_Extend_Format 5445 && (wb == WB_Extend || wb == WB_Format)); 5446 } 5447 else { 5448 do { 5449 (*curpos)++; 5450 if (*curpos >= strend) { 5451 return WB_EDGE; 5452 } 5453 wb = getWB_VAL_CP(**curpos); 5454 } while ( skip_Extend_Format 5455 && (wb == WB_Extend || wb == WB_Format)); 5456 } 5457 5458 return wb; 5459 } 5460 5461 STATIC WB_enum 5462 S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos, const bool utf8_target) 5463 { 5464 dVAR; 5465 WB_enum wb; 5466 5467 PERL_ARGS_ASSERT_BACKUP_ONE_WB; 5468 5469 /* If we know what the previous character's break value is, don't have 5470 * to look it up */ 5471 if (*previous != WB_UNKNOWN) { 5472 wb = *previous; 5473 5474 /* But we need to move backwards by one */ 5475 if (utf8_target) { 5476 *curpos = reghopmaybe3(*curpos, -1, strbeg); 5477 if (! *curpos) { 5478 *previous = WB_EDGE; 5479 *curpos = (U8 *) strbeg; 5480 } 5481 else { 5482 *previous = WB_UNKNOWN; 5483 } 5484 } 5485 else { 5486 (*curpos)--; 5487 *previous = (*curpos <= strbeg) ? WB_EDGE : WB_UNKNOWN; 5488 } 5489 5490 /* And we always back up over these three types */ 5491 if (wb != WB_Extend && wb != WB_Format && wb != WB_ZWJ) { 5492 return wb; 5493 } 5494 } 5495 5496 if (*curpos < strbeg) { 5497 return WB_EDGE; 5498 } 5499 5500 if (utf8_target) { 5501 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg); 5502 if (! prev_char_pos) { 5503 return WB_EDGE; 5504 } 5505 5506 /* Back up over Extend and Format. curpos is always just to the right 5507 * of the characater whose value we are getting */ 5508 do { 5509 U8 * prev_prev_char_pos; 5510 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, 5511 -1, 5512 strbeg))) 5513 { 5514 wb = getWB_VAL_UTF8(prev_prev_char_pos, prev_char_pos); 5515 *curpos = prev_char_pos; 5516 prev_char_pos = prev_prev_char_pos; 5517 } 5518 else { 5519 *curpos = (U8 *) strbeg; 5520 return WB_EDGE; 5521 } 5522 } while (wb == WB_Extend || wb == WB_Format || wb == WB_ZWJ); 5523 } 5524 else { 5525 do { 5526 if (*curpos - 2 < strbeg) { 5527 *curpos = (U8 *) strbeg; 5528 return WB_EDGE; 5529 } 5530 (*curpos)--; 5531 wb = getWB_VAL_CP(*(*curpos - 1)); 5532 } while (wb == WB_Extend || wb == WB_Format); 5533 } 5534 5535 return wb; 5536 } 5537 5538 /* Macros for regmatch(), using its internal variables */ 5539 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */ 5540 #define NEXTCHR_IS_EOS (nextchr < 0) 5541 5542 #define SET_nextchr \ 5543 nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS) 5544 5545 #define SET_locinput(p) \ 5546 locinput = (p); \ 5547 SET_nextchr 5548 5549 #define sayYES goto yes 5550 #define sayNO goto no 5551 #define sayNO_SILENT goto no_silent 5552 5553 /* we dont use STMT_START/END here because it leads to 5554 "unreachable code" warnings, which are bogus, but distracting. */ 5555 #define CACHEsayNO \ 5556 if (ST.cache_mask) \ 5557 reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \ 5558 sayNO 5559 5560 #define EVAL_CLOSE_PAREN_IS(st,expr) \ 5561 ( \ 5562 ( ( st ) ) && \ 5563 ( ( st )->u.eval.close_paren ) && \ 5564 ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) ) \ 5565 ) 5566 5567 #define EVAL_CLOSE_PAREN_IS_TRUE(st,expr) \ 5568 ( \ 5569 ( ( st ) ) && \ 5570 ( ( st )->u.eval.close_paren ) && \ 5571 ( ( expr ) ) && \ 5572 ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) ) \ 5573 ) 5574 5575 5576 #define EVAL_CLOSE_PAREN_SET(st,expr) \ 5577 (st)->u.eval.close_paren = ( (expr) + 1 ) 5578 5579 #define EVAL_CLOSE_PAREN_CLEAR(st) \ 5580 (st)->u.eval.close_paren = 0 5581 5582 /* push a new state then goto it */ 5583 5584 #define PUSH_STATE_GOTO(state, node, input, eol, sr0) \ 5585 pushinput = input; \ 5586 pusheol = eol; \ 5587 pushsr0 = sr0; \ 5588 scan = node; \ 5589 st->resume_state = state; \ 5590 goto push_state; 5591 5592 /* push a new state with success backtracking, then goto it */ 5593 5594 #define PUSH_YES_STATE_GOTO(state, node, input, eol, sr0) \ 5595 pushinput = input; \ 5596 pusheol = eol; \ 5597 pushsr0 = sr0; \ 5598 scan = node; \ 5599 st->resume_state = state; \ 5600 goto push_yes_state; 5601 5602 #define DEBUG_STATE_pp(pp) \ 5603 DEBUG_STATE_r({ \ 5604 DUMP_EXEC_POS(locinput, scan, utf8_target,depth); \ 5605 Perl_re_printf( aTHX_ \ 5606 "%*s" pp " %s%s%s%s%s\n", \ 5607 INDENT_CHARS(depth), "", \ 5608 PL_reg_name[st->resume_state], \ 5609 ((st==yes_state||st==mark_state) ? "[" : ""), \ 5610 ((st==yes_state) ? "Y" : ""), \ 5611 ((st==mark_state) ? "M" : ""), \ 5612 ((st==yes_state||st==mark_state) ? "]" : "") \ 5613 ); \ 5614 }); 5615 5616 /* 5617 5618 regmatch() - main matching routine 5619 5620 This is basically one big switch statement in a loop. We execute an op, 5621 set 'next' to point the next op, and continue. If we come to a point which 5622 we may need to backtrack to on failure such as (A|B|C), we push a 5623 backtrack state onto the backtrack stack. On failure, we pop the top 5624 state, and re-enter the loop at the state indicated. If there are no more 5625 states to pop, we return failure. 5626 5627 Sometimes we also need to backtrack on success; for example /A+/, where 5628 after successfully matching one A, we need to go back and try to 5629 match another one; similarly for lookahead assertions: if the assertion 5630 completes successfully, we backtrack to the state just before the assertion 5631 and then carry on. In these cases, the pushed state is marked as 5632 'backtrack on success too'. This marking is in fact done by a chain of 5633 pointers, each pointing to the previous 'yes' state. On success, we pop to 5634 the nearest yes state, discarding any intermediate failure-only states. 5635 Sometimes a yes state is pushed just to force some cleanup code to be 5636 called at the end of a successful match or submatch; e.g. (??{$re}) uses 5637 it to free the inner regex. 5638 5639 Note that failure backtracking rewinds the cursor position, while 5640 success backtracking leaves it alone. 5641 5642 A pattern is complete when the END op is executed, while a subpattern 5643 such as (?=foo) is complete when the SUCCESS op is executed. Both of these 5644 ops trigger the "pop to last yes state if any, otherwise return true" 5645 behaviour. 5646 5647 A common convention in this function is to use A and B to refer to the two 5648 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is 5649 the subpattern to be matched possibly multiple times, while B is the entire 5650 rest of the pattern. Variable and state names reflect this convention. 5651 5652 The states in the main switch are the union of ops and failure/success of 5653 substates associated with that op. For example, IFMATCH is the op 5654 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means 5655 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just 5656 successfully matched A and IFMATCH_A_fail is a state saying that we have 5657 just failed to match A. Resume states always come in pairs. The backtrack 5658 state we push is marked as 'IFMATCH_A', but when that is popped, we resume 5659 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking 5660 on success or failure. 5661 5662 The struct that holds a backtracking state is actually a big union, with 5663 one variant for each major type of op. The variable st points to the 5664 top-most backtrack struct. To make the code clearer, within each 5665 block of code we #define ST to alias the relevant union. 5666 5667 Here's a concrete example of a (vastly oversimplified) IFMATCH 5668 implementation: 5669 5670 switch (state) { 5671 .... 5672 5673 #define ST st->u.ifmatch 5674 5675 case IFMATCH: // we are executing the IFMATCH op, (?=A)B 5676 ST.foo = ...; // some state we wish to save 5677 ... 5678 // push a yes backtrack state with a resume value of 5679 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the 5680 // first node of A: 5681 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput); 5682 // NOTREACHED 5683 5684 case IFMATCH_A: // we have successfully executed A; now continue with B 5685 next = B; 5686 bar = ST.foo; // do something with the preserved value 5687 break; 5688 5689 case IFMATCH_A_fail: // A failed, so the assertion failed 5690 ...; // do some housekeeping, then ... 5691 sayNO; // propagate the failure 5692 5693 #undef ST 5694 5695 ... 5696 } 5697 5698 For any old-timers reading this who are familiar with the old recursive 5699 approach, the code above is equivalent to: 5700 5701 case IFMATCH: // we are executing the IFMATCH op, (?=A)B 5702 { 5703 int foo = ... 5704 ... 5705 if (regmatch(A)) { 5706 next = B; 5707 bar = foo; 5708 break; 5709 } 5710 ...; // do some housekeeping, then ... 5711 sayNO; // propagate the failure 5712 } 5713 5714 The topmost backtrack state, pointed to by st, is usually free. If you 5715 want to claim it, populate any ST.foo fields in it with values you wish to 5716 save, then do one of 5717 5718 PUSH_STATE_GOTO(resume_state, node, newinput, new_eol); 5719 PUSH_YES_STATE_GOTO(resume_state, node, newinput, new_eol); 5720 5721 which sets that backtrack state's resume value to 'resume_state', pushes a 5722 new free entry to the top of the backtrack stack, then goes to 'node'. 5723 On backtracking, the free slot is popped, and the saved state becomes the 5724 new free state. An ST.foo field in this new top state can be temporarily 5725 accessed to retrieve values, but once the main loop is re-entered, it 5726 becomes available for reuse. 5727 5728 Note that the depth of the backtrack stack constantly increases during the 5729 left-to-right execution of the pattern, rather than going up and down with 5730 the pattern nesting. For example the stack is at its maximum at Z at the 5731 end of the pattern, rather than at X in the following: 5732 5733 /(((X)+)+)+....(Y)+....Z/ 5734 5735 The only exceptions to this are lookahead/behind assertions and the cut, 5736 (?>A), which pop all the backtrack states associated with A before 5737 continuing. 5738 5739 Backtrack state structs are allocated in slabs of about 4K in size. 5740 PL_regmatch_state and st always point to the currently active state, 5741 and PL_regmatch_slab points to the slab currently containing 5742 PL_regmatch_state. The first time regmatch() is called, the first slab is 5743 allocated, and is never freed until interpreter destruction. When the slab 5744 is full, a new one is allocated and chained to the end. At exit from 5745 regmatch(), slabs allocated since entry are freed. 5746 5747 In order to work with variable length lookbehinds, an upper limit is placed on 5748 lookbehinds which is set to where the match position is at the end of where the 5749 lookbehind would get to. Nothing in the lookbehind should match above that, 5750 except we should be able to look beyond if for things like \b, which need the 5751 next character in the string to be able to determine if this is a boundary or 5752 not. We also can't match the end of string/line unless we are also at the end 5753 of the entire string, so NEXTCHR_IS_EOS remains the same, and for those OPs 5754 that match a width, we have to add a condition that they are within the legal 5755 bounds of our window into the string. 5756 5757 */ 5758 5759 /* returns -1 on failure, $+[0] on success */ 5760 STATIC SSize_t 5761 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) 5762 { 5763 dVAR; 5764 const bool utf8_target = reginfo->is_utf8_target; 5765 const U32 uniflags = UTF8_ALLOW_DEFAULT; 5766 REGEXP *rex_sv = reginfo->prog; 5767 regexp *rex = ReANY(rex_sv); 5768 RXi_GET_DECL(rex,rexi); 5769 /* the current state. This is a cached copy of PL_regmatch_state */ 5770 regmatch_state *st; 5771 /* cache heavy used fields of st in registers */ 5772 regnode *scan; 5773 regnode *next; 5774 U32 n = 0; /* general value; init to avoid compiler warning */ 5775 SSize_t ln = 0; /* len or last; init to avoid compiler warning */ 5776 SSize_t endref = 0; /* offset of end of backref when ln is start */ 5777 char *locinput = startpos; 5778 char *loceol = reginfo->strend; 5779 char *pushinput; /* where to continue after a PUSH */ 5780 char *pusheol; /* where to stop matching (loceol) after a PUSH */ 5781 U8 *pushsr0; /* save starting pos of script run */ 5782 I32 nextchr; /* is always set to UCHARAT(locinput), or -1 at EOS */ 5783 5784 bool result = 0; /* return value of S_regmatch */ 5785 U32 depth = 0; /* depth of backtrack stack */ 5786 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */ 5787 const U32 max_nochange_depth = 5788 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ? 5789 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH; 5790 regmatch_state *yes_state = NULL; /* state to pop to on success of 5791 subpattern */ 5792 /* mark_state piggy backs on the yes_state logic so that when we unwind 5793 the stack on success we can update the mark_state as we go */ 5794 regmatch_state *mark_state = NULL; /* last mark state we have seen */ 5795 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */ 5796 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */ 5797 U32 state_num; 5798 bool no_final = 0; /* prevent failure from backtracking? */ 5799 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */ 5800 char *startpoint = locinput; 5801 SV *popmark = NULL; /* are we looking for a mark? */ 5802 SV *sv_commit = NULL; /* last mark name seen in failure */ 5803 SV *sv_yes_mark = NULL; /* last mark name we have seen 5804 during a successful match */ 5805 U32 lastopen = 0; /* last open we saw */ 5806 bool has_cutgroup = RXp_HAS_CUTGROUP(rex) ? 1 : 0; 5807 SV* const oreplsv = GvSVn(PL_replgv); 5808 /* these three flags are set by various ops to signal information to 5809 * the very next op. They have a useful lifetime of exactly one loop 5810 * iteration, and are not preserved or restored by state pushes/pops 5811 */ 5812 bool sw = 0; /* the condition value in (?(cond)a|b) */ 5813 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */ 5814 int logical = 0; /* the following EVAL is: 5815 0: (?{...}) 5816 1: (?(?{...})X|Y) 5817 2: (??{...}) 5818 or the following IFMATCH/UNLESSM is: 5819 false: plain (?=foo) 5820 true: used as a condition: (?(?=foo)) 5821 */ 5822 PAD* last_pad = NULL; 5823 dMULTICALL; 5824 U8 gimme = G_SCALAR; 5825 CV *caller_cv = NULL; /* who called us */ 5826 CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */ 5827 U32 maxopenparen = 0; /* max '(' index seen so far */ 5828 int to_complement; /* Invert the result? */ 5829 _char_class_number classnum; 5830 bool is_utf8_pat = reginfo->is_utf8_pat; 5831 bool match = FALSE; 5832 I32 orig_savestack_ix = PL_savestack_ix; 5833 U8 * script_run_begin = NULL; 5834 5835 /* Solaris Studio 12.3 messes up fetching PL_charclass['\n'] */ 5836 #if (defined(__SUNPRO_C) && (__SUNPRO_C == 0x5120) && defined(__x86_64) && defined(USE_64_BIT_ALL)) 5837 # define SOLARIS_BAD_OPTIMIZER 5838 const U32 *pl_charclass_dup = PL_charclass; 5839 # define PL_charclass pl_charclass_dup 5840 #endif 5841 5842 #ifdef DEBUGGING 5843 DECLARE_AND_GET_RE_DEBUG_FLAGS; 5844 #endif 5845 5846 /* protect against undef(*^R) */ 5847 SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv)); 5848 5849 /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */ 5850 multicall_oldcatch = 0; 5851 PERL_UNUSED_VAR(multicall_cop); 5852 5853 PERL_ARGS_ASSERT_REGMATCH; 5854 5855 st = PL_regmatch_state; 5856 5857 /* Note that nextchr is a byte even in UTF */ 5858 SET_nextchr; 5859 scan = prog; 5860 5861 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({ 5862 DUMP_EXEC_POS( locinput, scan, utf8_target, depth ); 5863 Perl_re_printf( aTHX_ "regmatch start\n" ); 5864 })); 5865 5866 while (scan != NULL) { 5867 next = scan + NEXT_OFF(scan); 5868 if (next == scan) 5869 next = NULL; 5870 state_num = OP(scan); 5871 5872 reenter_switch: 5873 DEBUG_EXECUTE_r( 5874 if (state_num <= REGNODE_MAX) { 5875 SV * const prop = sv_newmortal(); 5876 regnode *rnext = regnext(scan); 5877 5878 DUMP_EXEC_POS( locinput, scan, utf8_target, depth ); 5879 regprop(rex, prop, scan, reginfo, NULL); 5880 Perl_re_printf( aTHX_ 5881 "%*s%" IVdf ":%s(%" IVdf ")\n", 5882 INDENT_CHARS(depth), "", 5883 (IV)(scan - rexi->program), 5884 SvPVX_const(prop), 5885 (PL_regkind[OP(scan)] == END || !rnext) ? 5886 0 : (IV)(rnext - rexi->program)); 5887 } 5888 ); 5889 5890 to_complement = 0; 5891 5892 SET_nextchr; 5893 assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS)); 5894 5895 switch (state_num) { 5896 case SBOL: /* /^../ and /\A../ */ 5897 if (locinput == reginfo->strbeg) 5898 break; 5899 sayNO; 5900 5901 case MBOL: /* /^../m */ 5902 if (locinput == reginfo->strbeg || 5903 (!NEXTCHR_IS_EOS && locinput[-1] == '\n')) 5904 { 5905 break; 5906 } 5907 sayNO; 5908 5909 case GPOS: /* \G */ 5910 if (locinput == reginfo->ganch) 5911 break; 5912 sayNO; 5913 5914 case KEEPS: /* \K */ 5915 /* update the startpoint */ 5916 st->u.keeper.val = rex->offs[0].start; 5917 rex->offs[0].start = locinput - reginfo->strbeg; 5918 PUSH_STATE_GOTO(KEEPS_next, next, locinput, loceol, 5919 script_run_begin); 5920 NOT_REACHED; /* NOTREACHED */ 5921 5922 case KEEPS_next_fail: 5923 /* rollback the start point change */ 5924 rex->offs[0].start = st->u.keeper.val; 5925 sayNO_SILENT; 5926 NOT_REACHED; /* NOTREACHED */ 5927 5928 case MEOL: /* /..$/m */ 5929 if (!NEXTCHR_IS_EOS && nextchr != '\n') 5930 sayNO; 5931 break; 5932 5933 case SEOL: /* /..$/ */ 5934 if (!NEXTCHR_IS_EOS && nextchr != '\n') 5935 sayNO; 5936 if (reginfo->strend - locinput > 1) 5937 sayNO; 5938 break; 5939 5940 case EOS: /* \z */ 5941 if (!NEXTCHR_IS_EOS) 5942 sayNO; 5943 break; 5944 5945 case SANY: /* /./s */ 5946 if (NEXTCHR_IS_EOS || locinput >= loceol) 5947 sayNO; 5948 goto increment_locinput; 5949 5950 case REG_ANY: /* /./ */ 5951 if ( NEXTCHR_IS_EOS 5952 || locinput >= loceol 5953 || nextchr == '\n') 5954 { 5955 sayNO; 5956 } 5957 goto increment_locinput; 5958 5959 5960 #undef ST 5961 #define ST st->u.trie 5962 case TRIEC: /* (ab|cd) with known charclass */ 5963 /* In this case the charclass data is available inline so 5964 we can fail fast without a lot of extra overhead. 5965 */ 5966 if ( ! NEXTCHR_IS_EOS 5967 && locinput < loceol 5968 && ! ANYOF_BITMAP_TEST(scan, nextchr)) 5969 { 5970 DEBUG_EXECUTE_r( 5971 Perl_re_exec_indentf( aTHX_ "%sTRIE: failed to match trie start class...%s\n", 5972 depth, PL_colors[4], PL_colors[5]) 5973 ); 5974 sayNO_SILENT; 5975 NOT_REACHED; /* NOTREACHED */ 5976 } 5977 /* FALLTHROUGH */ 5978 case TRIE: /* (ab|cd) */ 5979 /* the basic plan of execution of the trie is: 5980 * At the beginning, run though all the states, and 5981 * find the longest-matching word. Also remember the position 5982 * of the shortest matching word. For example, this pattern: 5983 * 1 2 3 4 5 5984 * ab|a|x|abcd|abc 5985 * when matched against the string "abcde", will generate 5986 * accept states for all words except 3, with the longest 5987 * matching word being 4, and the shortest being 2 (with 5988 * the position being after char 1 of the string). 5989 * 5990 * Then for each matching word, in word order (i.e. 1,2,4,5), 5991 * we run the remainder of the pattern; on each try setting 5992 * the current position to the character following the word, 5993 * returning to try the next word on failure. 5994 * 5995 * We avoid having to build a list of words at runtime by 5996 * using a compile-time structure, wordinfo[].prev, which 5997 * gives, for each word, the previous accepting word (if any). 5998 * In the case above it would contain the mappings 1->2, 2->0, 5999 * 3->0, 4->5, 5->1. We can use this table to generate, from 6000 * the longest word (4 above), a list of all words, by 6001 * following the list of prev pointers; this gives us the 6002 * unordered list 4,5,1,2. Then given the current word we have 6003 * just tried, we can go through the list and find the 6004 * next-biggest word to try (so if we just failed on word 2, 6005 * the next in the list is 4). 6006 * 6007 * Since at runtime we don't record the matching position in 6008 * the string for each word, we have to work that out for 6009 * each word we're about to process. The wordinfo table holds 6010 * the character length of each word; given that we recorded 6011 * at the start: the position of the shortest word and its 6012 * length in chars, we just need to move the pointer the 6013 * difference between the two char lengths. Depending on 6014 * Unicode status and folding, that's cheap or expensive. 6015 * 6016 * This algorithm is optimised for the case where are only a 6017 * small number of accept states, i.e. 0,1, or maybe 2. 6018 * With lots of accepts states, and having to try all of them, 6019 * it becomes quadratic on number of accept states to find all 6020 * the next words. 6021 */ 6022 6023 { 6024 /* what type of TRIE am I? (utf8 makes this contextual) */ 6025 DECL_TRIE_TYPE(scan); 6026 6027 /* what trie are we using right now */ 6028 reg_trie_data * const trie 6029 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ]; 6030 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]); 6031 U32 state = trie->startstate; 6032 6033 if (scan->flags == EXACTL || scan->flags == EXACTFLU8) { 6034 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; 6035 if (utf8_target 6036 && ! NEXTCHR_IS_EOS 6037 && UTF8_IS_ABOVE_LATIN1(nextchr) 6038 && scan->flags == EXACTL) 6039 { 6040 /* We only output for EXACTL, as we let the folder 6041 * output this message for EXACTFLU8 to avoid 6042 * duplication */ 6043 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, 6044 reginfo->strend); 6045 } 6046 } 6047 if ( trie->bitmap 6048 && ( NEXTCHR_IS_EOS 6049 || locinput >= loceol 6050 || ! TRIE_BITMAP_TEST(trie, nextchr))) 6051 { 6052 if (trie->states[ state ].wordnum) { 6053 DEBUG_EXECUTE_r( 6054 Perl_re_exec_indentf( aTHX_ "%sTRIE: matched empty string...%s\n", 6055 depth, PL_colors[4], PL_colors[5]) 6056 ); 6057 if (!trie->jump) 6058 break; 6059 } else { 6060 DEBUG_EXECUTE_r( 6061 Perl_re_exec_indentf( aTHX_ "%sTRIE: failed to match trie start class...%s\n", 6062 depth, PL_colors[4], PL_colors[5]) 6063 ); 6064 sayNO_SILENT; 6065 } 6066 } 6067 6068 { 6069 U8 *uc = ( U8* )locinput; 6070 6071 STRLEN len = 0; 6072 STRLEN foldlen = 0; 6073 U8 *uscan = (U8*)NULL; 6074 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; 6075 U32 charcount = 0; /* how many input chars we have matched */ 6076 U32 accepted = 0; /* have we seen any accepting states? */ 6077 6078 ST.jump = trie->jump; 6079 ST.me = scan; 6080 ST.firstpos = NULL; 6081 ST.longfold = FALSE; /* char longer if folded => it's harder */ 6082 ST.nextword = 0; 6083 6084 /* fully traverse the TRIE; note the position of the 6085 shortest accept state and the wordnum of the longest 6086 accept state */ 6087 6088 while ( state && uc <= (U8*)(loceol) ) { 6089 U32 base = trie->states[ state ].trans.base; 6090 UV uvc = 0; 6091 U16 charid = 0; 6092 U16 wordnum; 6093 wordnum = trie->states[ state ].wordnum; 6094 6095 if (wordnum) { /* it's an accept state */ 6096 if (!accepted) { 6097 accepted = 1; 6098 /* record first match position */ 6099 if (ST.longfold) { 6100 ST.firstpos = (U8*)locinput; 6101 ST.firstchars = 0; 6102 } 6103 else { 6104 ST.firstpos = uc; 6105 ST.firstchars = charcount; 6106 } 6107 } 6108 if (!ST.nextword || wordnum < ST.nextword) 6109 ST.nextword = wordnum; 6110 ST.topword = wordnum; 6111 } 6112 6113 DEBUG_TRIE_EXECUTE_r({ 6114 DUMP_EXEC_POS( (char *)uc, scan, utf8_target, depth ); 6115 /* HERE */ 6116 PerlIO_printf( Perl_debug_log, 6117 "%*s%sTRIE: State: %4" UVxf " Accepted: %c ", 6118 INDENT_CHARS(depth), "", PL_colors[4], 6119 (UV)state, (accepted ? 'Y' : 'N')); 6120 }); 6121 6122 /* read a char and goto next state */ 6123 if ( base && (foldlen || uc < (U8*)(loceol))) { 6124 I32 offset; 6125 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, 6126 (U8 *) loceol, uscan, 6127 len, uvc, charid, foldlen, 6128 foldbuf, uniflags); 6129 charcount++; 6130 if (foldlen>0) 6131 ST.longfold = TRUE; 6132 if (charid && 6133 ( ((offset = 6134 base + charid - 1 - trie->uniquecharcount)) >= 0) 6135 6136 && ((U32)offset < trie->lasttrans) 6137 && trie->trans[offset].check == state) 6138 { 6139 state = trie->trans[offset].next; 6140 } 6141 else { 6142 state = 0; 6143 } 6144 uc += len; 6145 6146 } 6147 else { 6148 state = 0; 6149 } 6150 DEBUG_TRIE_EXECUTE_r( 6151 Perl_re_printf( aTHX_ 6152 "TRIE: Charid:%3x CP:%4" UVxf " After State: %4" UVxf "%s\n", 6153 charid, uvc, (UV)state, PL_colors[5] ); 6154 ); 6155 } 6156 if (!accepted) 6157 sayNO; 6158 6159 /* calculate total number of accept states */ 6160 { 6161 U16 w = ST.topword; 6162 accepted = 0; 6163 while (w) { 6164 w = trie->wordinfo[w].prev; 6165 accepted++; 6166 } 6167 ST.accepted = accepted; 6168 } 6169 6170 DEBUG_EXECUTE_r( 6171 Perl_re_exec_indentf( aTHX_ "%sTRIE: got %" IVdf " possible matches%s\n", 6172 depth, 6173 PL_colors[4], (IV)ST.accepted, PL_colors[5] ); 6174 ); 6175 goto trie_first_try; /* jump into the fail handler */ 6176 }} 6177 NOT_REACHED; /* NOTREACHED */ 6178 6179 case TRIE_next_fail: /* we failed - try next alternative */ 6180 { 6181 U8 *uc; 6182 if ( ST.jump ) { 6183 /* undo any captures done in the tail part of a branch, 6184 * e.g. 6185 * /(?:X(.)(.)|Y(.)).../ 6186 * where the trie just matches X then calls out to do the 6187 * rest of the branch */ 6188 REGCP_UNWIND(ST.cp); 6189 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); 6190 } 6191 if (!--ST.accepted) { 6192 DEBUG_EXECUTE_r({ 6193 Perl_re_exec_indentf( aTHX_ "%sTRIE failed...%s\n", 6194 depth, 6195 PL_colors[4], 6196 PL_colors[5] ); 6197 }); 6198 sayNO_SILENT; 6199 } 6200 { 6201 /* Find next-highest word to process. Note that this code 6202 * is O(N^2) per trie run (O(N) per branch), so keep tight */ 6203 U16 min = 0; 6204 U16 word; 6205 U16 const nextword = ST.nextword; 6206 reg_trie_wordinfo * const wordinfo 6207 = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo; 6208 for (word=ST.topword; word; word=wordinfo[word].prev) { 6209 if (word > nextword && (!min || word < min)) 6210 min = word; 6211 } 6212 ST.nextword = min; 6213 } 6214 6215 trie_first_try: 6216 if (do_cutgroup) { 6217 do_cutgroup = 0; 6218 no_final = 0; 6219 } 6220 6221 if ( ST.jump ) { 6222 ST.lastparen = rex->lastparen; 6223 ST.lastcloseparen = rex->lastcloseparen; 6224 REGCP_SET(ST.cp); 6225 } 6226 6227 /* find start char of end of current word */ 6228 { 6229 U32 chars; /* how many chars to skip */ 6230 reg_trie_data * const trie 6231 = (reg_trie_data*)rexi->data->data[ARG(ST.me)]; 6232 6233 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen) 6234 >= ST.firstchars); 6235 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen) 6236 - ST.firstchars; 6237 uc = ST.firstpos; 6238 6239 if (ST.longfold) { 6240 /* the hard option - fold each char in turn and find 6241 * its folded length (which may be different */ 6242 U8 foldbuf[UTF8_MAXBYTES_CASE + 1]; 6243 STRLEN foldlen; 6244 STRLEN len; 6245 UV uvc; 6246 U8 *uscan; 6247 6248 while (chars) { 6249 if (utf8_target) { 6250 /* XXX This assumes the length is well-formed, as 6251 * does the UTF8SKIP below */ 6252 uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len, 6253 uniflags); 6254 uc += len; 6255 } 6256 else { 6257 uvc = *uc; 6258 uc++; 6259 } 6260 uvc = to_uni_fold(uvc, foldbuf, &foldlen); 6261 uscan = foldbuf; 6262 while (foldlen) { 6263 if (!--chars) 6264 break; 6265 uvc = utf8n_to_uvchr(uscan, foldlen, &len, 6266 uniflags); 6267 uscan += len; 6268 foldlen -= len; 6269 } 6270 } 6271 } 6272 else { 6273 if (utf8_target) 6274 while (chars--) 6275 uc += UTF8SKIP(uc); 6276 else 6277 uc += chars; 6278 } 6279 } 6280 6281 scan = ST.me + ((ST.jump && ST.jump[ST.nextword]) 6282 ? ST.jump[ST.nextword] 6283 : NEXT_OFF(ST.me)); 6284 6285 DEBUG_EXECUTE_r({ 6286 Perl_re_exec_indentf( aTHX_ "%sTRIE matched word #%d, continuing%s\n", 6287 depth, 6288 PL_colors[4], 6289 ST.nextword, 6290 PL_colors[5] 6291 ); 6292 }); 6293 6294 if ( ST.accepted > 1 || has_cutgroup || ST.jump ) { 6295 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc, loceol, 6296 script_run_begin); 6297 NOT_REACHED; /* NOTREACHED */ 6298 } 6299 /* only one choice left - just continue */ 6300 DEBUG_EXECUTE_r({ 6301 AV *const trie_words 6302 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]); 6303 SV ** const tmp = trie_words 6304 ? av_fetch(trie_words, ST.nextword - 1, 0) : NULL; 6305 SV *sv= tmp ? sv_newmortal() : NULL; 6306 6307 Perl_re_exec_indentf( aTHX_ "%sTRIE: only one match left, short-circuiting: #%d <%s>%s\n", 6308 depth, PL_colors[4], 6309 ST.nextword, 6310 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0, 6311 PL_colors[0], PL_colors[1], 6312 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII 6313 ) 6314 : "not compiled under -Dr", 6315 PL_colors[5] ); 6316 }); 6317 6318 locinput = (char*)uc; 6319 continue; /* execute rest of RE */ 6320 /* NOTREACHED */ 6321 } 6322 #undef ST 6323 6324 case LEXACT_REQ8: 6325 if (! utf8_target) { 6326 sayNO; 6327 } 6328 /* FALLTHROUGH */ 6329 6330 case LEXACT: 6331 { 6332 char *s; 6333 6334 s = STRINGl(scan); 6335 ln = STR_LENl(scan); 6336 goto join_short_long_exact; 6337 6338 case EXACTL: /* /abc/l */ 6339 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; 6340 6341 /* Complete checking would involve going through every character 6342 * matched by the string to see if any is above latin1. But the 6343 * comparision otherwise might very well be a fast assembly 6344 * language routine, and I (khw) don't think slowing things down 6345 * just to check for this warning is worth it. So this just checks 6346 * the first character */ 6347 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*locinput)) { 6348 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend); 6349 } 6350 goto do_exact; 6351 case EXACT_REQ8: 6352 if (! utf8_target) { 6353 sayNO; 6354 } 6355 /* FALLTHROUGH */ 6356 6357 case EXACT: /* /abc/ */ 6358 do_exact: 6359 s = STRINGs(scan); 6360 ln = STR_LENs(scan); 6361 6362 join_short_long_exact: 6363 if (utf8_target != is_utf8_pat) { 6364 /* The target and the pattern have differing utf8ness. */ 6365 char *l = locinput; 6366 const char * const e = s + ln; 6367 6368 if (utf8_target) { 6369 /* The target is utf8, the pattern is not utf8. 6370 * Above-Latin1 code points can't match the pattern; 6371 * invariants match exactly, and the other Latin1 ones need 6372 * to be downgraded to a single byte in order to do the 6373 * comparison. (If we could be confident that the target 6374 * is not malformed, this could be refactored to have fewer 6375 * tests by just assuming that if the first bytes match, it 6376 * is an invariant, but there are tests in the test suite 6377 * dealing with (??{...}) which violate this) */ 6378 while (s < e) { 6379 if ( l >= loceol 6380 || UTF8_IS_ABOVE_LATIN1(* (U8*) l)) 6381 { 6382 sayNO; 6383 } 6384 if (UTF8_IS_INVARIANT(*(U8*)l)) { 6385 if (*l != *s) { 6386 sayNO; 6387 } 6388 l++; 6389 } 6390 else { 6391 if (EIGHT_BIT_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s) 6392 { 6393 sayNO; 6394 } 6395 l += 2; 6396 } 6397 s++; 6398 } 6399 } 6400 else { 6401 /* The target is not utf8, the pattern is utf8. */ 6402 while (s < e) { 6403 if ( l >= loceol 6404 || UTF8_IS_ABOVE_LATIN1(* (U8*) s)) 6405 { 6406 sayNO; 6407 } 6408 if (UTF8_IS_INVARIANT(*(U8*)s)) { 6409 if (*s != *l) { 6410 sayNO; 6411 } 6412 s++; 6413 } 6414 else { 6415 if (EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l) 6416 { 6417 sayNO; 6418 } 6419 s += 2; 6420 } 6421 l++; 6422 } 6423 } 6424 locinput = l; 6425 } 6426 else { 6427 /* The target and the pattern have the same utf8ness. */ 6428 /* Inline the first character, for speed. */ 6429 if ( loceol - locinput < ln 6430 || UCHARAT(s) != nextchr 6431 || (ln > 1 && memNE(s, locinput, ln))) 6432 { 6433 sayNO; 6434 } 6435 locinput += ln; 6436 } 6437 break; 6438 } 6439 6440 case EXACTFL: /* /abc/il */ 6441 { 6442 re_fold_t folder; 6443 const U8 * fold_array; 6444 const char * s; 6445 U32 fold_utf8_flags; 6446 6447 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; 6448 folder = foldEQ_locale; 6449 fold_array = PL_fold_locale; 6450 fold_utf8_flags = FOLDEQ_LOCALE; 6451 goto do_exactf; 6452 6453 case EXACTFLU8: /* /abc/il; but all 'abc' are above 255, so 6454 is effectively /u; hence to match, target 6455 must be UTF-8. */ 6456 if (! utf8_target) { 6457 sayNO; 6458 } 6459 fold_utf8_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED 6460 | FOLDEQ_S2_FOLDS_SANE; 6461 folder = foldEQ_latin1_s2_folded; 6462 fold_array = PL_fold_latin1; 6463 goto do_exactf; 6464 6465 case EXACTFU_REQ8: /* /abc/iu with something in /abc/ > 255 */ 6466 if (! utf8_target) { 6467 sayNO; 6468 } 6469 assert(is_utf8_pat); 6470 fold_utf8_flags = FOLDEQ_S2_ALREADY_FOLDED; 6471 goto do_exactf; 6472 6473 case EXACTFUP: /* /foo/iu, and something is problematic in 6474 'foo' so can't take shortcuts. */ 6475 assert(! is_utf8_pat); 6476 folder = foldEQ_latin1; 6477 fold_array = PL_fold_latin1; 6478 fold_utf8_flags = 0; 6479 goto do_exactf; 6480 6481 case EXACTFU: /* /abc/iu */ 6482 folder = foldEQ_latin1_s2_folded; 6483 fold_array = PL_fold_latin1; 6484 fold_utf8_flags = FOLDEQ_S2_ALREADY_FOLDED; 6485 goto do_exactf; 6486 6487 case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8 6488 patterns */ 6489 assert(! is_utf8_pat); 6490 /* FALLTHROUGH */ 6491 case EXACTFAA: /* /abc/iaa */ 6492 folder = foldEQ_latin1_s2_folded; 6493 fold_array = PL_fold_latin1; 6494 fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII; 6495 if (is_utf8_pat || ! utf8_target) { 6496 6497 /* The possible presence of a MICRO SIGN in the pattern forbids 6498 * us to view a non-UTF-8 pattern as folded when there is a 6499 * UTF-8 target */ 6500 fold_utf8_flags |= FOLDEQ_S2_ALREADY_FOLDED 6501 |FOLDEQ_S2_FOLDS_SANE; 6502 } 6503 goto do_exactf; 6504 6505 6506 case EXACTF: /* /abc/i This node only generated for 6507 non-utf8 patterns */ 6508 assert(! is_utf8_pat); 6509 folder = foldEQ; 6510 fold_array = PL_fold; 6511 fold_utf8_flags = 0; 6512 6513 do_exactf: 6514 s = STRINGs(scan); 6515 ln = STR_LENs(scan); 6516 6517 if ( utf8_target 6518 || is_utf8_pat 6519 || state_num == EXACTFUP 6520 || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE)) 6521 { 6522 /* Either target or the pattern are utf8, or has the issue where 6523 * the fold lengths may differ. */ 6524 const char * const l = locinput; 6525 char *e = loceol; 6526 6527 if (! foldEQ_utf8_flags(l, &e, 0, utf8_target, 6528 s, 0, ln, is_utf8_pat,fold_utf8_flags)) 6529 { 6530 sayNO; 6531 } 6532 locinput = e; 6533 break; 6534 } 6535 6536 /* Neither the target nor the pattern are utf8 */ 6537 if (UCHARAT(s) != nextchr 6538 && !NEXTCHR_IS_EOS 6539 && UCHARAT(s) != fold_array[nextchr]) 6540 { 6541 sayNO; 6542 } 6543 if (loceol - locinput < ln) 6544 sayNO; 6545 if (ln > 1 && ! folder(locinput, s, ln)) 6546 sayNO; 6547 locinput += ln; 6548 break; 6549 } 6550 6551 case NBOUNDL: /* /\B/l */ 6552 to_complement = 1; 6553 /* FALLTHROUGH */ 6554 6555 case BOUNDL: /* /\b/l */ 6556 { 6557 bool b1, b2; 6558 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; 6559 6560 if (FLAGS(scan) != TRADITIONAL_BOUND) { 6561 if (! IN_UTF8_CTYPE_LOCALE) { 6562 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), 6563 B_ON_NON_UTF8_LOCALE_IS_WRONG); 6564 } 6565 goto boundu; 6566 } 6567 6568 if (utf8_target) { 6569 if (locinput == reginfo->strbeg) 6570 b1 = isWORDCHAR_LC('\n'); 6571 else { 6572 U8 *p = reghop3((U8*)locinput, -1, 6573 (U8*)(reginfo->strbeg)); 6574 b1 = isWORDCHAR_LC_utf8_safe(p, (U8*)(reginfo->strend)); 6575 } 6576 b2 = (NEXTCHR_IS_EOS) 6577 ? isWORDCHAR_LC('\n') 6578 : isWORDCHAR_LC_utf8_safe((U8*) locinput, 6579 (U8*) reginfo->strend); 6580 } 6581 else { /* Here the string isn't utf8 */ 6582 b1 = (locinput == reginfo->strbeg) 6583 ? isWORDCHAR_LC('\n') 6584 : isWORDCHAR_LC(UCHARAT(locinput - 1)); 6585 b2 = (NEXTCHR_IS_EOS) 6586 ? isWORDCHAR_LC('\n') 6587 : isWORDCHAR_LC(nextchr); 6588 } 6589 if (to_complement ^ (b1 == b2)) { 6590 sayNO; 6591 } 6592 break; 6593 } 6594 6595 case NBOUND: /* /\B/ */ 6596 to_complement = 1; 6597 /* FALLTHROUGH */ 6598 6599 case BOUND: /* /\b/ */ 6600 if (utf8_target) { 6601 goto bound_utf8; 6602 } 6603 goto bound_ascii_match_only; 6604 6605 case NBOUNDA: /* /\B/a */ 6606 to_complement = 1; 6607 /* FALLTHROUGH */ 6608 6609 case BOUNDA: /* /\b/a */ 6610 { 6611 bool b1, b2; 6612 6613 bound_ascii_match_only: 6614 /* Here the string isn't utf8, or is utf8 and only ascii characters 6615 * are to match \w. In the latter case looking at the byte just 6616 * prior to the current one may be just the final byte of a 6617 * multi-byte character. This is ok. There are two cases: 6618 * 1) it is a single byte character, and then the test is doing 6619 * just what it's supposed to. 6620 * 2) it is a multi-byte character, in which case the final byte is 6621 * never mistakable for ASCII, and so the test will say it is 6622 * not a word character, which is the correct answer. */ 6623 b1 = (locinput == reginfo->strbeg) 6624 ? isWORDCHAR_A('\n') 6625 : isWORDCHAR_A(UCHARAT(locinput - 1)); 6626 b2 = (NEXTCHR_IS_EOS) 6627 ? isWORDCHAR_A('\n') 6628 : isWORDCHAR_A(nextchr); 6629 if (to_complement ^ (b1 == b2)) { 6630 sayNO; 6631 } 6632 break; 6633 } 6634 6635 case NBOUNDU: /* /\B/u */ 6636 to_complement = 1; 6637 /* FALLTHROUGH */ 6638 6639 case BOUNDU: /* /\b/u */ 6640 6641 boundu: 6642 if (UNLIKELY(reginfo->strbeg >= reginfo->strend)) { 6643 match = FALSE; 6644 } 6645 else if (utf8_target) { 6646 bound_utf8: 6647 switch((bound_type) FLAGS(scan)) { 6648 case TRADITIONAL_BOUND: 6649 { 6650 bool b1, b2; 6651 if (locinput == reginfo->strbeg) { 6652 b1 = 0 /* isWORDCHAR_L1('\n') */; 6653 } 6654 else { 6655 U8 *p = reghop3((U8*)locinput, -1, 6656 (U8*)(reginfo->strbeg)); 6657 6658 b1 = isWORDCHAR_utf8_safe(p, (U8*) reginfo->strend); 6659 } 6660 b2 = (NEXTCHR_IS_EOS) 6661 ? 0 /* isWORDCHAR_L1('\n') */ 6662 : isWORDCHAR_utf8_safe((U8*)locinput, 6663 (U8*) reginfo->strend); 6664 match = cBOOL(b1 != b2); 6665 break; 6666 } 6667 case GCB_BOUND: 6668 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) { 6669 match = TRUE; /* GCB always matches at begin and 6670 end */ 6671 } 6672 else { 6673 /* Find the gcb values of previous and current 6674 * chars, then see if is a break point */ 6675 match = isGCB(getGCB_VAL_UTF8( 6676 reghop3((U8*)locinput, 6677 -1, 6678 (U8*)(reginfo->strbeg)), 6679 (U8*) reginfo->strend), 6680 getGCB_VAL_UTF8((U8*) locinput, 6681 (U8*) reginfo->strend), 6682 (U8*) reginfo->strbeg, 6683 (U8*) locinput, 6684 utf8_target); 6685 } 6686 break; 6687 6688 case LB_BOUND: 6689 if (locinput == reginfo->strbeg) { 6690 match = FALSE; 6691 } 6692 else if (NEXTCHR_IS_EOS) { 6693 match = TRUE; 6694 } 6695 else { 6696 match = isLB(getLB_VAL_UTF8( 6697 reghop3((U8*)locinput, 6698 -1, 6699 (U8*)(reginfo->strbeg)), 6700 (U8*) reginfo->strend), 6701 getLB_VAL_UTF8((U8*) locinput, 6702 (U8*) reginfo->strend), 6703 (U8*) reginfo->strbeg, 6704 (U8*) locinput, 6705 (U8*) reginfo->strend, 6706 utf8_target); 6707 } 6708 break; 6709 6710 case SB_BOUND: /* Always matches at begin and end */ 6711 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) { 6712 match = TRUE; 6713 } 6714 else { 6715 match = isSB(getSB_VAL_UTF8( 6716 reghop3((U8*)locinput, 6717 -1, 6718 (U8*)(reginfo->strbeg)), 6719 (U8*) reginfo->strend), 6720 getSB_VAL_UTF8((U8*) locinput, 6721 (U8*) reginfo->strend), 6722 (U8*) reginfo->strbeg, 6723 (U8*) locinput, 6724 (U8*) reginfo->strend, 6725 utf8_target); 6726 } 6727 break; 6728 6729 case WB_BOUND: 6730 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) { 6731 match = TRUE; 6732 } 6733 else { 6734 match = isWB(WB_UNKNOWN, 6735 getWB_VAL_UTF8( 6736 reghop3((U8*)locinput, 6737 -1, 6738 (U8*)(reginfo->strbeg)), 6739 (U8*) reginfo->strend), 6740 getWB_VAL_UTF8((U8*) locinput, 6741 (U8*) reginfo->strend), 6742 (U8*) reginfo->strbeg, 6743 (U8*) locinput, 6744 (U8*) reginfo->strend, 6745 utf8_target); 6746 } 6747 break; 6748 } 6749 } 6750 else { /* Not utf8 target */ 6751 switch((bound_type) FLAGS(scan)) { 6752 case TRADITIONAL_BOUND: 6753 { 6754 bool b1, b2; 6755 b1 = (locinput == reginfo->strbeg) 6756 ? 0 /* isWORDCHAR_L1('\n') */ 6757 : isWORDCHAR_L1(UCHARAT(locinput - 1)); 6758 b2 = (NEXTCHR_IS_EOS) 6759 ? 0 /* isWORDCHAR_L1('\n') */ 6760 : isWORDCHAR_L1(nextchr); 6761 match = cBOOL(b1 != b2); 6762 break; 6763 } 6764 6765 case GCB_BOUND: 6766 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) { 6767 match = TRUE; /* GCB always matches at begin and 6768 end */ 6769 } 6770 else { /* Only CR-LF combo isn't a GCB in 0-255 6771 range */ 6772 match = UCHARAT(locinput - 1) != '\r' 6773 || UCHARAT(locinput) != '\n'; 6774 } 6775 break; 6776 6777 case LB_BOUND: 6778 if (locinput == reginfo->strbeg) { 6779 match = FALSE; 6780 } 6781 else if (NEXTCHR_IS_EOS) { 6782 match = TRUE; 6783 } 6784 else { 6785 match = isLB(getLB_VAL_CP(UCHARAT(locinput -1)), 6786 getLB_VAL_CP(UCHARAT(locinput)), 6787 (U8*) reginfo->strbeg, 6788 (U8*) locinput, 6789 (U8*) reginfo->strend, 6790 utf8_target); 6791 } 6792 break; 6793 6794 case SB_BOUND: /* Always matches at begin and end */ 6795 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) { 6796 match = TRUE; 6797 } 6798 else { 6799 match = isSB(getSB_VAL_CP(UCHARAT(locinput -1)), 6800 getSB_VAL_CP(UCHARAT(locinput)), 6801 (U8*) reginfo->strbeg, 6802 (U8*) locinput, 6803 (U8*) reginfo->strend, 6804 utf8_target); 6805 } 6806 break; 6807 6808 case WB_BOUND: 6809 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) { 6810 match = TRUE; 6811 } 6812 else { 6813 match = isWB(WB_UNKNOWN, 6814 getWB_VAL_CP(UCHARAT(locinput -1)), 6815 getWB_VAL_CP(UCHARAT(locinput)), 6816 (U8*) reginfo->strbeg, 6817 (U8*) locinput, 6818 (U8*) reginfo->strend, 6819 utf8_target); 6820 } 6821 break; 6822 } 6823 } 6824 6825 if (to_complement ^ ! match) { 6826 sayNO; 6827 } 6828 break; 6829 6830 case ANYOFPOSIXL: 6831 case ANYOFL: /* /[abc]/l */ 6832 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; 6833 6834 if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(scan)) && ! IN_UTF8_CTYPE_LOCALE) 6835 { 6836 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required); 6837 } 6838 /* FALLTHROUGH */ 6839 case ANYOFD: /* /[abc]/d */ 6840 case ANYOF: /* /[abc]/ */ 6841 if (NEXTCHR_IS_EOS || locinput >= loceol) 6842 sayNO; 6843 if ( (! utf8_target || UTF8_IS_INVARIANT(*locinput)) 6844 && ! (ANYOF_FLAGS(scan) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP)) 6845 { 6846 if (! ANYOF_BITMAP_TEST(scan, * (U8 *) (locinput))) { 6847 sayNO; 6848 } 6849 locinput++; 6850 } 6851 else { 6852 if (!reginclass(rex, scan, (U8*)locinput, (U8*) loceol, 6853 utf8_target)) 6854 { 6855 sayNO; 6856 } 6857 goto increment_locinput; 6858 } 6859 break; 6860 6861 case ANYOFM: 6862 if ( NEXTCHR_IS_EOS 6863 || (UCHARAT(locinput) & FLAGS(scan)) != ARG(scan) 6864 || locinput >= loceol) 6865 { 6866 sayNO; 6867 } 6868 locinput++; /* ANYOFM is always single byte */ 6869 break; 6870 6871 case NANYOFM: 6872 if ( NEXTCHR_IS_EOS 6873 || (UCHARAT(locinput) & FLAGS(scan)) == ARG(scan) 6874 || locinput >= loceol) 6875 { 6876 sayNO; 6877 } 6878 goto increment_locinput; 6879 break; 6880 6881 case ANYOFH: 6882 if ( ! utf8_target 6883 || NEXTCHR_IS_EOS 6884 || ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8(*locinput) 6885 || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol, 6886 utf8_target)) 6887 { 6888 sayNO; 6889 } 6890 goto increment_locinput; 6891 break; 6892 6893 case ANYOFHb: 6894 if ( ! utf8_target 6895 || NEXTCHR_IS_EOS 6896 || ANYOF_FLAGS(scan) != (U8) *locinput 6897 || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol, 6898 utf8_target)) 6899 { 6900 sayNO; 6901 } 6902 goto increment_locinput; 6903 break; 6904 6905 case ANYOFHr: 6906 if ( ! utf8_target 6907 || NEXTCHR_IS_EOS 6908 || ! inRANGE((U8) NATIVE_UTF8_TO_I8(*locinput), 6909 LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan)), 6910 HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan))) 6911 || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol, 6912 utf8_target)) 6913 { 6914 sayNO; 6915 } 6916 goto increment_locinput; 6917 break; 6918 6919 case ANYOFHs: 6920 if ( ! utf8_target 6921 || NEXTCHR_IS_EOS 6922 || loceol - locinput < FLAGS(scan) 6923 || memNE(locinput, ((struct regnode_anyofhs *) scan)->string, FLAGS(scan)) 6924 || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol, 6925 utf8_target)) 6926 { 6927 sayNO; 6928 } 6929 goto increment_locinput; 6930 break; 6931 6932 case ANYOFR: 6933 if (NEXTCHR_IS_EOS) { 6934 sayNO; 6935 } 6936 6937 if (utf8_target) { 6938 if ( ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8(*locinput) 6939 || ! withinCOUNT(utf8_to_uvchr_buf((U8 *) locinput, 6940 (U8 *) reginfo->strend, 6941 NULL), 6942 ANYOFRbase(scan), ANYOFRdelta(scan))) 6943 { 6944 sayNO; 6945 } 6946 } 6947 else { 6948 if (! withinCOUNT((U8) *locinput, 6949 ANYOFRbase(scan), ANYOFRdelta(scan))) 6950 { 6951 sayNO; 6952 } 6953 } 6954 goto increment_locinput; 6955 break; 6956 6957 case ANYOFRb: 6958 if (NEXTCHR_IS_EOS) { 6959 sayNO; 6960 } 6961 6962 if (utf8_target) { 6963 if ( ANYOF_FLAGS(scan) != (U8) *locinput 6964 || ! withinCOUNT(utf8_to_uvchr_buf((U8 *) locinput, 6965 (U8 *) reginfo->strend, 6966 NULL), 6967 ANYOFRbase(scan), ANYOFRdelta(scan))) 6968 { 6969 sayNO; 6970 } 6971 } 6972 else { 6973 if (! withinCOUNT((U8) *locinput, 6974 ANYOFRbase(scan), ANYOFRdelta(scan))) 6975 { 6976 sayNO; 6977 } 6978 } 6979 goto increment_locinput; 6980 break; 6981 6982 /* The argument (FLAGS) to all the POSIX node types is the class number 6983 * */ 6984 6985 case NPOSIXL: /* \W or [:^punct:] etc. under /l */ 6986 to_complement = 1; 6987 /* FALLTHROUGH */ 6988 6989 case POSIXL: /* \w or [:punct:] etc. under /l */ 6990 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; 6991 if (NEXTCHR_IS_EOS || locinput >= loceol) 6992 sayNO; 6993 6994 /* Use isFOO_lc() for characters within Latin1. (Note that 6995 * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else 6996 * wouldn't be invariant) */ 6997 if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) { 6998 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) { 6999 sayNO; 7000 } 7001 7002 locinput++; 7003 break; 7004 } 7005 7006 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) { 7007 /* An above Latin-1 code point, or malformed */ 7008 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, 7009 reginfo->strend); 7010 goto utf8_posix_above_latin1; 7011 } 7012 7013 /* Here is a UTF-8 variant code point below 256 and the target is 7014 * UTF-8 */ 7015 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), 7016 EIGHT_BIT_UTF8_TO_NATIVE(nextchr, 7017 *(locinput + 1)))))) 7018 { 7019 sayNO; 7020 } 7021 7022 goto increment_locinput; 7023 7024 case NPOSIXD: /* \W or [:^punct:] etc. under /d */ 7025 to_complement = 1; 7026 /* FALLTHROUGH */ 7027 7028 case POSIXD: /* \w or [:punct:] etc. under /d */ 7029 if (utf8_target) { 7030 goto utf8_posix; 7031 } 7032 goto posixa; 7033 7034 case NPOSIXA: /* \W or [:^punct:] etc. under /a */ 7035 7036 if (NEXTCHR_IS_EOS || locinput >= loceol) { 7037 sayNO; 7038 } 7039 7040 /* All UTF-8 variants match */ 7041 if (! UTF8_IS_INVARIANT(nextchr)) { 7042 goto increment_locinput; 7043 } 7044 7045 to_complement = 1; 7046 goto join_nposixa; 7047 7048 case POSIXA: /* \w or [:punct:] etc. under /a */ 7049 7050 posixa: 7051 /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in 7052 * UTF-8, and also from NPOSIXA even in UTF-8 when the current 7053 * character is a single byte */ 7054 7055 if (NEXTCHR_IS_EOS || locinput >= loceol) { 7056 sayNO; 7057 } 7058 7059 join_nposixa: 7060 7061 if (! (to_complement ^ cBOOL(_generic_isCC_A(nextchr, 7062 FLAGS(scan))))) 7063 { 7064 sayNO; 7065 } 7066 7067 /* Here we are either not in utf8, or we matched a utf8-invariant, 7068 * so the next char is the next byte */ 7069 locinput++; 7070 break; 7071 7072 case NPOSIXU: /* \W or [:^punct:] etc. under /u */ 7073 to_complement = 1; 7074 /* FALLTHROUGH */ 7075 7076 case POSIXU: /* \w or [:punct:] etc. under /u */ 7077 utf8_posix: 7078 if (NEXTCHR_IS_EOS || locinput >= loceol) { 7079 sayNO; 7080 } 7081 7082 /* Use _generic_isCC() for characters within Latin1. (Note that 7083 * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else 7084 * wouldn't be invariant) */ 7085 if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) { 7086 if (! (to_complement ^ cBOOL(_generic_isCC(nextchr, 7087 FLAGS(scan))))) 7088 { 7089 sayNO; 7090 } 7091 locinput++; 7092 } 7093 else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) { 7094 if (! (to_complement 7095 ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(nextchr, 7096 *(locinput + 1)), 7097 FLAGS(scan))))) 7098 { 7099 sayNO; 7100 } 7101 locinput += 2; 7102 } 7103 else { /* Handle above Latin-1 code points */ 7104 utf8_posix_above_latin1: 7105 classnum = (_char_class_number) FLAGS(scan); 7106 switch (classnum) { 7107 default: 7108 if (! (to_complement 7109 ^ cBOOL(_invlist_contains_cp( 7110 PL_XPosix_ptrs[classnum], 7111 utf8_to_uvchr_buf((U8 *) locinput, 7112 (U8 *) reginfo->strend, 7113 NULL))))) 7114 { 7115 sayNO; 7116 } 7117 break; 7118 case _CC_ENUM_SPACE: 7119 if (! (to_complement 7120 ^ cBOOL(is_XPERLSPACE_high(locinput)))) 7121 { 7122 sayNO; 7123 } 7124 break; 7125 case _CC_ENUM_BLANK: 7126 if (! (to_complement 7127 ^ cBOOL(is_HORIZWS_high(locinput)))) 7128 { 7129 sayNO; 7130 } 7131 break; 7132 case _CC_ENUM_XDIGIT: 7133 if (! (to_complement 7134 ^ cBOOL(is_XDIGIT_high(locinput)))) 7135 { 7136 sayNO; 7137 } 7138 break; 7139 case _CC_ENUM_VERTSPACE: 7140 if (! (to_complement 7141 ^ cBOOL(is_VERTWS_high(locinput)))) 7142 { 7143 sayNO; 7144 } 7145 break; 7146 case _CC_ENUM_CNTRL: /* These can't match above Latin1 */ 7147 case _CC_ENUM_ASCII: 7148 if (! to_complement) { 7149 sayNO; 7150 } 7151 break; 7152 } 7153 locinput += UTF8_SAFE_SKIP(locinput, reginfo->strend); 7154 } 7155 break; 7156 7157 case CLUMP: /* Match \X: logical Unicode character. This is defined as 7158 a Unicode extended Grapheme Cluster */ 7159 if (NEXTCHR_IS_EOS || locinput >= loceol) 7160 sayNO; 7161 if (! utf8_target) { 7162 7163 /* Match either CR LF or '.', as all the other possibilities 7164 * require utf8 */ 7165 locinput++; /* Match the . or CR */ 7166 if (nextchr == '\r' /* And if it was CR, and the next is LF, 7167 match the LF */ 7168 && locinput < loceol 7169 && UCHARAT(locinput) == '\n') 7170 { 7171 locinput++; 7172 } 7173 } 7174 else { 7175 7176 /* Get the gcb type for the current character */ 7177 GCB_enum prev_gcb = getGCB_VAL_UTF8((U8*) locinput, 7178 (U8*) reginfo->strend); 7179 7180 /* Then scan through the input until we get to the first 7181 * character whose type is supposed to be a gcb with the 7182 * current character. (There is always a break at the 7183 * end-of-input) */ 7184 locinput += UTF8SKIP(locinput); 7185 while (locinput < loceol) { 7186 GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput, 7187 (U8*) reginfo->strend); 7188 if (isGCB(prev_gcb, cur_gcb, 7189 (U8*) reginfo->strbeg, (U8*) locinput, 7190 utf8_target)) 7191 { 7192 break; 7193 } 7194 7195 prev_gcb = cur_gcb; 7196 locinput += UTF8SKIP(locinput); 7197 } 7198 7199 7200 } 7201 break; 7202 7203 case REFFLN: /* /\g{name}/il */ 7204 { /* The capture buffer cases. The ones beginning with N for the 7205 named buffers just convert to the equivalent numbered and 7206 pretend they were called as the corresponding numbered buffer 7207 op. */ 7208 /* don't initialize these in the declaration, it makes C++ 7209 unhappy */ 7210 const char *s; 7211 char type; 7212 re_fold_t folder; 7213 const U8 *fold_array; 7214 UV utf8_fold_flags; 7215 7216 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; 7217 folder = foldEQ_locale; 7218 fold_array = PL_fold_locale; 7219 type = REFFL; 7220 utf8_fold_flags = FOLDEQ_LOCALE; 7221 goto do_nref; 7222 7223 case REFFAN: /* /\g{name}/iaa */ 7224 folder = foldEQ_latin1; 7225 fold_array = PL_fold_latin1; 7226 type = REFFA; 7227 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII; 7228 goto do_nref; 7229 7230 case REFFUN: /* /\g{name}/iu */ 7231 folder = foldEQ_latin1; 7232 fold_array = PL_fold_latin1; 7233 type = REFFU; 7234 utf8_fold_flags = 0; 7235 goto do_nref; 7236 7237 case REFFN: /* /\g{name}/i */ 7238 folder = foldEQ; 7239 fold_array = PL_fold; 7240 type = REFF; 7241 utf8_fold_flags = 0; 7242 goto do_nref; 7243 7244 case REFN: /* /\g{name}/ */ 7245 type = REF; 7246 folder = NULL; 7247 fold_array = NULL; 7248 utf8_fold_flags = 0; 7249 do_nref: 7250 7251 /* For the named back references, find the corresponding buffer 7252 * number */ 7253 n = reg_check_named_buff_matched(rex,scan); 7254 7255 if ( ! n ) { 7256 sayNO; 7257 } 7258 goto do_nref_ref_common; 7259 7260 case REFFL: /* /\1/il */ 7261 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; 7262 folder = foldEQ_locale; 7263 fold_array = PL_fold_locale; 7264 utf8_fold_flags = FOLDEQ_LOCALE; 7265 goto do_ref; 7266 7267 case REFFA: /* /\1/iaa */ 7268 folder = foldEQ_latin1; 7269 fold_array = PL_fold_latin1; 7270 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII; 7271 goto do_ref; 7272 7273 case REFFU: /* /\1/iu */ 7274 folder = foldEQ_latin1; 7275 fold_array = PL_fold_latin1; 7276 utf8_fold_flags = 0; 7277 goto do_ref; 7278 7279 case REFF: /* /\1/i */ 7280 folder = foldEQ; 7281 fold_array = PL_fold; 7282 utf8_fold_flags = 0; 7283 goto do_ref; 7284 7285 case REF: /* /\1/ */ 7286 folder = NULL; 7287 fold_array = NULL; 7288 utf8_fold_flags = 0; 7289 7290 do_ref: 7291 type = OP(scan); 7292 n = ARG(scan); /* which paren pair */ 7293 7294 do_nref_ref_common: 7295 ln = rex->offs[n].start; 7296 endref = rex->offs[n].end; 7297 reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */ 7298 if (rex->lastparen < n || ln == -1 || endref == -1) 7299 sayNO; /* Do not match unless seen CLOSEn. */ 7300 if (ln == endref) 7301 break; 7302 7303 s = reginfo->strbeg + ln; 7304 if (type != REF /* REF can do byte comparison */ 7305 && (utf8_target || type == REFFU || type == REFFL)) 7306 { 7307 char * limit = loceol; 7308 7309 /* This call case insensitively compares the entire buffer 7310 * at s, with the current input starting at locinput, but 7311 * not going off the end given by loceol, and 7312 * returns in <limit> upon success, how much of the 7313 * current input was matched */ 7314 if (! foldEQ_utf8_flags(s, NULL, endref - ln, utf8_target, 7315 locinput, &limit, 0, utf8_target, utf8_fold_flags)) 7316 { 7317 sayNO; 7318 } 7319 locinput = limit; 7320 break; 7321 } 7322 7323 /* Not utf8: Inline the first character, for speed. */ 7324 if ( ! NEXTCHR_IS_EOS 7325 && locinput < loceol 7326 && UCHARAT(s) != nextchr 7327 && ( type == REF 7328 || UCHARAT(s) != fold_array[nextchr])) 7329 { 7330 sayNO; 7331 } 7332 ln = endref - ln; 7333 if (locinput + ln > loceol) 7334 sayNO; 7335 if (ln > 1 && (type == REF 7336 ? memNE(s, locinput, ln) 7337 : ! folder(locinput, s, ln))) 7338 sayNO; 7339 locinput += ln; 7340 break; 7341 } 7342 7343 case NOTHING: /* null op; e.g. the 'nothing' following 7344 * the '*' in m{(a+|b)*}' */ 7345 break; 7346 case TAIL: /* placeholder while compiling (A|B|C) */ 7347 break; 7348 7349 #undef ST 7350 #define ST st->u.eval 7351 #define CUR_EVAL cur_eval->u.eval 7352 7353 { 7354 SV *ret; 7355 REGEXP *re_sv; 7356 regexp *re; 7357 regexp_internal *rei; 7358 regnode *startpoint; 7359 U32 arg; 7360 7361 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */ 7362 arg= (U32)ARG(scan); 7363 if (cur_eval && cur_eval->locinput == locinput) { 7364 if ( ++nochange_depth > max_nochange_depth ) 7365 Perl_croak(aTHX_ 7366 "Pattern subroutine nesting without pos change" 7367 " exceeded limit in regex"); 7368 } else { 7369 nochange_depth = 0; 7370 } 7371 re_sv = rex_sv; 7372 re = rex; 7373 rei = rexi; 7374 startpoint = scan + ARG2L(scan); 7375 EVAL_CLOSE_PAREN_SET( st, arg ); 7376 /* Detect infinite recursion 7377 * 7378 * A pattern like /(?R)foo/ or /(?<x>(?&y)foo)(?<y>(?&x)bar)/ 7379 * or "a"=~/(.(?2))((?<=(?=(?1)).))/ could recurse forever. 7380 * So we track the position in the string we are at each time 7381 * we recurse and if we try to enter the same routine twice from 7382 * the same position we throw an error. 7383 */ 7384 if ( rex->recurse_locinput[arg] == locinput ) { 7385 /* FIXME: we should show the regop that is failing as part 7386 * of the error message. */ 7387 Perl_croak(aTHX_ "Infinite recursion in regex"); 7388 } else { 7389 ST.prev_recurse_locinput= rex->recurse_locinput[arg]; 7390 rex->recurse_locinput[arg]= locinput; 7391 7392 DEBUG_r({ 7393 DECLARE_AND_GET_RE_DEBUG_FLAGS; 7394 DEBUG_STACK_r({ 7395 Perl_re_exec_indentf( aTHX_ 7396 "entering GOSUB, prev_recurse_locinput=%p recurse_locinput[%d]=%p\n", 7397 depth, ST.prev_recurse_locinput, arg, rex->recurse_locinput[arg] 7398 ); 7399 }); 7400 }); 7401 } 7402 7403 /* Save all the positions seen so far. */ 7404 ST.cp = regcppush(rex, 0, maxopenparen); 7405 REGCP_SET(ST.lastcp); 7406 7407 /* and then jump to the code we share with EVAL */ 7408 goto eval_recurse_doit; 7409 /* NOTREACHED */ 7410 7411 case EVAL: /* /(?{...})B/ /(??{A})B/ and /(?(?{...})X|Y)B/ */ 7412 if (logical == 2 && cur_eval && cur_eval->locinput==locinput) { 7413 if ( ++nochange_depth > max_nochange_depth ) 7414 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex"); 7415 } else { 7416 nochange_depth = 0; 7417 } 7418 { 7419 /* execute the code in the {...} */ 7420 7421 dSP; 7422 IV before; 7423 OP * const oop = PL_op; 7424 COP * const ocurcop = PL_curcop; 7425 OP *nop; 7426 CV *newcv; 7427 7428 /* save *all* paren positions */ 7429 regcppush(rex, 0, maxopenparen); 7430 REGCP_SET(ST.lastcp); 7431 7432 if (!caller_cv) 7433 caller_cv = find_runcv(NULL); 7434 7435 n = ARG(scan); 7436 7437 if (rexi->data->what[n] == 'r') { /* code from an external qr */ 7438 newcv = (ReANY( 7439 (REGEXP*)(rexi->data->data[n]) 7440 ))->qr_anoncv; 7441 nop = (OP*)rexi->data->data[n+1]; 7442 } 7443 else if (rexi->data->what[n] == 'l') { /* literal code */ 7444 newcv = caller_cv; 7445 nop = (OP*)rexi->data->data[n]; 7446 assert(CvDEPTH(newcv)); 7447 } 7448 else { 7449 /* literal with own CV */ 7450 assert(rexi->data->what[n] == 'L'); 7451 newcv = rex->qr_anoncv; 7452 nop = (OP*)rexi->data->data[n]; 7453 } 7454 7455 /* Some notes about MULTICALL and the context and save stacks. 7456 * 7457 * In something like 7458 * /...(?{ my $x)}...(?{ my $y)}...(?{ my $z)}.../ 7459 * since codeblocks don't introduce a new scope (so that 7460 * local() etc accumulate), at the end of a successful 7461 * match there will be a SAVEt_CLEARSV on the savestack 7462 * for each of $x, $y, $z. If the three code blocks above 7463 * happen to have come from different CVs (e.g. via 7464 * embedded qr//s), then we must ensure that during any 7465 * savestack unwinding, PL_comppad always points to the 7466 * right pad at each moment. We achieve this by 7467 * interleaving SAVEt_COMPPAD's on the savestack whenever 7468 * there is a change of pad. 7469 * In theory whenever we call a code block, we should 7470 * push a CXt_SUB context, then pop it on return from 7471 * that code block. This causes a bit of an issue in that 7472 * normally popping a context also clears the savestack 7473 * back to cx->blk_oldsaveix, but here we specifically 7474 * don't want to clear the save stack on exit from the 7475 * code block. 7476 * Also for efficiency we don't want to keep pushing and 7477 * popping the single SUB context as we backtrack etc. 7478 * So instead, we push a single context the first time 7479 * we need, it, then hang onto it until the end of this 7480 * function. Whenever we encounter a new code block, we 7481 * update the CV etc if that's changed. During the times 7482 * in this function where we're not executing a code 7483 * block, having the SUB context still there is a bit 7484 * naughty - but we hope that no-one notices. 7485 * When the SUB context is initially pushed, we fake up 7486 * cx->blk_oldsaveix to be as if we'd pushed this context 7487 * on first entry to S_regmatch rather than at some random 7488 * point during the regexe execution. That way if we 7489 * croak, popping the context stack will ensure that 7490 * *everything* SAVEd by this function is undone and then 7491 * the context popped, rather than e.g., popping the 7492 * context (and restoring the original PL_comppad) then 7493 * popping more of the savestack and restoring a bad 7494 * PL_comppad. 7495 */ 7496 7497 /* If this is the first EVAL, push a MULTICALL. On 7498 * subsequent calls, if we're executing a different CV, or 7499 * if PL_comppad has got messed up from backtracking 7500 * through SAVECOMPPADs, then refresh the context. 7501 */ 7502 if (newcv != last_pushed_cv || PL_comppad != last_pad) 7503 { 7504 U8 flags = (CXp_SUB_RE | 7505 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0)); 7506 SAVECOMPPAD(); 7507 if (last_pushed_cv) { 7508 CHANGE_MULTICALL_FLAGS(newcv, flags); 7509 } 7510 else { 7511 PUSH_MULTICALL_FLAGS(newcv, flags); 7512 } 7513 /* see notes above */ 7514 CX_CUR()->blk_oldsaveix = orig_savestack_ix; 7515 7516 last_pushed_cv = newcv; 7517 } 7518 else { 7519 /* these assignments are just to silence compiler 7520 * warnings */ 7521 multicall_cop = NULL; 7522 } 7523 last_pad = PL_comppad; 7524 7525 /* the initial nextstate you would normally execute 7526 * at the start of an eval (which would cause error 7527 * messages to come from the eval), may be optimised 7528 * away from the execution path in the regex code blocks; 7529 * so manually set PL_curcop to it initially */ 7530 { 7531 OP *o = cUNOPx(nop)->op_first; 7532 assert(o->op_type == OP_NULL); 7533 if (o->op_targ == OP_SCOPE) { 7534 o = cUNOPo->op_first; 7535 } 7536 else { 7537 assert(o->op_targ == OP_LEAVE); 7538 o = cUNOPo->op_first; 7539 assert(o->op_type == OP_ENTER); 7540 o = OpSIBLING(o); 7541 } 7542 7543 if (o->op_type != OP_STUB) { 7544 assert( o->op_type == OP_NEXTSTATE 7545 || o->op_type == OP_DBSTATE 7546 || (o->op_type == OP_NULL 7547 && ( o->op_targ == OP_NEXTSTATE 7548 || o->op_targ == OP_DBSTATE 7549 ) 7550 ) 7551 ); 7552 PL_curcop = (COP*)o; 7553 } 7554 } 7555 nop = nop->op_next; 7556 7557 DEBUG_STATE_r( Perl_re_printf( aTHX_ 7558 " re EVAL PL_op=0x%" UVxf "\n", PTR2UV(nop)) ); 7559 7560 rex->offs[0].end = locinput - reginfo->strbeg; 7561 if (reginfo->info_aux_eval->pos_magic) 7562 MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic, 7563 reginfo->sv, reginfo->strbeg, 7564 locinput - reginfo->strbeg); 7565 7566 if (sv_yes_mark) { 7567 SV *sv_mrk = get_sv("REGMARK", 1); 7568 sv_setsv(sv_mrk, sv_yes_mark); 7569 } 7570 7571 /* we don't use MULTICALL here as we want to call the 7572 * first op of the block of interest, rather than the 7573 * first op of the sub. Also, we don't want to free 7574 * the savestack frame */ 7575 before = (IV)(SP-PL_stack_base); 7576 PL_op = nop; 7577 CALLRUNOPS(aTHX); /* Scalar context. */ 7578 SPAGAIN; 7579 if ((IV)(SP-PL_stack_base) == before) 7580 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */ 7581 else { 7582 ret = POPs; 7583 PUTBACK; 7584 } 7585 7586 /* before restoring everything, evaluate the returned 7587 * value, so that 'uninit' warnings don't use the wrong 7588 * PL_op or pad. Also need to process any magic vars 7589 * (e.g. $1) *before* parentheses are restored */ 7590 7591 PL_op = NULL; 7592 7593 re_sv = NULL; 7594 if (logical == 0) { /* (?{})/ */ 7595 SV *replsv = save_scalar(PL_replgv); 7596 sv_setsv(replsv, ret); /* $^R */ 7597 SvSETMAGIC(replsv); 7598 } 7599 else if (logical == 1) { /* /(?(?{...})X|Y)/ */ 7600 sw = cBOOL(SvTRUE_NN(ret)); 7601 logical = 0; 7602 } 7603 else { /* /(??{}) */ 7604 /* if its overloaded, let the regex compiler handle 7605 * it; otherwise extract regex, or stringify */ 7606 if (SvGMAGICAL(ret)) 7607 ret = sv_mortalcopy(ret); 7608 if (!SvAMAGIC(ret)) { 7609 SV *sv = ret; 7610 if (SvROK(sv)) 7611 sv = SvRV(sv); 7612 if (SvTYPE(sv) == SVt_REGEXP) 7613 re_sv = (REGEXP*) sv; 7614 else if (SvSMAGICAL(ret)) { 7615 MAGIC *mg = mg_find(ret, PERL_MAGIC_qr); 7616 if (mg) 7617 re_sv = (REGEXP *) mg->mg_obj; 7618 } 7619 7620 /* force any undef warnings here */ 7621 if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) { 7622 ret = sv_mortalcopy(ret); 7623 (void) SvPV_force_nolen(ret); 7624 } 7625 } 7626 7627 } 7628 7629 /* *** Note that at this point we don't restore 7630 * PL_comppad, (or pop the CxSUB) on the assumption it may 7631 * be used again soon. This is safe as long as nothing 7632 * in the regexp code uses the pad ! */ 7633 PL_op = oop; 7634 PL_curcop = ocurcop; 7635 regcp_restore(rex, ST.lastcp, &maxopenparen); 7636 PL_curpm_under = PL_curpm; 7637 PL_curpm = PL_reg_curpm; 7638 7639 if (logical != 2) { 7640 PUSH_STATE_GOTO(EVAL_B, next, locinput, loceol, 7641 script_run_begin); 7642 /* NOTREACHED */ 7643 } 7644 } 7645 7646 /* only /(??{})/ from now on */ 7647 logical = 0; 7648 { 7649 /* extract RE object from returned value; compiling if 7650 * necessary */ 7651 7652 if (re_sv) { 7653 re_sv = reg_temp_copy(NULL, re_sv); 7654 } 7655 else { 7656 U32 pm_flags = 0; 7657 7658 if (SvUTF8(ret) && IN_BYTES) { 7659 /* In use 'bytes': make a copy of the octet 7660 * sequence, but without the flag on */ 7661 STRLEN len; 7662 const char *const p = SvPV(ret, len); 7663 ret = newSVpvn_flags(p, len, SVs_TEMP); 7664 } 7665 if (rex->intflags & PREGf_USE_RE_EVAL) 7666 pm_flags |= PMf_USE_RE_EVAL; 7667 7668 /* if we got here, it should be an engine which 7669 * supports compiling code blocks and stuff */ 7670 assert(rex->engine && rex->engine->op_comp); 7671 assert(!(scan->flags & ~RXf_PMf_COMPILETIME)); 7672 re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL, 7673 rex->engine, NULL, NULL, 7674 /* copy /msixn etc to inner pattern */ 7675 ARG2L(scan), 7676 pm_flags); 7677 7678 if (!(SvFLAGS(ret) 7679 & (SVs_TEMP | SVs_GMG | SVf_ROK)) 7680 && (!SvPADTMP(ret) || SvREADONLY(ret))) { 7681 /* This isn't a first class regexp. Instead, it's 7682 caching a regexp onto an existing, Perl visible 7683 scalar. */ 7684 sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0); 7685 } 7686 } 7687 SAVEFREESV(re_sv); 7688 re = ReANY(re_sv); 7689 } 7690 RXp_MATCH_COPIED_off(re); 7691 re->subbeg = rex->subbeg; 7692 re->sublen = rex->sublen; 7693 re->suboffset = rex->suboffset; 7694 re->subcoffset = rex->subcoffset; 7695 re->lastparen = 0; 7696 re->lastcloseparen = 0; 7697 rei = RXi_GET(re); 7698 DEBUG_EXECUTE_r( 7699 debug_start_match(re_sv, utf8_target, locinput, 7700 reginfo->strend, "EVAL/GOSUB: Matching embedded"); 7701 ); 7702 startpoint = rei->program + 1; 7703 EVAL_CLOSE_PAREN_CLEAR(st); /* ST.close_paren = 0; 7704 * close_paren only for GOSUB */ 7705 ST.prev_recurse_locinput= NULL; /* only used for GOSUB */ 7706 /* Save all the seen positions so far. */ 7707 ST.cp = regcppush(rex, 0, maxopenparen); 7708 REGCP_SET(ST.lastcp); 7709 /* and set maxopenparen to 0, since we are starting a "fresh" match */ 7710 maxopenparen = 0; 7711 /* run the pattern returned from (??{...}) */ 7712 7713 eval_recurse_doit: /* Share code with GOSUB below this line 7714 * At this point we expect the stack context to be 7715 * set up correctly */ 7716 7717 /* invalidate the S-L poscache. We're now executing a 7718 * different set of WHILEM ops (and their associated 7719 * indexes) against the same string, so the bits in the 7720 * cache are meaningless. Setting maxiter to zero forces 7721 * the cache to be invalidated and zeroed before reuse. 7722 * XXX This is too dramatic a measure. Ideally we should 7723 * save the old cache and restore when running the outer 7724 * pattern again */ 7725 reginfo->poscache_maxiter = 0; 7726 7727 /* the new regexp might have a different is_utf8_pat than we do */ 7728 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv)); 7729 7730 ST.prev_rex = rex_sv; 7731 ST.prev_curlyx = cur_curlyx; 7732 rex_sv = re_sv; 7733 SET_reg_curpm(rex_sv); 7734 rex = re; 7735 rexi = rei; 7736 cur_curlyx = NULL; 7737 ST.B = next; 7738 ST.prev_eval = cur_eval; 7739 cur_eval = st; 7740 /* now continue from first node in postoned RE */ 7741 PUSH_YES_STATE_GOTO(EVAL_postponed_AB, startpoint, locinput, 7742 loceol, script_run_begin); 7743 NOT_REACHED; /* NOTREACHED */ 7744 } 7745 7746 case EVAL_postponed_AB: /* cleanup after a successful (??{A})B */ 7747 /* note: this is called twice; first after popping B, then A */ 7748 DEBUG_STACK_r({ 7749 Perl_re_exec_indentf( aTHX_ "EVAL_AB cur_eval=%p prev_eval=%p\n", 7750 depth, cur_eval, ST.prev_eval); 7751 }); 7752 7753 #define SET_RECURSE_LOCINPUT(STR,VAL)\ 7754 if ( cur_eval && CUR_EVAL.close_paren ) {\ 7755 DEBUG_STACK_r({ \ 7756 Perl_re_exec_indentf( aTHX_ STR " GOSUB%d ce=%p recurse_locinput=%p\n",\ 7757 depth, \ 7758 CUR_EVAL.close_paren - 1,\ 7759 cur_eval, \ 7760 VAL); \ 7761 }); \ 7762 rex->recurse_locinput[CUR_EVAL.close_paren - 1] = VAL;\ 7763 } 7764 7765 SET_RECURSE_LOCINPUT("EVAL_AB[before]", CUR_EVAL.prev_recurse_locinput); 7766 7767 rex_sv = ST.prev_rex; 7768 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv)); 7769 SET_reg_curpm(rex_sv); 7770 rex = ReANY(rex_sv); 7771 rexi = RXi_GET(rex); 7772 { 7773 /* preserve $^R across LEAVE's. See Bug 121070. */ 7774 SV *save_sv= GvSV(PL_replgv); 7775 SV *replsv; 7776 SvREFCNT_inc(save_sv); 7777 regcpblow(ST.cp); /* LEAVE in disguise */ 7778 /* don't move this initialization up */ 7779 replsv = GvSV(PL_replgv); 7780 sv_setsv(replsv, save_sv); 7781 SvSETMAGIC(replsv); 7782 SvREFCNT_dec(save_sv); 7783 } 7784 cur_eval = ST.prev_eval; 7785 cur_curlyx = ST.prev_curlyx; 7786 7787 /* Invalidate cache. See "invalidate" comment above. */ 7788 reginfo->poscache_maxiter = 0; 7789 if ( nochange_depth ) 7790 nochange_depth--; 7791 7792 SET_RECURSE_LOCINPUT("EVAL_AB[after]", cur_eval->locinput); 7793 sayYES; 7794 7795 7796 case EVAL_B_fail: /* unsuccessful B in (?{...})B */ 7797 REGCP_UNWIND(ST.lastcp); 7798 sayNO; 7799 7800 case EVAL_postponed_AB_fail: /* unsuccessfully ran A or B in (??{A})B */ 7801 /* note: this is called twice; first after popping B, then A */ 7802 DEBUG_STACK_r({ 7803 Perl_re_exec_indentf( aTHX_ "EVAL_AB_fail cur_eval=%p prev_eval=%p\n", 7804 depth, cur_eval, ST.prev_eval); 7805 }); 7806 7807 SET_RECURSE_LOCINPUT("EVAL_AB_fail[before]", CUR_EVAL.prev_recurse_locinput); 7808 7809 rex_sv = ST.prev_rex; 7810 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv)); 7811 SET_reg_curpm(rex_sv); 7812 rex = ReANY(rex_sv); 7813 rexi = RXi_GET(rex); 7814 7815 REGCP_UNWIND(ST.lastcp); 7816 regcppop(rex, &maxopenparen); 7817 cur_eval = ST.prev_eval; 7818 cur_curlyx = ST.prev_curlyx; 7819 7820 /* Invalidate cache. See "invalidate" comment above. */ 7821 reginfo->poscache_maxiter = 0; 7822 if ( nochange_depth ) 7823 nochange_depth--; 7824 7825 SET_RECURSE_LOCINPUT("EVAL_AB_fail[after]", cur_eval->locinput); 7826 sayNO_SILENT; 7827 #undef ST 7828 7829 case OPEN: /* ( */ 7830 n = ARG(scan); /* which paren pair */ 7831 rex->offs[n].start_tmp = locinput - reginfo->strbeg; 7832 if (n > maxopenparen) 7833 maxopenparen = n; 7834 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ 7835 "OPEN: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf " tmp; maxopenparen=%" UVuf "\n", 7836 depth, 7837 PTR2UV(rex), 7838 PTR2UV(rex->offs), 7839 (UV)n, 7840 (IV)rex->offs[n].start_tmp, 7841 (UV)maxopenparen 7842 )); 7843 lastopen = n; 7844 break; 7845 7846 case SROPEN: /* (*SCRIPT_RUN: */ 7847 script_run_begin = (U8 *) locinput; 7848 break; 7849 7850 7851 case CLOSE: /* ) */ 7852 n = ARG(scan); /* which paren pair */ 7853 CLOSE_CAPTURE(n, rex->offs[n].start_tmp, 7854 locinput - reginfo->strbeg); 7855 if ( EVAL_CLOSE_PAREN_IS( cur_eval, n ) ) 7856 goto fake_end; 7857 7858 break; 7859 7860 case SRCLOSE: /* (*SCRIPT_RUN: ... ) */ 7861 7862 if (! isSCRIPT_RUN(script_run_begin, (U8 *) locinput, utf8_target)) 7863 { 7864 sayNO; 7865 } 7866 7867 break; 7868 7869 7870 case ACCEPT: /* (*ACCEPT) */ 7871 if (scan->flags) 7872 sv_yes_mark = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); 7873 if (ARG2L(scan)){ 7874 regnode *cursor; 7875 for (cursor=scan; 7876 cursor && OP(cursor)!=END; 7877 cursor=regnext(cursor)) 7878 { 7879 if ( OP(cursor)==CLOSE ){ 7880 n = ARG(cursor); 7881 if ( n <= lastopen ) { 7882 CLOSE_CAPTURE(n, rex->offs[n].start_tmp, 7883 locinput - reginfo->strbeg); 7884 if ( n == ARG(scan) || EVAL_CLOSE_PAREN_IS(cur_eval, n) ) 7885 break; 7886 } 7887 } 7888 } 7889 } 7890 goto fake_end; 7891 /* NOTREACHED */ 7892 7893 case GROUPP: /* (?(1)) */ 7894 n = ARG(scan); /* which paren pair */ 7895 sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1); 7896 break; 7897 7898 case GROUPPN: /* (?(<name>)) */ 7899 /* reg_check_named_buff_matched returns 0 for no match */ 7900 sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan)); 7901 break; 7902 7903 case INSUBP: /* (?(R)) */ 7904 n = ARG(scan); 7905 /* this does not need to use EVAL_CLOSE_PAREN macros, as the arg 7906 * of SCAN is already set up as matches a eval.close_paren */ 7907 sw = cur_eval && (n == 0 || CUR_EVAL.close_paren == n); 7908 break; 7909 7910 case DEFINEP: /* (?(DEFINE)) */ 7911 sw = 0; 7912 break; 7913 7914 case IFTHEN: /* (?(cond)A|B) */ 7915 reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */ 7916 if (sw) 7917 next = NEXTOPER(NEXTOPER(scan)); 7918 else { 7919 next = scan + ARG(scan); 7920 if (OP(next) == IFTHEN) /* Fake one. */ 7921 next = NEXTOPER(NEXTOPER(next)); 7922 } 7923 break; 7924 7925 case LOGICAL: /* modifier for EVAL and IFMATCH */ 7926 logical = scan->flags; 7927 break; 7928 7929 /******************************************************************* 7930 7931 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/ 7932 pattern, where A and B are subpatterns. (For simple A, CURLYM or 7933 STAR/PLUS/CURLY/CURLYN are used instead.) 7934 7935 A*B is compiled as <CURLYX><A><WHILEM><B> 7936 7937 On entry to the subpattern, CURLYX is called. This pushes a CURLYX 7938 state, which contains the current count, initialised to -1. It also sets 7939 cur_curlyx to point to this state, with any previous value saved in the 7940 state block. 7941 7942 CURLYX then jumps straight to the WHILEM op, rather than executing A, 7943 since the pattern may possibly match zero times (i.e. it's a while {} loop 7944 rather than a do {} while loop). 7945 7946 Each entry to WHILEM represents a successful match of A. The count in the 7947 CURLYX block is incremented, another WHILEM state is pushed, and execution 7948 passes to A or B depending on greediness and the current count. 7949 7950 For example, if matching against the string a1a2a3b (where the aN are 7951 substrings that match /A/), then the match progresses as follows: (the 7952 pushed states are interspersed with the bits of strings matched so far): 7953 7954 <CURLYX cnt=-1> 7955 <CURLYX cnt=0><WHILEM> 7956 <CURLYX cnt=1><WHILEM> a1 <WHILEM> 7957 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM> 7958 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> 7959 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b 7960 7961 (Contrast this with something like CURLYM, which maintains only a single 7962 backtrack state: 7963 7964 <CURLYM cnt=0> a1 7965 a1 <CURLYM cnt=1> a2 7966 a1 a2 <CURLYM cnt=2> a3 7967 a1 a2 a3 <CURLYM cnt=3> b 7968 ) 7969 7970 Each WHILEM state block marks a point to backtrack to upon partial failure 7971 of A or B, and also contains some minor state data related to that 7972 iteration. The CURLYX block, pointed to by cur_curlyx, contains the 7973 overall state, such as the count, and pointers to the A and B ops. 7974 7975 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx 7976 must always point to the *current* CURLYX block, the rules are: 7977 7978 When executing CURLYX, save the old cur_curlyx in the CURLYX state block, 7979 and set cur_curlyx to point the new block. 7980 7981 When popping the CURLYX block after a successful or unsuccessful match, 7982 restore the previous cur_curlyx. 7983 7984 When WHILEM is about to execute B, save the current cur_curlyx, and set it 7985 to the outer one saved in the CURLYX block. 7986 7987 When popping the WHILEM block after a successful or unsuccessful B match, 7988 restore the previous cur_curlyx. 7989 7990 Here's an example for the pattern (AI* BI)*BO 7991 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM: 7992 7993 cur_ 7994 curlyx backtrack stack 7995 ------ --------------- 7996 NULL 7997 CO <CO prev=NULL> <WO> 7998 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai 7999 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi 8000 NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo 8001 8002 At this point the pattern succeeds, and we work back down the stack to 8003 clean up, restoring as we go: 8004 8005 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi 8006 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai 8007 CO <CO prev=NULL> <WO> 8008 NULL 8009 8010 *******************************************************************/ 8011 8012 #define ST st->u.curlyx 8013 8014 case CURLYX: /* start of /A*B/ (for complex A) */ 8015 { 8016 /* No need to save/restore up to this paren */ 8017 I32 parenfloor = scan->flags; 8018 8019 assert(next); /* keep Coverity happy */ 8020 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */ 8021 next += ARG(next); 8022 8023 /* XXXX Probably it is better to teach regpush to support 8024 parenfloor > maxopenparen ... */ 8025 if (parenfloor > (I32)rex->lastparen) 8026 parenfloor = rex->lastparen; /* Pessimization... */ 8027 8028 ST.prev_curlyx= cur_curlyx; 8029 cur_curlyx = st; 8030 ST.cp = PL_savestack_ix; 8031 8032 /* these fields contain the state of the current curly. 8033 * they are accessed by subsequent WHILEMs */ 8034 ST.parenfloor = parenfloor; 8035 ST.me = scan; 8036 ST.B = next; 8037 ST.minmod = minmod; 8038 minmod = 0; 8039 ST.count = -1; /* this will be updated by WHILEM */ 8040 ST.lastloc = NULL; /* this will be updated by WHILEM */ 8041 8042 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput, loceol, 8043 script_run_begin); 8044 NOT_REACHED; /* NOTREACHED */ 8045 } 8046 8047 case CURLYX_end: /* just finished matching all of A*B */ 8048 cur_curlyx = ST.prev_curlyx; 8049 sayYES; 8050 NOT_REACHED; /* NOTREACHED */ 8051 8052 case CURLYX_end_fail: /* just failed to match all of A*B */ 8053 regcpblow(ST.cp); 8054 cur_curlyx = ST.prev_curlyx; 8055 sayNO; 8056 NOT_REACHED; /* NOTREACHED */ 8057 8058 8059 #undef ST 8060 #define ST st->u.whilem 8061 8062 case WHILEM: /* just matched an A in /A*B/ (for complex A) */ 8063 { 8064 /* see the discussion above about CURLYX/WHILEM */ 8065 I32 n; 8066 int min, max; 8067 regnode *A; 8068 8069 assert(cur_curlyx); /* keep Coverity happy */ 8070 8071 min = ARG1(cur_curlyx->u.curlyx.me); 8072 max = ARG2(cur_curlyx->u.curlyx.me); 8073 A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS; 8074 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */ 8075 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc; 8076 ST.cache_offset = 0; 8077 ST.cache_mask = 0; 8078 8079 8080 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_ "WHILEM: matched %ld out of %d..%d\n", 8081 depth, (long)n, min, max) 8082 ); 8083 8084 /* First just match a string of min A's. */ 8085 8086 if (n < min) { 8087 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, maxopenparen); 8088 cur_curlyx->u.curlyx.lastloc = locinput; 8089 REGCP_SET(ST.lastcp); 8090 8091 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput, loceol, 8092 script_run_begin); 8093 NOT_REACHED; /* NOTREACHED */ 8094 } 8095 8096 /* If degenerate A matches "", assume A done. */ 8097 8098 if (locinput == cur_curlyx->u.curlyx.lastloc) { 8099 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_ "WHILEM: empty match detected, trying continuation...\n", 8100 depth) 8101 ); 8102 goto do_whilem_B_max; 8103 } 8104 8105 /* super-linear cache processing. 8106 * 8107 * The idea here is that for certain types of CURLYX/WHILEM - 8108 * principally those whose upper bound is infinity (and 8109 * excluding regexes that have things like \1 and other very 8110 * non-regular expresssiony things), then if a pattern like 8111 * /....A*.../ fails and we backtrack to the WHILEM, then we 8112 * make a note that this particular WHILEM op was at string 8113 * position 47 (say) when the rest of pattern failed. Then, if 8114 * we ever find ourselves back at that WHILEM, and at string 8115 * position 47 again, we can just fail immediately rather than 8116 * running the rest of the pattern again. 8117 * 8118 * This is very handy when patterns start to go 8119 * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up 8120 * with a combinatorial explosion of backtracking. 8121 * 8122 * The cache is implemented as a bit array, with one bit per 8123 * string byte position per WHILEM op (up to 16) - so its 8124 * between 0.25 and 2x the string size. 8125 * 8126 * To avoid allocating a poscache buffer every time, we do an 8127 * initially countdown; only after we have executed a WHILEM 8128 * op (string-length x #WHILEMs) times do we allocate the 8129 * cache. 8130 * 8131 * The top 4 bits of scan->flags byte say how many different 8132 * relevant CURLLYX/WHILEM op pairs there are, while the 8133 * bottom 4-bits is the identifying index number of this 8134 * WHILEM. 8135 */ 8136 8137 if (scan->flags) { 8138 8139 if (!reginfo->poscache_maxiter) { 8140 /* start the countdown: Postpone detection until we 8141 * know the match is not *that* much linear. */ 8142 reginfo->poscache_maxiter 8143 = (reginfo->strend - reginfo->strbeg + 1) 8144 * (scan->flags>>4); 8145 /* possible overflow for long strings and many CURLYX's */ 8146 if (reginfo->poscache_maxiter < 0) 8147 reginfo->poscache_maxiter = I32_MAX; 8148 reginfo->poscache_iter = reginfo->poscache_maxiter; 8149 } 8150 8151 if (reginfo->poscache_iter-- == 0) { 8152 /* initialise cache */ 8153 const SSize_t size = (reginfo->poscache_maxiter + 7)/8; 8154 regmatch_info_aux *const aux = reginfo->info_aux; 8155 if (aux->poscache) { 8156 if ((SSize_t)reginfo->poscache_size < size) { 8157 Renew(aux->poscache, size, char); 8158 reginfo->poscache_size = size; 8159 } 8160 Zero(aux->poscache, size, char); 8161 } 8162 else { 8163 reginfo->poscache_size = size; 8164 Newxz(aux->poscache, size, char); 8165 } 8166 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_ 8167 "%sWHILEM: Detected a super-linear match, switching on caching%s...\n", 8168 PL_colors[4], PL_colors[5]) 8169 ); 8170 } 8171 8172 if (reginfo->poscache_iter < 0) { 8173 /* have we already failed at this position? */ 8174 SSize_t offset, mask; 8175 8176 reginfo->poscache_iter = -1; /* stop eventual underflow */ 8177 offset = (scan->flags & 0xf) - 1 8178 + (locinput - reginfo->strbeg) 8179 * (scan->flags>>4); 8180 mask = 1 << (offset % 8); 8181 offset /= 8; 8182 if (reginfo->info_aux->poscache[offset] & mask) { 8183 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_ "WHILEM: (cache) already tried at this position...\n", 8184 depth) 8185 ); 8186 cur_curlyx->u.curlyx.count--; 8187 sayNO; /* cache records failure */ 8188 } 8189 ST.cache_offset = offset; 8190 ST.cache_mask = mask; 8191 } 8192 } 8193 8194 /* Prefer B over A for minimal matching. */ 8195 8196 if (cur_curlyx->u.curlyx.minmod) { 8197 ST.save_curlyx = cur_curlyx; 8198 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx; 8199 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B, 8200 locinput, loceol, script_run_begin); 8201 NOT_REACHED; /* NOTREACHED */ 8202 } 8203 8204 /* Prefer A over B for maximal matching. */ 8205 8206 if (n < max) { /* More greed allowed? */ 8207 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, 8208 maxopenparen); 8209 cur_curlyx->u.curlyx.lastloc = locinput; 8210 REGCP_SET(ST.lastcp); 8211 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput, loceol, 8212 script_run_begin); 8213 NOT_REACHED; /* NOTREACHED */ 8214 } 8215 goto do_whilem_B_max; 8216 } 8217 NOT_REACHED; /* NOTREACHED */ 8218 8219 case WHILEM_B_min: /* just matched B in a minimal match */ 8220 case WHILEM_B_max: /* just matched B in a maximal match */ 8221 cur_curlyx = ST.save_curlyx; 8222 sayYES; 8223 NOT_REACHED; /* NOTREACHED */ 8224 8225 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */ 8226 cur_curlyx = ST.save_curlyx; 8227 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc; 8228 cur_curlyx->u.curlyx.count--; 8229 CACHEsayNO; 8230 NOT_REACHED; /* NOTREACHED */ 8231 8232 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */ 8233 /* FALLTHROUGH */ 8234 case WHILEM_A_pre_fail: /* just failed to match even minimal A */ 8235 REGCP_UNWIND(ST.lastcp); 8236 regcppop(rex, &maxopenparen); 8237 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc; 8238 cur_curlyx->u.curlyx.count--; 8239 CACHEsayNO; 8240 NOT_REACHED; /* NOTREACHED */ 8241 8242 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */ 8243 REGCP_UNWIND(ST.lastcp); 8244 regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */ 8245 DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_ "WHILEM: failed, trying continuation...\n", 8246 depth) 8247 ); 8248 do_whilem_B_max: 8249 if (cur_curlyx->u.curlyx.count >= REG_INFTY 8250 && ckWARN(WARN_REGEXP) 8251 && !reginfo->warned) 8252 { 8253 reginfo->warned = TRUE; 8254 Perl_warner(aTHX_ packWARN(WARN_REGEXP), 8255 "Complex regular subexpression recursion limit (%d) " 8256 "exceeded", 8257 REG_INFTY - 1); 8258 } 8259 8260 /* now try B */ 8261 ST.save_curlyx = cur_curlyx; 8262 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx; 8263 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B, 8264 locinput, loceol, script_run_begin); 8265 NOT_REACHED; /* NOTREACHED */ 8266 8267 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */ 8268 cur_curlyx = ST.save_curlyx; 8269 8270 if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) { 8271 /* Maximum greed exceeded */ 8272 if (cur_curlyx->u.curlyx.count >= REG_INFTY 8273 && ckWARN(WARN_REGEXP) 8274 && !reginfo->warned) 8275 { 8276 reginfo->warned = TRUE; 8277 Perl_warner(aTHX_ packWARN(WARN_REGEXP), 8278 "Complex regular subexpression recursion " 8279 "limit (%d) exceeded", 8280 REG_INFTY - 1); 8281 } 8282 cur_curlyx->u.curlyx.count--; 8283 CACHEsayNO; 8284 } 8285 8286 DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_ "WHILEM: B min fail: trying longer...\n", depth) 8287 ); 8288 /* Try grabbing another A and see if it helps. */ 8289 cur_curlyx->u.curlyx.lastloc = locinput; 8290 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, 8291 maxopenparen); 8292 REGCP_SET(ST.lastcp); 8293 PUSH_STATE_GOTO(WHILEM_A_min, 8294 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS, 8295 locinput, loceol, script_run_begin); 8296 NOT_REACHED; /* NOTREACHED */ 8297 8298 #undef ST 8299 #define ST st->u.branch 8300 8301 case BRANCHJ: /* /(...|A|...)/ with long next pointer */ 8302 next = scan + ARG(scan); 8303 if (next == scan) 8304 next = NULL; 8305 scan = NEXTOPER(scan); 8306 /* FALLTHROUGH */ 8307 8308 case BRANCH: /* /(...|A|...)/ */ 8309 scan = NEXTOPER(scan); /* scan now points to inner node */ 8310 ST.lastparen = rex->lastparen; 8311 ST.lastcloseparen = rex->lastcloseparen; 8312 ST.next_branch = next; 8313 REGCP_SET(ST.cp); 8314 8315 /* Now go into the branch */ 8316 if (has_cutgroup) { 8317 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput, loceol, 8318 script_run_begin); 8319 } else { 8320 PUSH_STATE_GOTO(BRANCH_next, scan, locinput, loceol, 8321 script_run_begin); 8322 } 8323 NOT_REACHED; /* NOTREACHED */ 8324 8325 case CUTGROUP: /* /(*THEN)/ */ 8326 sv_yes_mark = st->u.mark.mark_name = scan->flags 8327 ? MUTABLE_SV(rexi->data->data[ ARG( scan ) ]) 8328 : NULL; 8329 PUSH_STATE_GOTO(CUTGROUP_next, next, locinput, loceol, 8330 script_run_begin); 8331 NOT_REACHED; /* NOTREACHED */ 8332 8333 case CUTGROUP_next_fail: 8334 do_cutgroup = 1; 8335 no_final = 1; 8336 if (st->u.mark.mark_name) 8337 sv_commit = st->u.mark.mark_name; 8338 sayNO; 8339 NOT_REACHED; /* NOTREACHED */ 8340 8341 case BRANCH_next: 8342 sayYES; 8343 NOT_REACHED; /* NOTREACHED */ 8344 8345 case BRANCH_next_fail: /* that branch failed; try the next, if any */ 8346 if (do_cutgroup) { 8347 do_cutgroup = 0; 8348 no_final = 0; 8349 } 8350 REGCP_UNWIND(ST.cp); 8351 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); 8352 scan = ST.next_branch; 8353 /* no more branches? */ 8354 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) { 8355 DEBUG_EXECUTE_r({ 8356 Perl_re_exec_indentf( aTHX_ "%sBRANCH failed...%s\n", 8357 depth, 8358 PL_colors[4], 8359 PL_colors[5] ); 8360 }); 8361 sayNO_SILENT; 8362 } 8363 continue; /* execute next BRANCH[J] op */ 8364 /* NOTREACHED */ 8365 8366 case MINMOD: /* next op will be non-greedy, e.g. A*? */ 8367 minmod = 1; 8368 break; 8369 8370 #undef ST 8371 #define ST st->u.curlym 8372 8373 case CURLYM: /* /A{m,n}B/ where A is fixed-length */ 8374 8375 /* This is an optimisation of CURLYX that enables us to push 8376 * only a single backtracking state, no matter how many matches 8377 * there are in {m,n}. It relies on the pattern being constant 8378 * length, with no parens to influence future backrefs 8379 */ 8380 8381 ST.me = scan; 8382 scan = NEXTOPER(scan) + NODE_STEP_REGNODE; 8383 8384 ST.lastparen = rex->lastparen; 8385 ST.lastcloseparen = rex->lastcloseparen; 8386 8387 /* if paren positive, emulate an OPEN/CLOSE around A */ 8388 if (ST.me->flags) { 8389 U32 paren = ST.me->flags; 8390 if (paren > maxopenparen) 8391 maxopenparen = paren; 8392 scan += NEXT_OFF(scan); /* Skip former OPEN. */ 8393 } 8394 ST.A = scan; 8395 ST.B = next; 8396 ST.alen = 0; 8397 ST.count = 0; 8398 ST.minmod = minmod; 8399 minmod = 0; 8400 ST.c1 = CHRTEST_UNINIT; 8401 REGCP_SET(ST.cp); 8402 8403 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */ 8404 goto curlym_do_B; 8405 8406 curlym_do_A: /* execute the A in /A{m,n}B/ */ 8407 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput, loceol, /* match A */ 8408 script_run_begin); 8409 NOT_REACHED; /* NOTREACHED */ 8410 8411 case CURLYM_A: /* we've just matched an A */ 8412 ST.count++; 8413 /* after first match, determine A's length: u.curlym.alen */ 8414 if (ST.count == 1) { 8415 if (reginfo->is_utf8_target) { 8416 char *s = st->locinput; 8417 while (s < locinput) { 8418 ST.alen++; 8419 s += UTF8SKIP(s); 8420 } 8421 } 8422 else { 8423 ST.alen = locinput - st->locinput; 8424 } 8425 if (ST.alen == 0) 8426 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me); 8427 } 8428 DEBUG_EXECUTE_r( 8429 Perl_re_exec_indentf( aTHX_ "CURLYM now matched %" IVdf " times, len=%" IVdf "...\n", 8430 depth, (IV) ST.count, (IV)ST.alen) 8431 ); 8432 8433 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags)) 8434 goto fake_end; 8435 8436 { 8437 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me)); 8438 if ( max == REG_INFTY || ST.count < max ) 8439 goto curlym_do_A; /* try to match another A */ 8440 } 8441 goto curlym_do_B; /* try to match B */ 8442 8443 case CURLYM_A_fail: /* just failed to match an A */ 8444 REGCP_UNWIND(ST.cp); 8445 8446 8447 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/ 8448 || EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags)) 8449 sayNO; 8450 8451 curlym_do_B: /* execute the B in /A{m,n}B/ */ 8452 if (ST.c1 == CHRTEST_UNINIT) { 8453 /* calculate c1 and c2 for possible match of 1st char 8454 * following curly */ 8455 ST.c1 = ST.c2 = CHRTEST_VOID; 8456 assert(ST.B); 8457 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) { 8458 regnode *text_node = ST.B; 8459 if (! HAS_TEXT(text_node)) 8460 FIND_NEXT_IMPT(text_node); 8461 if (PL_regkind[OP(text_node)] == EXACT) { 8462 if (! S_setup_EXACTISH_ST_c1_c2(aTHX_ 8463 text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8, 8464 reginfo)) 8465 { 8466 sayNO; 8467 } 8468 } 8469 } 8470 } 8471 8472 DEBUG_EXECUTE_r( 8473 Perl_re_exec_indentf( aTHX_ "CURLYM trying tail with matches=%" IVdf "...\n", 8474 depth, (IV)ST.count) 8475 ); 8476 if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) { 8477 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) { 8478 8479 /* (We can use memEQ and memNE in this file without 8480 * having to worry about one being shorter than the 8481 * other, since the first byte of each gives the 8482 * length of the character) */ 8483 if ( memNE(locinput, ST.c1_utf8, UTF8_SAFE_SKIP(locinput, 8484 reginfo->strend)) 8485 && memNE(locinput, ST.c2_utf8, UTF8_SAFE_SKIP(locinput, 8486 reginfo->strend))) 8487 { 8488 /* simulate B failing */ 8489 DEBUG_OPTIMISE_r( 8490 Perl_re_exec_indentf( aTHX_ "CURLYM Fast bail next target=0x%" UVXf " c1=0x%" UVXf " c2=0x%" UVXf "\n", 8491 depth, 8492 valid_utf8_to_uvchr((U8 *) locinput, NULL), 8493 valid_utf8_to_uvchr(ST.c1_utf8, NULL), 8494 valid_utf8_to_uvchr(ST.c2_utf8, NULL)) 8495 ); 8496 state_num = CURLYM_B_fail; 8497 goto reenter_switch; 8498 } 8499 } 8500 else if (nextchr != ST.c1 && nextchr != ST.c2) { 8501 /* simulate B failing */ 8502 DEBUG_OPTIMISE_r( 8503 Perl_re_exec_indentf( aTHX_ "CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n", 8504 depth, 8505 (int) nextchr, ST.c1, ST.c2) 8506 ); 8507 state_num = CURLYM_B_fail; 8508 goto reenter_switch; 8509 } 8510 } 8511 8512 if (ST.me->flags) { 8513 /* emulate CLOSE: mark current A as captured */ 8514 U32 paren = (U32)ST.me->flags; 8515 if (ST.count) { 8516 CLOSE_CAPTURE(paren, 8517 HOPc(locinput, -ST.alen) - reginfo->strbeg, 8518 locinput - reginfo->strbeg); 8519 } 8520 else 8521 rex->offs[paren].end = -1; 8522 8523 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags)) 8524 { 8525 if (ST.count) 8526 goto fake_end; 8527 else 8528 sayNO; 8529 } 8530 } 8531 8532 PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput, loceol, /* match B */ 8533 script_run_begin); 8534 NOT_REACHED; /* NOTREACHED */ 8535 8536 case CURLYM_B_fail: /* just failed to match a B */ 8537 REGCP_UNWIND(ST.cp); 8538 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); 8539 if (ST.minmod) { 8540 I32 max = ARG2(ST.me); 8541 if (max != REG_INFTY && ST.count == max) 8542 sayNO; 8543 goto curlym_do_A; /* try to match a further A */ 8544 } 8545 /* backtrack one A */ 8546 if (ST.count == ARG1(ST.me) /* min */) 8547 sayNO; 8548 ST.count--; 8549 SET_locinput(HOPc(locinput, -ST.alen)); 8550 goto curlym_do_B; /* try to match B */ 8551 8552 #undef ST 8553 #define ST st->u.curly 8554 8555 #define CURLY_SETPAREN(paren, success) \ 8556 if (paren) { \ 8557 if (success) { \ 8558 CLOSE_CAPTURE(paren, HOPc(locinput, -1) - reginfo->strbeg, \ 8559 locinput - reginfo->strbeg); \ 8560 } \ 8561 else { \ 8562 rex->offs[paren].end = -1; \ 8563 rex->lastparen = ST.lastparen; \ 8564 rex->lastcloseparen = ST.lastcloseparen; \ 8565 } \ 8566 } 8567 8568 case STAR: /* /A*B/ where A is width 1 char */ 8569 ST.paren = 0; 8570 ST.min = 0; 8571 ST.max = REG_INFTY; 8572 scan = NEXTOPER(scan); 8573 goto repeat; 8574 8575 case PLUS: /* /A+B/ where A is width 1 char */ 8576 ST.paren = 0; 8577 ST.min = 1; 8578 ST.max = REG_INFTY; 8579 scan = NEXTOPER(scan); 8580 goto repeat; 8581 8582 case CURLYN: /* /(A){m,n}B/ where A is width 1 char */ 8583 ST.paren = scan->flags; /* Which paren to set */ 8584 ST.lastparen = rex->lastparen; 8585 ST.lastcloseparen = rex->lastcloseparen; 8586 if (ST.paren > maxopenparen) 8587 maxopenparen = ST.paren; 8588 ST.min = ARG1(scan); /* min to match */ 8589 ST.max = ARG2(scan); /* max to match */ 8590 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE); 8591 8592 /* handle the single-char capture called as a GOSUB etc */ 8593 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren)) 8594 { 8595 char *li = locinput; 8596 if (!regrepeat(rex, &li, scan, loceol, reginfo, 1)) 8597 sayNO; 8598 SET_locinput(li); 8599 goto fake_end; 8600 } 8601 8602 goto repeat; 8603 8604 case CURLY: /* /A{m,n}B/ where A is width 1 char */ 8605 ST.paren = 0; 8606 ST.min = ARG1(scan); /* min to match */ 8607 ST.max = ARG2(scan); /* max to match */ 8608 scan = NEXTOPER(scan) + NODE_STEP_REGNODE; 8609 repeat: 8610 /* 8611 * Lookahead to avoid useless match attempts 8612 * when we know what character comes next. 8613 * 8614 * Used to only do .*x and .*?x, but now it allows 8615 * for )'s, ('s and (?{ ... })'s to be in the way 8616 * of the quantifier and the EXACT-like node. -- japhy 8617 */ 8618 8619 assert(ST.min <= ST.max); 8620 if (! HAS_TEXT(next) && ! JUMPABLE(next)) { 8621 ST.c1 = ST.c2 = CHRTEST_VOID; 8622 } 8623 else { 8624 regnode *text_node = next; 8625 8626 if (! HAS_TEXT(text_node)) 8627 FIND_NEXT_IMPT(text_node); 8628 8629 if (! HAS_TEXT(text_node)) 8630 ST.c1 = ST.c2 = CHRTEST_VOID; 8631 else { 8632 if ( PL_regkind[OP(text_node)] != EXACT ) { 8633 ST.c1 = ST.c2 = CHRTEST_VOID; 8634 } 8635 else { 8636 if (! S_setup_EXACTISH_ST_c1_c2(aTHX_ 8637 text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8, 8638 reginfo)) 8639 { 8640 sayNO; 8641 } 8642 } 8643 } 8644 } 8645 8646 ST.A = scan; 8647 ST.B = next; 8648 if (minmod) { 8649 char *li = locinput; 8650 minmod = 0; 8651 if (ST.min && 8652 regrepeat(rex, &li, ST.A, loceol, reginfo, ST.min) 8653 < ST.min) 8654 sayNO; 8655 SET_locinput(li); 8656 ST.count = ST.min; 8657 REGCP_SET(ST.cp); 8658 if (ST.c1 == CHRTEST_VOID) 8659 goto curly_try_B_min; 8660 8661 ST.oldloc = locinput; 8662 8663 /* set ST.maxpos to the furthest point along the 8664 * string that could possibly match */ 8665 if (ST.max == REG_INFTY) { 8666 ST.maxpos = loceol - 1; 8667 if (utf8_target) 8668 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos)) 8669 ST.maxpos--; 8670 } 8671 else if (utf8_target) { 8672 int m = ST.max - ST.min; 8673 for (ST.maxpos = locinput; 8674 m >0 && ST.maxpos < loceol; m--) 8675 ST.maxpos += UTF8SKIP(ST.maxpos); 8676 } 8677 else { 8678 ST.maxpos = locinput + ST.max - ST.min; 8679 if (ST.maxpos >= loceol) 8680 ST.maxpos = loceol - 1; 8681 } 8682 goto curly_try_B_min_known; 8683 8684 } 8685 else { 8686 /* avoid taking address of locinput, so it can remain 8687 * a register var */ 8688 char *li = locinput; 8689 ST.count = regrepeat(rex, &li, ST.A, loceol, reginfo, ST.max); 8690 if (ST.count < ST.min) 8691 sayNO; 8692 SET_locinput(li); 8693 if ((ST.count > ST.min) 8694 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL)) 8695 { 8696 /* A{m,n} must come at the end of the string, there's 8697 * no point in backing off ... */ 8698 ST.min = ST.count; 8699 /* ...except that $ and \Z can match before *and* after 8700 newline at the end. Consider "\n\n" =~ /\n+\Z\n/. 8701 We may back off by one in this case. */ 8702 if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS) 8703 ST.min--; 8704 } 8705 REGCP_SET(ST.cp); 8706 goto curly_try_B_max; 8707 } 8708 NOT_REACHED; /* NOTREACHED */ 8709 8710 case CURLY_B_min_fail: 8711 /* failed to find B in a non-greedy match. 8712 * Handles both cases where c1,c2 valid or not */ 8713 8714 REGCP_UNWIND(ST.cp); 8715 if (ST.paren) { 8716 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); 8717 } 8718 8719 if (ST.c1 == CHRTEST_VOID) { 8720 /* failed -- move forward one */ 8721 char *li = locinput; 8722 if (!regrepeat(rex, &li, ST.A, loceol, reginfo, 1)) { 8723 sayNO; 8724 } 8725 locinput = li; 8726 ST.count++; 8727 if (!( ST.count <= ST.max 8728 /* count overflow ? */ 8729 || (ST.max == REG_INFTY && ST.count > 0)) 8730 ) 8731 sayNO; 8732 } 8733 else { 8734 int n; 8735 /* Couldn't or didn't -- move forward. */ 8736 ST.oldloc = locinput; 8737 if (utf8_target) 8738 locinput += UTF8SKIP(locinput); 8739 else 8740 locinput++; 8741 ST.count++; 8742 8743 curly_try_B_min_known: 8744 /* find the next place where 'B' could work, then call B */ 8745 if (utf8_target) { 8746 n = (ST.oldloc == locinput) ? 0 : 1; 8747 if (ST.c1 == ST.c2) { 8748 /* set n to utf8_distance(oldloc, locinput) */ 8749 while ( locinput <= ST.maxpos 8750 && locinput < loceol 8751 && memNE(locinput, ST.c1_utf8, 8752 UTF8_SAFE_SKIP(locinput, reginfo->strend))) 8753 { 8754 locinput += UTF8_SAFE_SKIP(locinput, 8755 reginfo->strend); 8756 n++; 8757 } 8758 } 8759 else { 8760 /* set n to utf8_distance(oldloc, locinput) */ 8761 while ( locinput <= ST.maxpos 8762 && locinput < loceol 8763 && memNE(locinput, ST.c1_utf8, 8764 UTF8_SAFE_SKIP(locinput, reginfo->strend)) 8765 && memNE(locinput, ST.c2_utf8, 8766 UTF8_SAFE_SKIP(locinput, reginfo->strend))) 8767 { 8768 locinput += UTF8_SAFE_SKIP(locinput, reginfo->strend); 8769 n++; 8770 } 8771 } 8772 } 8773 else { /* Not utf8_target */ 8774 if (ST.c1 == ST.c2) { 8775 locinput = (char *) memchr(locinput, 8776 ST.c1, 8777 ST.maxpos + 1 - locinput); 8778 if (! locinput) { 8779 locinput = ST.maxpos + 1; 8780 } 8781 } 8782 else { 8783 U8 c1_c2_bits_differing = ST.c1 ^ ST.c2; 8784 8785 if (! isPOWER_OF_2(c1_c2_bits_differing)) { 8786 while ( locinput <= ST.maxpos 8787 && UCHARAT(locinput) != ST.c1 8788 && UCHARAT(locinput) != ST.c2) 8789 { 8790 locinput++; 8791 } 8792 } 8793 else { 8794 /* If c1 and c2 only differ by a single bit, we can 8795 * avoid a conditional each time through the loop, 8796 * at the expense of a little preliminary setup and 8797 * an extra mask each iteration. By masking out 8798 * that bit, we match exactly two characters, c1 8799 * and c2, and so we don't have to test for both. 8800 * On both ASCII and EBCDIC platforms, most of the 8801 * ASCII-range and Latin1-range folded equivalents 8802 * differ only in a single bit, so this is actually 8803 * the most common case. (e.g. 'A' 0x41 vs 'a' 8804 * 0x61). */ 8805 U8 c1_masked = ST.c1 &~ c1_c2_bits_differing; 8806 U8 c1_c2_mask = ~ c1_c2_bits_differing; 8807 while ( locinput <= ST.maxpos 8808 && (UCHARAT(locinput) & c1_c2_mask) 8809 != c1_masked) 8810 { 8811 locinput++; 8812 } 8813 } 8814 } 8815 n = locinput - ST.oldloc; 8816 } 8817 if (locinput > ST.maxpos) 8818 sayNO; 8819 if (n) { 8820 /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is 8821 * at b; check that everything between oldloc and 8822 * locinput matches */ 8823 char *li = ST.oldloc; 8824 ST.count += n; 8825 if (regrepeat(rex, &li, ST.A, loceol, reginfo, n) < n) 8826 sayNO; 8827 assert(n == REG_INFTY || locinput == li); 8828 } 8829 } 8830 8831 curly_try_B_min: 8832 CURLY_SETPAREN(ST.paren, ST.count); 8833 PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput, loceol, 8834 script_run_begin); 8835 NOT_REACHED; /* NOTREACHED */ 8836 8837 8838 curly_try_B_max: 8839 /* a successful greedy match: now try to match B */ 8840 { 8841 bool could_match = locinput < loceol; 8842 8843 /* If it could work, try it. */ 8844 if (ST.c1 != CHRTEST_VOID && could_match) { 8845 if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target) 8846 { 8847 could_match = memEQ(locinput, ST.c1_utf8, 8848 UTF8_SAFE_SKIP(locinput, 8849 reginfo->strend)) 8850 || memEQ(locinput, ST.c2_utf8, 8851 UTF8_SAFE_SKIP(locinput, 8852 reginfo->strend)); 8853 } 8854 else { 8855 could_match = UCHARAT(locinput) == ST.c1 8856 || UCHARAT(locinput) == ST.c2; 8857 } 8858 } 8859 if (ST.c1 == CHRTEST_VOID || could_match) { 8860 CURLY_SETPAREN(ST.paren, ST.count); 8861 PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput, loceol, 8862 script_run_begin); 8863 NOT_REACHED; /* NOTREACHED */ 8864 } 8865 } 8866 /* FALLTHROUGH */ 8867 8868 case CURLY_B_max_fail: 8869 /* failed to find B in a greedy match */ 8870 8871 REGCP_UNWIND(ST.cp); 8872 if (ST.paren) { 8873 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); 8874 } 8875 /* back up. */ 8876 if (--ST.count < ST.min) 8877 sayNO; 8878 locinput = HOPc(locinput, -1); 8879 goto curly_try_B_max; 8880 8881 #undef ST 8882 8883 case END: /* last op of main pattern */ 8884 fake_end: 8885 if (cur_eval) { 8886 /* we've just finished A in /(??{A})B/; now continue with B */ 8887 SET_RECURSE_LOCINPUT("FAKE-END[before]", CUR_EVAL.prev_recurse_locinput); 8888 st->u.eval.prev_rex = rex_sv; /* inner */ 8889 8890 /* Save *all* the positions. */ 8891 st->u.eval.cp = regcppush(rex, 0, maxopenparen); 8892 rex_sv = CUR_EVAL.prev_rex; 8893 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv)); 8894 SET_reg_curpm(rex_sv); 8895 rex = ReANY(rex_sv); 8896 rexi = RXi_GET(rex); 8897 8898 st->u.eval.prev_curlyx = cur_curlyx; 8899 cur_curlyx = CUR_EVAL.prev_curlyx; 8900 8901 REGCP_SET(st->u.eval.lastcp); 8902 8903 /* Restore parens of the outer rex without popping the 8904 * savestack */ 8905 regcp_restore(rex, CUR_EVAL.lastcp, &maxopenparen); 8906 8907 st->u.eval.prev_eval = cur_eval; 8908 cur_eval = CUR_EVAL.prev_eval; 8909 DEBUG_EXECUTE_r( 8910 Perl_re_exec_indentf( aTHX_ "END: EVAL trying tail ... (cur_eval=%p)\n", 8911 depth, cur_eval);); 8912 if ( nochange_depth ) 8913 nochange_depth--; 8914 8915 SET_RECURSE_LOCINPUT("FAKE-END[after]", cur_eval->locinput); 8916 8917 PUSH_YES_STATE_GOTO(EVAL_postponed_AB, /* match B */ 8918 st->u.eval.prev_eval->u.eval.B, 8919 locinput, loceol, script_run_begin); 8920 } 8921 8922 if (locinput < reginfo->till) { 8923 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 8924 "%sEND: Match possible, but length=%ld is smaller than requested=%ld, failing!%s\n", 8925 PL_colors[4], 8926 (long)(locinput - startpos), 8927 (long)(reginfo->till - startpos), 8928 PL_colors[5])); 8929 8930 sayNO_SILENT; /* Cannot match: too short. */ 8931 } 8932 sayYES; /* Success! */ 8933 8934 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */ 8935 DEBUG_EXECUTE_r( 8936 Perl_re_exec_indentf( aTHX_ "%sSUCCEED: subpattern success...%s\n", 8937 depth, PL_colors[4], PL_colors[5])); 8938 sayYES; /* Success! */ 8939 8940 #undef ST 8941 #define ST st->u.ifmatch 8942 8943 case SUSPEND: /* (?>A) */ 8944 ST.wanted = 1; 8945 ST.start = locinput; 8946 ST.end = loceol; 8947 ST.count = 1; 8948 goto do_ifmatch; 8949 8950 case UNLESSM: /* -ve lookaround: (?!A), or with 'flags', (?<!A) */ 8951 ST.wanted = 0; 8952 goto ifmatch_trivial_fail_test; 8953 8954 case IFMATCH: /* +ve lookaround: (?=A), or with 'flags', (?<=A) */ 8955 ST.wanted = 1; 8956 ifmatch_trivial_fail_test: 8957 ST.count = scan->next_off + 1; /* next_off repurposed to be 8958 lookbehind count, requires 8959 non-zero flags */ 8960 if (! scan->flags) { /* 'flags' zero means lookahed */ 8961 8962 /* Lookahead starts here and ends at the normal place */ 8963 ST.start = locinput; 8964 ST.end = loceol; 8965 } 8966 else { 8967 PERL_UINT_FAST8_T back_count = scan->flags; 8968 char * s; 8969 8970 /* Lookbehind can look beyond the current position */ 8971 ST.end = loceol; 8972 8973 /* ... and starts at the first place in the input that is in 8974 * the range of the possible start positions */ 8975 for (; ST.count > 0; ST.count--, back_count--) { 8976 s = HOPBACKc(locinput, back_count); 8977 if (s) { 8978 ST.start = s; 8979 goto do_ifmatch; 8980 } 8981 } 8982 8983 /* If the lookbehind doesn't start in the actual string, is a 8984 * trivial match failure */ 8985 if (logical) { 8986 logical = 0; 8987 sw = 1 - cBOOL(ST.wanted); 8988 } 8989 else if (ST.wanted) 8990 sayNO; 8991 8992 /* Here, we didn't want it to match, so is actually success */ 8993 next = scan + ARG(scan); 8994 if (next == scan) 8995 next = NULL; 8996 break; 8997 } 8998 8999 do_ifmatch: 9000 ST.me = scan; 9001 ST.logical = logical; 9002 logical = 0; /* XXX: reset state of logical once it has been saved into ST */ 9003 9004 /* execute body of (?...A) */ 9005 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), ST.start, 9006 ST.end, script_run_begin); 9007 NOT_REACHED; /* NOTREACHED */ 9008 9009 { 9010 bool matched; 9011 9012 case IFMATCH_A_fail: /* body of (?...A) failed */ 9013 if (! ST.logical && ST.count > 1) { 9014 9015 /* It isn't a real failure until we've tried all starting 9016 * positions. Move to the next starting position and retry */ 9017 ST.count--; 9018 ST.start = HOPc(ST.start, 1); 9019 scan = ST.me; 9020 logical = ST.logical; 9021 goto do_ifmatch; 9022 } 9023 9024 /* Here, all starting positions have been tried. */ 9025 matched = FALSE; 9026 goto ifmatch_done; 9027 9028 case IFMATCH_A: /* body of (?...A) succeeded */ 9029 matched = TRUE; 9030 ifmatch_done: 9031 sw = matched == ST.wanted; 9032 if (! ST.logical && !sw) { 9033 sayNO; 9034 } 9035 9036 if (OP(ST.me) != SUSPEND) { 9037 /* restore old position except for (?>...) */ 9038 locinput = st->locinput; 9039 loceol = st->loceol; 9040 script_run_begin = st->sr0; 9041 } 9042 scan = ST.me + ARG(ST.me); 9043 if (scan == ST.me) 9044 scan = NULL; 9045 continue; /* execute B */ 9046 } 9047 9048 #undef ST 9049 9050 case LONGJMP: /* alternative with many branches compiles to 9051 * (BRANCHJ; EXACT ...; LONGJMP ) x N */ 9052 next = scan + ARG(scan); 9053 if (next == scan) 9054 next = NULL; 9055 break; 9056 9057 case COMMIT: /* (*COMMIT) */ 9058 reginfo->cutpoint = loceol; 9059 /* FALLTHROUGH */ 9060 9061 case PRUNE: /* (*PRUNE) */ 9062 if (scan->flags) 9063 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); 9064 PUSH_STATE_GOTO(COMMIT_next, next, locinput, loceol, 9065 script_run_begin); 9066 NOT_REACHED; /* NOTREACHED */ 9067 9068 case COMMIT_next_fail: 9069 no_final = 1; 9070 /* FALLTHROUGH */ 9071 sayNO; 9072 NOT_REACHED; /* NOTREACHED */ 9073 9074 case OPFAIL: /* (*FAIL) */ 9075 if (scan->flags) 9076 sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); 9077 if (logical) { 9078 /* deal with (?(?!)X|Y) properly, 9079 * make sure we trigger the no branch 9080 * of the trailing IFTHEN structure*/ 9081 sw= 0; 9082 break; 9083 } else { 9084 sayNO; 9085 } 9086 NOT_REACHED; /* NOTREACHED */ 9087 9088 #define ST st->u.mark 9089 case MARKPOINT: /* (*MARK:foo) */ 9090 ST.prev_mark = mark_state; 9091 ST.mark_name = sv_commit = sv_yes_mark 9092 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); 9093 mark_state = st; 9094 ST.mark_loc = locinput; 9095 PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput, loceol, 9096 script_run_begin); 9097 NOT_REACHED; /* NOTREACHED */ 9098 9099 case MARKPOINT_next: 9100 mark_state = ST.prev_mark; 9101 sayYES; 9102 NOT_REACHED; /* NOTREACHED */ 9103 9104 case MARKPOINT_next_fail: 9105 if (popmark && sv_eq(ST.mark_name,popmark)) 9106 { 9107 if (ST.mark_loc > startpoint) 9108 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1); 9109 popmark = NULL; /* we found our mark */ 9110 sv_commit = ST.mark_name; 9111 9112 DEBUG_EXECUTE_r({ 9113 Perl_re_exec_indentf( aTHX_ "%sMARKPOINT: next fail: setting cutpoint to mark:%" SVf "...%s\n", 9114 depth, 9115 PL_colors[4], SVfARG(sv_commit), PL_colors[5]); 9116 }); 9117 } 9118 mark_state = ST.prev_mark; 9119 sv_yes_mark = mark_state ? 9120 mark_state->u.mark.mark_name : NULL; 9121 sayNO; 9122 NOT_REACHED; /* NOTREACHED */ 9123 9124 case SKIP: /* (*SKIP) */ 9125 if (!scan->flags) { 9126 /* (*SKIP) : if we fail we cut here*/ 9127 ST.mark_name = NULL; 9128 ST.mark_loc = locinput; 9129 PUSH_STATE_GOTO(SKIP_next,next, locinput, loceol, 9130 script_run_begin); 9131 } else { 9132 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was, 9133 otherwise do nothing. Meaning we need to scan 9134 */ 9135 regmatch_state *cur = mark_state; 9136 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); 9137 9138 while (cur) { 9139 if ( sv_eq( cur->u.mark.mark_name, 9140 find ) ) 9141 { 9142 ST.mark_name = find; 9143 PUSH_STATE_GOTO( SKIP_next, next, locinput, loceol, 9144 script_run_begin); 9145 } 9146 cur = cur->u.mark.prev_mark; 9147 } 9148 } 9149 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */ 9150 break; 9151 9152 case SKIP_next_fail: 9153 if (ST.mark_name) { 9154 /* (*CUT:NAME) - Set up to search for the name as we 9155 collapse the stack*/ 9156 popmark = ST.mark_name; 9157 } else { 9158 /* (*CUT) - No name, we cut here.*/ 9159 if (ST.mark_loc > startpoint) 9160 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1); 9161 /* but we set sv_commit to latest mark_name if there 9162 is one so they can test to see how things lead to this 9163 cut */ 9164 if (mark_state) 9165 sv_commit=mark_state->u.mark.mark_name; 9166 } 9167 no_final = 1; 9168 sayNO; 9169 NOT_REACHED; /* NOTREACHED */ 9170 #undef ST 9171 9172 case LNBREAK: /* \R */ 9173 if ((n=is_LNBREAK_safe(locinput, loceol, utf8_target))) { 9174 locinput += n; 9175 } else 9176 sayNO; 9177 break; 9178 9179 default: 9180 PerlIO_printf(Perl_error_log, "%" UVxf " %d\n", 9181 PTR2UV(scan), OP(scan)); 9182 Perl_croak(aTHX_ "regexp memory corruption"); 9183 9184 /* this is a point to jump to in order to increment 9185 * locinput by one character */ 9186 increment_locinput: 9187 assert(!NEXTCHR_IS_EOS); 9188 if (utf8_target) { 9189 locinput += PL_utf8skip[nextchr]; 9190 /* locinput is allowed to go 1 char off the end (signifying 9191 * EOS), but not 2+ */ 9192 if (locinput > loceol) 9193 sayNO; 9194 } 9195 else 9196 locinput++; 9197 break; 9198 9199 } /* end switch */ 9200 9201 /* switch break jumps here */ 9202 scan = next; /* prepare to execute the next op and ... */ 9203 continue; /* ... jump back to the top, reusing st */ 9204 /* NOTREACHED */ 9205 9206 push_yes_state: 9207 /* push a state that backtracks on success */ 9208 st->u.yes.prev_yes_state = yes_state; 9209 yes_state = st; 9210 /* FALLTHROUGH */ 9211 push_state: 9212 /* push a new regex state, then continue at scan */ 9213 { 9214 regmatch_state *newst; 9215 DECLARE_AND_GET_RE_DEBUG_FLAGS; 9216 9217 DEBUG_r( /* DEBUG_STACK_r */ 9218 if (DEBUG_v_TEST || RE_DEBUG_FLAG(RE_DEBUG_EXTRA_STACK)) { 9219 regmatch_state *cur = st; 9220 regmatch_state *curyes = yes_state; 9221 U32 i; 9222 regmatch_slab *slab = PL_regmatch_slab; 9223 for (i = 0; i < 3 && i <= depth; cur--,i++) { 9224 if (cur < SLAB_FIRST(slab)) { 9225 slab = slab->prev; 9226 cur = SLAB_LAST(slab); 9227 } 9228 Perl_re_exec_indentf( aTHX_ "%4s #%-3d %-10s %s\n", 9229 depth, 9230 i ? " " : "push", 9231 depth - i, PL_reg_name[cur->resume_state], 9232 (curyes == cur) ? "yes" : "" 9233 ); 9234 if (curyes == cur) 9235 curyes = cur->u.yes.prev_yes_state; 9236 } 9237 } else { 9238 DEBUG_STATE_pp("push") 9239 }); 9240 depth++; 9241 st->locinput = locinput; 9242 st->loceol = loceol; 9243 st->sr0 = script_run_begin; 9244 newst = st+1; 9245 if (newst > SLAB_LAST(PL_regmatch_slab)) 9246 newst = S_push_slab(aTHX); 9247 PL_regmatch_state = newst; 9248 9249 locinput = pushinput; 9250 loceol = pusheol; 9251 script_run_begin = pushsr0; 9252 st = newst; 9253 continue; 9254 /* NOTREACHED */ 9255 } 9256 } 9257 #ifdef SOLARIS_BAD_OPTIMIZER 9258 # undef PL_charclass 9259 #endif 9260 9261 /* 9262 * We get here only if there's trouble -- normally "case END" is 9263 * the terminating point. 9264 */ 9265 Perl_croak(aTHX_ "corrupted regexp pointers"); 9266 NOT_REACHED; /* NOTREACHED */ 9267 9268 yes: 9269 if (yes_state) { 9270 /* we have successfully completed a subexpression, but we must now 9271 * pop to the state marked by yes_state and continue from there */ 9272 assert(st != yes_state); 9273 #ifdef DEBUGGING 9274 while (st != yes_state) { 9275 st--; 9276 if (st < SLAB_FIRST(PL_regmatch_slab)) { 9277 PL_regmatch_slab = PL_regmatch_slab->prev; 9278 st = SLAB_LAST(PL_regmatch_slab); 9279 } 9280 DEBUG_STATE_r({ 9281 if (no_final) { 9282 DEBUG_STATE_pp("pop (no final)"); 9283 } else { 9284 DEBUG_STATE_pp("pop (yes)"); 9285 } 9286 }); 9287 depth--; 9288 } 9289 #else 9290 while (yes_state < SLAB_FIRST(PL_regmatch_slab) 9291 || yes_state > SLAB_LAST(PL_regmatch_slab)) 9292 { 9293 /* not in this slab, pop slab */ 9294 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1); 9295 PL_regmatch_slab = PL_regmatch_slab->prev; 9296 st = SLAB_LAST(PL_regmatch_slab); 9297 } 9298 depth -= (st - yes_state); 9299 #endif 9300 st = yes_state; 9301 yes_state = st->u.yes.prev_yes_state; 9302 PL_regmatch_state = st; 9303 9304 if (no_final) { 9305 locinput= st->locinput; 9306 loceol= st->loceol; 9307 script_run_begin = st->sr0; 9308 } 9309 state_num = st->resume_state + no_final; 9310 goto reenter_switch; 9311 } 9312 9313 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%sMatch successful!%s\n", 9314 PL_colors[4], PL_colors[5])); 9315 9316 if (reginfo->info_aux_eval) { 9317 /* each successfully executed (?{...}) block does the equivalent of 9318 * local $^R = do {...} 9319 * When popping the save stack, all these locals would be undone; 9320 * bypass this by setting the outermost saved $^R to the latest 9321 * value */ 9322 /* I dont know if this is needed or works properly now. 9323 * see code related to PL_replgv elsewhere in this file. 9324 * Yves 9325 */ 9326 if (oreplsv != GvSV(PL_replgv)) { 9327 sv_setsv(oreplsv, GvSV(PL_replgv)); 9328 SvSETMAGIC(oreplsv); 9329 } 9330 } 9331 result = 1; 9332 goto final_exit; 9333 9334 no: 9335 DEBUG_EXECUTE_r( 9336 Perl_re_exec_indentf( aTHX_ "%sfailed...%s\n", 9337 depth, 9338 PL_colors[4], PL_colors[5]) 9339 ); 9340 9341 no_silent: 9342 if (no_final) { 9343 if (yes_state) { 9344 goto yes; 9345 } else { 9346 goto final_exit; 9347 } 9348 } 9349 if (depth) { 9350 /* there's a previous state to backtrack to */ 9351 st--; 9352 if (st < SLAB_FIRST(PL_regmatch_slab)) { 9353 PL_regmatch_slab = PL_regmatch_slab->prev; 9354 st = SLAB_LAST(PL_regmatch_slab); 9355 } 9356 PL_regmatch_state = st; 9357 locinput= st->locinput; 9358 loceol= st->loceol; 9359 script_run_begin = st->sr0; 9360 9361 DEBUG_STATE_pp("pop"); 9362 depth--; 9363 if (yes_state == st) 9364 yes_state = st->u.yes.prev_yes_state; 9365 9366 state_num = st->resume_state + 1; /* failure = success + 1 */ 9367 PERL_ASYNC_CHECK(); 9368 goto reenter_switch; 9369 } 9370 result = 0; 9371 9372 final_exit: 9373 if (rex->intflags & PREGf_VERBARG_SEEN) { 9374 SV *sv_err = get_sv("REGERROR", 1); 9375 SV *sv_mrk = get_sv("REGMARK", 1); 9376 if (result) { 9377 sv_commit = &PL_sv_no; 9378 if (!sv_yes_mark) 9379 sv_yes_mark = &PL_sv_yes; 9380 } else { 9381 if (!sv_commit) 9382 sv_commit = &PL_sv_yes; 9383 sv_yes_mark = &PL_sv_no; 9384 } 9385 assert(sv_err); 9386 assert(sv_mrk); 9387 sv_setsv(sv_err, sv_commit); 9388 sv_setsv(sv_mrk, sv_yes_mark); 9389 } 9390 9391 9392 if (last_pushed_cv) { 9393 dSP; 9394 /* see "Some notes about MULTICALL" above */ 9395 POP_MULTICALL; 9396 PERL_UNUSED_VAR(SP); 9397 } 9398 else 9399 LEAVE_SCOPE(orig_savestack_ix); 9400 9401 assert(!result || locinput - reginfo->strbeg >= 0); 9402 return result ? locinput - reginfo->strbeg : -1; 9403 } 9404 9405 /* 9406 - regrepeat - repeatedly match something simple, report how many 9407 * 9408 * What 'simple' means is a node which can be the operand of a quantifier like 9409 * '+', or {1,3} 9410 * 9411 * startposp - pointer to a pointer to the start position. This is updated 9412 * to point to the byte following the highest successful 9413 * match. 9414 * p - the regnode to be repeatedly matched against. 9415 * loceol - pointer to the end position beyond which we aren't supposed to 9416 * look. 9417 * reginfo - struct holding match state, such as utf8_target 9418 * max - maximum number of things to match. 9419 * depth - (for debugging) backtracking depth. 9420 */ 9421 STATIC I32 9422 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, 9423 char * loceol, regmatch_info *const reginfo, I32 max _pDEPTH) 9424 { 9425 dVAR; 9426 char *scan; /* Pointer to current position in target string */ 9427 I32 c; 9428 char *this_eol = loceol; /* potentially adjusted version. */ 9429 I32 hardcount = 0; /* How many matches so far */ 9430 bool utf8_target = reginfo->is_utf8_target; 9431 unsigned int to_complement = 0; /* Invert the result? */ 9432 UV utf8_flags = 0; 9433 _char_class_number classnum; 9434 9435 PERL_ARGS_ASSERT_REGREPEAT; 9436 9437 /* This routine is structured so that we switch on the input OP. Each OP 9438 * case: statement contains a loop to repeatedly apply the OP, advancing 9439 * the input until it fails, or reaches the end of the input, or until it 9440 * reaches the upper limit of matches. */ 9441 9442 scan = *startposp; 9443 if (max == REG_INFTY) /* This is a special marker to go to the platform's 9444 max */ 9445 max = I32_MAX; 9446 else if (! utf8_target && this_eol - scan > max) 9447 this_eol = scan + max; 9448 9449 /* Here, for the case of a non-UTF-8 target we have adjusted <this_eol> down 9450 * to the maximum of how far we should go in it (leaving it set to the real 9451 * end, if the maximum permissible would take us beyond that). This allows 9452 * us to make the loop exit condition that we haven't gone past <this_eol> to 9453 * also mean that we haven't exceeded the max permissible count, saving a 9454 * test each time through the loops. But it assumes that the OP matches a 9455 * single byte, which is true for most of the OPs below when applied to a 9456 * non-UTF-8 target. Those relatively few OPs that don't have this 9457 * characteristic will have to compensate. 9458 * 9459 * There is no adjustment for UTF-8 targets, as the number of bytes per 9460 * character varies. OPs will have to test both that the count is less 9461 * than the max permissible (using <hardcount> to keep track), and that we 9462 * are still within the bounds of the string (using <this_eol>. A few OPs 9463 * match a single byte no matter what the encoding. They can omit the max 9464 * test if, for the UTF-8 case, they do the adjustment that was skipped 9465 * above. 9466 * 9467 * Thus, the code above sets things up for the common case; and exceptional 9468 * cases need extra work; the common case is to make sure <scan> doesn't 9469 * go past <this_eol>, and for UTF-8 to also use <hardcount> to make sure the 9470 * count doesn't exceed the maximum permissible */ 9471 9472 switch (OP(p)) { 9473 case REG_ANY: 9474 if (utf8_target) { 9475 while (scan < this_eol && hardcount < max && *scan != '\n') { 9476 scan += UTF8SKIP(scan); 9477 hardcount++; 9478 } 9479 } else { 9480 scan = (char *) memchr(scan, '\n', this_eol - scan); 9481 if (! scan) { 9482 scan = this_eol; 9483 } 9484 } 9485 break; 9486 case SANY: 9487 if (utf8_target) { 9488 while (scan < this_eol && hardcount < max) { 9489 scan += UTF8SKIP(scan); 9490 hardcount++; 9491 } 9492 } 9493 else 9494 scan = this_eol; 9495 break; 9496 9497 case LEXACT_REQ8: 9498 if (! utf8_target) { 9499 break; 9500 } 9501 /* FALLTHROUGH */ 9502 9503 case LEXACT: 9504 { 9505 U8 * string; 9506 Size_t str_len; 9507 9508 string = (U8 *) STRINGl(p); 9509 str_len = STR_LENl(p); 9510 goto join_short_long_exact; 9511 9512 case EXACTL: 9513 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; 9514 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*scan)) { 9515 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(scan, loceol); 9516 } 9517 goto do_exact; 9518 9519 case EXACT_REQ8: 9520 if (! utf8_target) { 9521 break; 9522 } 9523 /* FALLTHROUGH */ 9524 case EXACT: 9525 do_exact: 9526 string = (U8 *) STRINGs(p); 9527 str_len = STR_LENs(p); 9528 9529 join_short_long_exact: 9530 assert(str_len == reginfo->is_utf8_pat ? UTF8SKIP(string) : 1); 9531 9532 c = *string; 9533 9534 /* Can use a simple find if the pattern char to match on is invariant 9535 * under UTF-8, or both target and pattern aren't UTF-8. Note that we 9536 * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's 9537 * true iff it doesn't matter if the argument is in UTF-8 or not */ 9538 if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) { 9539 if (utf8_target && this_eol - scan > max) { 9540 /* We didn't adjust <this_eol> because is UTF-8, but ok to do so, 9541 * since here, to match at all, 1 char == 1 byte */ 9542 this_eol = scan + max; 9543 } 9544 scan = (char *) find_span_end((U8 *) scan, (U8 *) this_eol, (U8) c); 9545 } 9546 else if (reginfo->is_utf8_pat) { 9547 if (utf8_target) { 9548 STRLEN scan_char_len; 9549 9550 /* When both target and pattern are UTF-8, we have to do 9551 * string EQ */ 9552 while (hardcount < max 9553 && scan < this_eol 9554 && (scan_char_len = UTF8SKIP(scan)) <= str_len 9555 && memEQ(scan, string, scan_char_len)) 9556 { 9557 scan += scan_char_len; 9558 hardcount++; 9559 } 9560 } 9561 else if (! UTF8_IS_ABOVE_LATIN1(c)) { 9562 9563 /* Target isn't utf8; convert the character in the UTF-8 9564 * pattern to non-UTF8, and do a simple find */ 9565 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *(string + 1)); 9566 scan = (char *) find_span_end((U8 *) scan, (U8 *) this_eol, (U8) c); 9567 } /* else pattern char is above Latin1, can't possibly match the 9568 non-UTF-8 target */ 9569 } 9570 else { 9571 9572 /* Here, the string must be utf8; pattern isn't, and <c> is 9573 * different in utf8 than not, so can't compare them directly. 9574 * Outside the loop, find the two utf8 bytes that represent c, and 9575 * then look for those in sequence in the utf8 string */ 9576 U8 high = UTF8_TWO_BYTE_HI(c); 9577 U8 low = UTF8_TWO_BYTE_LO(c); 9578 9579 while (hardcount < max 9580 && scan + 1 < this_eol 9581 && UCHARAT(scan) == high 9582 && UCHARAT(scan + 1) == low) 9583 { 9584 scan += 2; 9585 hardcount++; 9586 } 9587 } 9588 break; 9589 } 9590 9591 case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8 patterns */ 9592 assert(! reginfo->is_utf8_pat); 9593 /* FALLTHROUGH */ 9594 case EXACTFAA: 9595 utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII; 9596 if (reginfo->is_utf8_pat || ! utf8_target) { 9597 9598 /* The possible presence of a MICRO SIGN in the pattern forbids us 9599 * to view a non-UTF-8 pattern as folded when there is a UTF-8 9600 * target. */ 9601 utf8_flags |= FOLDEQ_S2_ALREADY_FOLDED|FOLDEQ_S2_FOLDS_SANE; 9602 } 9603 goto do_exactf; 9604 9605 case EXACTFL: 9606 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; 9607 utf8_flags = FOLDEQ_LOCALE; 9608 goto do_exactf; 9609 9610 case EXACTF: /* This node only generated for non-utf8 patterns */ 9611 assert(! reginfo->is_utf8_pat); 9612 goto do_exactf; 9613 9614 case EXACTFLU8: 9615 if (! utf8_target) { 9616 break; 9617 } 9618 utf8_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED 9619 | FOLDEQ_S2_FOLDS_SANE; 9620 goto do_exactf; 9621 9622 case EXACTFU_REQ8: 9623 if (! utf8_target) { 9624 break; 9625 } 9626 assert(reginfo->is_utf8_pat); 9627 utf8_flags = FOLDEQ_S2_ALREADY_FOLDED; 9628 goto do_exactf; 9629 9630 case EXACTFU: 9631 utf8_flags = FOLDEQ_S2_ALREADY_FOLDED; 9632 /* FALLTHROUGH */ 9633 9634 case EXACTFUP: 9635 9636 do_exactf: { 9637 int c1, c2; 9638 U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1]; 9639 9640 assert(STR_LENs(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRINGs(p)) : 1); 9641 9642 if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8, 9643 reginfo)) 9644 { 9645 if (c1 == CHRTEST_VOID) { 9646 /* Use full Unicode fold matching */ 9647 char *tmpeol = loceol; 9648 STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRINGs(p)) : 1; 9649 while (hardcount < max 9650 && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target, 9651 STRINGs(p), NULL, pat_len, 9652 reginfo->is_utf8_pat, utf8_flags)) 9653 { 9654 scan = tmpeol; 9655 tmpeol = loceol; 9656 hardcount++; 9657 } 9658 } 9659 else if (utf8_target) { 9660 if (c1 == c2) { 9661 while (scan < this_eol 9662 && hardcount < max 9663 && memEQ(scan, c1_utf8, UTF8_SAFE_SKIP(scan, 9664 loceol))) 9665 { 9666 scan += UTF8SKIP(c1_utf8); 9667 hardcount++; 9668 } 9669 } 9670 else { 9671 while (scan < this_eol 9672 && hardcount < max 9673 && ( memEQ(scan, c1_utf8, UTF8_SAFE_SKIP(scan, 9674 loceol)) 9675 || memEQ(scan, c2_utf8, UTF8_SAFE_SKIP(scan, 9676 loceol)))) 9677 { 9678 scan += UTF8_SAFE_SKIP(scan, loceol); 9679 hardcount++; 9680 } 9681 } 9682 } 9683 else if (c1 == c2) { 9684 scan = (char *) find_span_end((U8 *) scan, (U8 *) this_eol, (U8) c1); 9685 } 9686 else { 9687 /* See comments in regmatch() CURLY_B_min_known_fail. We avoid 9688 * a conditional each time through the loop if the characters 9689 * differ only in a single bit, as is the usual situation */ 9690 U8 c1_c2_bits_differing = c1 ^ c2; 9691 9692 if (isPOWER_OF_2(c1_c2_bits_differing)) { 9693 U8 c1_c2_mask = ~ c1_c2_bits_differing; 9694 9695 scan = (char *) find_span_end_mask((U8 *) scan, 9696 (U8 *) this_eol, 9697 c1 & c1_c2_mask, 9698 c1_c2_mask); 9699 } 9700 else { 9701 while ( scan < this_eol 9702 && (UCHARAT(scan) == c1 || UCHARAT(scan) == c2)) 9703 { 9704 scan++; 9705 } 9706 } 9707 } 9708 } 9709 break; 9710 } 9711 case ANYOFPOSIXL: 9712 case ANYOFL: 9713 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; 9714 9715 if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(p)) && ! IN_UTF8_CTYPE_LOCALE) { 9716 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required); 9717 } 9718 /* FALLTHROUGH */ 9719 case ANYOFD: 9720 case ANYOF: 9721 if (utf8_target) { 9722 while (hardcount < max 9723 && scan < this_eol 9724 && reginclass(prog, p, (U8*)scan, (U8*) this_eol, utf8_target)) 9725 { 9726 scan += UTF8SKIP(scan); 9727 hardcount++; 9728 } 9729 } 9730 else if (ANYOF_FLAGS(p) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP) { 9731 while (scan < this_eol 9732 && reginclass(prog, p, (U8*)scan, (U8*)scan+1, 0)) 9733 scan++; 9734 } 9735 else { 9736 while (scan < this_eol && ANYOF_BITMAP_TEST(p, *((U8*)scan))) 9737 scan++; 9738 } 9739 break; 9740 9741 case ANYOFM: 9742 if (utf8_target && this_eol - scan > max) { 9743 9744 /* We didn't adjust <this_eol> at the beginning of this routine 9745 * because is UTF-8, but it is actually ok to do so, since here, to 9746 * match, 1 char == 1 byte. */ 9747 this_eol = scan + max; 9748 } 9749 9750 scan = (char *) find_span_end_mask((U8 *) scan, (U8 *) this_eol, (U8) ARG(p), FLAGS(p)); 9751 break; 9752 9753 case NANYOFM: 9754 if (utf8_target) { 9755 while ( hardcount < max 9756 && scan < this_eol 9757 && (*scan & FLAGS(p)) != ARG(p)) 9758 { 9759 scan += UTF8SKIP(scan); 9760 hardcount++; 9761 } 9762 } 9763 else { 9764 scan = (char *) find_next_masked((U8 *) scan, (U8 *) this_eol, (U8) ARG(p), FLAGS(p)); 9765 } 9766 break; 9767 9768 case ANYOFH: 9769 if (utf8_target) { /* ANYOFH only can match UTF-8 targets */ 9770 while ( hardcount < max 9771 && scan < this_eol 9772 && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p) 9773 && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE)) 9774 { 9775 scan += UTF8SKIP(scan); 9776 hardcount++; 9777 } 9778 } 9779 break; 9780 9781 case ANYOFHb: 9782 if (utf8_target) { /* ANYOFHb only can match UTF-8 targets */ 9783 9784 /* we know the first byte must be the FLAGS field */ 9785 while ( hardcount < max 9786 && scan < this_eol 9787 && (U8) *scan == ANYOF_FLAGS(p) 9788 && reginclass(prog, p, (U8*)scan, (U8*) this_eol, 9789 TRUE)) 9790 { 9791 scan += UTF8SKIP(scan); 9792 hardcount++; 9793 } 9794 } 9795 break; 9796 9797 case ANYOFHr: 9798 if (utf8_target) { /* ANYOFH only can match UTF-8 targets */ 9799 while ( hardcount < max 9800 && scan < this_eol 9801 && inRANGE(NATIVE_UTF8_TO_I8(*scan), 9802 LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p)), 9803 HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p))) 9804 && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p) 9805 && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE)) 9806 { 9807 scan += UTF8SKIP(scan); 9808 hardcount++; 9809 } 9810 } 9811 break; 9812 9813 case ANYOFHs: 9814 if (utf8_target) { /* ANYOFH only can match UTF-8 targets */ 9815 while ( hardcount < max 9816 && scan + FLAGS(p) < this_eol 9817 && memEQ(scan, ((struct regnode_anyofhs *) p)->string, FLAGS(p)) 9818 && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE)) 9819 { 9820 scan += UTF8SKIP(scan); 9821 hardcount++; 9822 } 9823 } 9824 break; 9825 9826 case ANYOFR: 9827 if (utf8_target) { 9828 while ( hardcount < max 9829 && scan < this_eol 9830 && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p) 9831 && withinCOUNT(utf8_to_uvchr_buf((U8 *) scan, 9832 (U8 *) this_eol, 9833 NULL), 9834 ANYOFRbase(p), ANYOFRdelta(p))) 9835 { 9836 scan += UTF8SKIP(scan); 9837 hardcount++; 9838 } 9839 } 9840 else { 9841 while ( hardcount < max 9842 && scan < this_eol 9843 && withinCOUNT((U8) *scan, ANYOFRbase(p), ANYOFRdelta(p))) 9844 { 9845 scan++; 9846 hardcount++; 9847 } 9848 } 9849 break; 9850 9851 case ANYOFRb: 9852 if (utf8_target) { 9853 while ( hardcount < max 9854 && scan < this_eol 9855 && (U8) *scan == ANYOF_FLAGS(p) 9856 && withinCOUNT(utf8_to_uvchr_buf((U8 *) scan, 9857 (U8 *) this_eol, 9858 NULL), 9859 ANYOFRbase(p), ANYOFRdelta(p))) 9860 { 9861 scan += UTF8SKIP(scan); 9862 hardcount++; 9863 } 9864 } 9865 else { 9866 while ( hardcount < max 9867 && scan < this_eol 9868 && withinCOUNT((U8) *scan, ANYOFRbase(p), ANYOFRdelta(p))) 9869 { 9870 scan++; 9871 hardcount++; 9872 } 9873 } 9874 break; 9875 9876 /* The argument (FLAGS) to all the POSIX node types is the class number */ 9877 9878 case NPOSIXL: 9879 to_complement = 1; 9880 /* FALLTHROUGH */ 9881 9882 case POSIXL: 9883 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; 9884 if (! utf8_target) { 9885 while (scan < this_eol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p), 9886 *scan))) 9887 { 9888 scan++; 9889 } 9890 } else { 9891 while (hardcount < max && scan < this_eol 9892 && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p), 9893 (U8 *) scan, 9894 (U8 *) this_eol))) 9895 { 9896 scan += UTF8SKIP(scan); 9897 hardcount++; 9898 } 9899 } 9900 break; 9901 9902 case POSIXD: 9903 if (utf8_target) { 9904 goto utf8_posix; 9905 } 9906 /* FALLTHROUGH */ 9907 9908 case POSIXA: 9909 if (utf8_target && this_eol - scan > max) { 9910 9911 /* We didn't adjust <this_eol> at the beginning of this routine 9912 * because is UTF-8, but it is actually ok to do so, since here, to 9913 * match, 1 char == 1 byte. */ 9914 this_eol = scan + max; 9915 } 9916 while (scan < this_eol && _generic_isCC_A((U8) *scan, FLAGS(p))) { 9917 scan++; 9918 } 9919 break; 9920 9921 case NPOSIXD: 9922 if (utf8_target) { 9923 to_complement = 1; 9924 goto utf8_posix; 9925 } 9926 /* FALLTHROUGH */ 9927 9928 case NPOSIXA: 9929 if (! utf8_target) { 9930 while (scan < this_eol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) { 9931 scan++; 9932 } 9933 } 9934 else { 9935 9936 /* The complement of something that matches only ASCII matches all 9937 * non-ASCII, plus everything in ASCII that isn't in the class. */ 9938 while (hardcount < max && scan < this_eol 9939 && ( ! isASCII_utf8_safe(scan, loceol) 9940 || ! _generic_isCC_A((U8) *scan, FLAGS(p)))) 9941 { 9942 scan += UTF8SKIP(scan); 9943 hardcount++; 9944 } 9945 } 9946 break; 9947 9948 case NPOSIXU: 9949 to_complement = 1; 9950 /* FALLTHROUGH */ 9951 9952 case POSIXU: 9953 if (! utf8_target) { 9954 while (scan < this_eol && to_complement 9955 ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p)))) 9956 { 9957 scan++; 9958 } 9959 } 9960 else { 9961 utf8_posix: 9962 classnum = (_char_class_number) FLAGS(p); 9963 switch (classnum) { 9964 default: 9965 while ( hardcount < max && scan < this_eol 9966 && to_complement ^ cBOOL(_invlist_contains_cp( 9967 PL_XPosix_ptrs[classnum], 9968 utf8_to_uvchr_buf((U8 *) scan, 9969 (U8 *) this_eol, 9970 NULL)))) 9971 { 9972 scan += UTF8SKIP(scan); 9973 hardcount++; 9974 } 9975 break; 9976 9977 /* For the classes below, the knowledge of how to handle 9978 * every code point is compiled in to Perl via a macro. 9979 * This code is written for making the loops as tight as 9980 * possible. It could be refactored to save space instead. 9981 * */ 9982 9983 case _CC_ENUM_SPACE: 9984 while (hardcount < max 9985 && scan < this_eol 9986 && (to_complement 9987 ^ cBOOL(isSPACE_utf8_safe(scan, this_eol)))) 9988 { 9989 scan += UTF8SKIP(scan); 9990 hardcount++; 9991 } 9992 break; 9993 case _CC_ENUM_BLANK: 9994 while (hardcount < max 9995 && scan < this_eol 9996 && (to_complement 9997 ^ cBOOL(isBLANK_utf8_safe(scan, this_eol)))) 9998 { 9999 scan += UTF8SKIP(scan); 10000 hardcount++; 10001 } 10002 break; 10003 case _CC_ENUM_XDIGIT: 10004 while (hardcount < max 10005 && scan < this_eol 10006 && (to_complement 10007 ^ cBOOL(isXDIGIT_utf8_safe(scan, this_eol)))) 10008 { 10009 scan += UTF8SKIP(scan); 10010 hardcount++; 10011 } 10012 break; 10013 case _CC_ENUM_VERTSPACE: 10014 while (hardcount < max 10015 && scan < this_eol 10016 && (to_complement 10017 ^ cBOOL(isVERTWS_utf8_safe(scan, this_eol)))) 10018 { 10019 scan += UTF8SKIP(scan); 10020 hardcount++; 10021 } 10022 break; 10023 case _CC_ENUM_CNTRL: 10024 while (hardcount < max 10025 && scan < this_eol 10026 && (to_complement 10027 ^ cBOOL(isCNTRL_utf8_safe(scan, this_eol)))) 10028 { 10029 scan += UTF8SKIP(scan); 10030 hardcount++; 10031 } 10032 break; 10033 } 10034 } 10035 break; 10036 10037 case LNBREAK: 10038 if (utf8_target) { 10039 while (hardcount < max && scan < this_eol && 10040 (c=is_LNBREAK_utf8_safe(scan, this_eol))) { 10041 scan += c; 10042 hardcount++; 10043 } 10044 } else { 10045 /* LNBREAK can match one or two latin chars, which is ok, but we 10046 * have to use hardcount in this situation, and throw away the 10047 * adjustment to <this_eol> done before the switch statement */ 10048 while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) { 10049 scan+=c; 10050 hardcount++; 10051 } 10052 } 10053 break; 10054 10055 case BOUNDL: 10056 case NBOUNDL: 10057 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; 10058 /* FALLTHROUGH */ 10059 case BOUND: 10060 case BOUNDA: 10061 case BOUNDU: 10062 case EOS: 10063 case GPOS: 10064 case KEEPS: 10065 case NBOUND: 10066 case NBOUNDA: 10067 case NBOUNDU: 10068 case OPFAIL: 10069 case SBOL: 10070 case SEOL: 10071 /* These are all 0 width, so match right here or not at all. */ 10072 break; 10073 10074 default: 10075 Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]); 10076 NOT_REACHED; /* NOTREACHED */ 10077 10078 } 10079 10080 if (hardcount) 10081 c = hardcount; 10082 else 10083 c = scan - *startposp; 10084 *startposp = scan; 10085 10086 DEBUG_r({ 10087 DECLARE_AND_GET_RE_DEBUG_FLAGS; 10088 DEBUG_EXECUTE_r({ 10089 SV * const prop = sv_newmortal(); 10090 regprop(prog, prop, p, reginfo, NULL); 10091 Perl_re_exec_indentf( aTHX_ "%s can match %" IVdf " times out of %" IVdf "...\n", 10092 depth, SvPVX_const(prop),(IV)c,(IV)max); 10093 }); 10094 }); 10095 10096 return(c); 10097 } 10098 10099 /* 10100 - reginclass - determine if a character falls into a character class 10101 10102 n is the ANYOF-type regnode 10103 p is the target string 10104 p_end points to one byte beyond the end of the target string 10105 utf8_target tells whether p is in UTF-8. 10106 10107 Returns true if matched; false otherwise. 10108 10109 Note that this can be a synthetic start class, a combination of various 10110 nodes, so things you think might be mutually exclusive, such as locale, 10111 aren't. It can match both locale and non-locale 10112 10113 */ 10114 10115 STATIC bool 10116 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target) 10117 { 10118 dVAR; 10119 const char flags = (inRANGE(OP(n), ANYOFH, ANYOFHs)) 10120 ? 0 10121 : ANYOF_FLAGS(n); 10122 bool match = FALSE; 10123 UV c = *p; 10124 10125 PERL_ARGS_ASSERT_REGINCLASS; 10126 10127 /* If c is not already the code point, get it. Note that 10128 * UTF8_IS_INVARIANT() works even if not in UTF-8 */ 10129 if (! UTF8_IS_INVARIANT(c) && utf8_target) { 10130 STRLEN c_len = 0; 10131 const U32 utf8n_flags = UTF8_ALLOW_DEFAULT; 10132 c = utf8n_to_uvchr(p, p_end - p, &c_len, utf8n_flags | UTF8_CHECK_ONLY); 10133 if (c_len == (STRLEN)-1) { 10134 _force_out_malformed_utf8_message(p, p_end, 10135 utf8n_flags, 10136 1 /* 1 means die */ ); 10137 NOT_REACHED; /* NOTREACHED */ 10138 } 10139 if ( c > 255 10140 && (OP(n) == ANYOFL || OP(n) == ANYOFPOSIXL) 10141 && ! ANYOFL_UTF8_LOCALE_REQD(flags)) 10142 { 10143 _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c); 10144 } 10145 } 10146 10147 /* If this character is potentially in the bitmap, check it */ 10148 if (c < NUM_ANYOF_CODE_POINTS && ! inRANGE(OP(n), ANYOFH, ANYOFHb)) { 10149 if (ANYOF_BITMAP_TEST(n, c)) 10150 match = TRUE; 10151 else if ((flags 10152 & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER) 10153 && OP(n) == ANYOFD 10154 && ! utf8_target 10155 && ! isASCII(c)) 10156 { 10157 match = TRUE; 10158 } 10159 else if (flags & ANYOF_LOCALE_FLAGS) { 10160 if ( (flags & ANYOFL_FOLD) 10161 && c < sizeof(PL_fold_locale) 10162 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c])) 10163 { 10164 match = TRUE; 10165 } 10166 else if ( ANYOF_POSIXL_TEST_ANY_SET(n) 10167 && c <= U8_MAX /* param to isFOO_lc() */ 10168 ) { 10169 10170 /* The data structure is arranged so bits 0, 2, 4, ... are set 10171 * if the class includes the Posix character class given by 10172 * bit/2; and 1, 3, 5, ... are set if the class includes the 10173 * complemented Posix class given by int(bit/2). So we loop 10174 * through the bits, each time changing whether we complement 10175 * the result or not. Suppose for the sake of illustration 10176 * that bits 0-3 mean respectively, \w, \W, \s, \S. If bit 0 10177 * is set, it means there is a match for this ANYOF node if the 10178 * character is in the class given by the expression (0 / 2 = 0 10179 * = \w). If it is in that class, isFOO_lc() will return 1, 10180 * and since 'to_complement' is 0, the result will stay TRUE, 10181 * and we exit the loop. Suppose instead that bit 0 is 0, but 10182 * bit 1 is 1. That means there is a match if the character 10183 * matches \W. We won't bother to call isFOO_lc() on bit 0, 10184 * but will on bit 1. On the second iteration 'to_complement' 10185 * will be 1, so the exclusive or will reverse things, so we 10186 * are testing for \W. On the third iteration, 'to_complement' 10187 * will be 0, and we would be testing for \s; the fourth 10188 * iteration would test for \S, etc. 10189 * 10190 * Note that this code assumes that all the classes are closed 10191 * under folding. For example, if a character matches \w, then 10192 * its fold does too; and vice versa. This should be true for 10193 * any well-behaved locale for all the currently defined Posix 10194 * classes, except for :lower: and :upper:, which are handled 10195 * by the pseudo-class :cased: which matches if either of the 10196 * other two does. To get rid of this assumption, an outer 10197 * loop could be used below to iterate over both the source 10198 * character, and its fold (if different) */ 10199 10200 int count = 0; 10201 int to_complement = 0; 10202 10203 while (count < ANYOF_MAX) { 10204 if (ANYOF_POSIXL_TEST(n, count) 10205 && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c))) 10206 { 10207 match = TRUE; 10208 break; 10209 } 10210 count++; 10211 to_complement ^= 1; 10212 } 10213 } 10214 } 10215 } 10216 10217 10218 /* If the bitmap didn't (or couldn't) match, and something outside the 10219 * bitmap could match, try that. */ 10220 if (!match) { 10221 if (c >= NUM_ANYOF_CODE_POINTS 10222 && (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP)) 10223 { 10224 match = TRUE; /* Everything above the bitmap matches */ 10225 } 10226 /* Here doesn't match everything above the bitmap. If there is 10227 * some information available beyond the bitmap, we may find a 10228 * match in it. If so, this is most likely because the code point 10229 * is outside the bitmap range. But rarely, it could be because of 10230 * some other reason. If so, various flags are set to indicate 10231 * this possibility. On ANYOFD nodes, there may be matches that 10232 * happen only when the target string is UTF-8; or for other node 10233 * types, because runtime lookup is needed, regardless of the 10234 * UTF-8ness of the target string. Finally, under /il, there may 10235 * be some matches only possible if the locale is a UTF-8 one. */ 10236 else if ( ARG(n) != ANYOF_ONLY_HAS_BITMAP 10237 && ( c >= NUM_ANYOF_CODE_POINTS 10238 || ( (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP) 10239 && ( UNLIKELY(OP(n) != ANYOFD) 10240 || (utf8_target && ! isASCII_uni(c) 10241 # if NUM_ANYOF_CODE_POINTS > 256 10242 && c < 256 10243 # endif 10244 ))) 10245 || ( ANYOFL_SOME_FOLDS_ONLY_IN_UTF8_LOCALE(flags) 10246 && IN_UTF8_CTYPE_LOCALE))) 10247 { 10248 SV* only_utf8_locale = NULL; 10249 SV * const definition = 10250 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) 10251 get_regclass_nonbitmap_data(prog, n, TRUE, 0, 10252 &only_utf8_locale, NULL); 10253 #else 10254 get_re_gclass_nonbitmap_data(prog, n, TRUE, 0, 10255 &only_utf8_locale, NULL); 10256 #endif 10257 if (definition) { 10258 U8 utf8_buffer[2]; 10259 U8 * utf8_p; 10260 if (utf8_target) { 10261 utf8_p = (U8 *) p; 10262 } else { /* Convert to utf8 */ 10263 utf8_p = utf8_buffer; 10264 append_utf8_from_native_byte(*p, &utf8_p); 10265 utf8_p = utf8_buffer; 10266 } 10267 10268 /* Turkish locales have these hard-coded rules overriding 10269 * normal ones */ 10270 if ( UNLIKELY(PL_in_utf8_turkic_locale) 10271 && isALPHA_FOLD_EQ(*p, 'i')) 10272 { 10273 if (*p == 'i') { 10274 if (_invlist_contains_cp(definition, 10275 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)) 10276 { 10277 match = TRUE; 10278 } 10279 } 10280 else if (*p == 'I') { 10281 if (_invlist_contains_cp(definition, 10282 LATIN_SMALL_LETTER_DOTLESS_I)) 10283 { 10284 match = TRUE; 10285 } 10286 } 10287 } 10288 else if (_invlist_contains_cp(definition, c)) { 10289 match = TRUE; 10290 } 10291 } 10292 if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) { 10293 match = _invlist_contains_cp(only_utf8_locale, c); 10294 } 10295 } 10296 10297 /* In a Turkic locale under folding, hard-code the I i case pair 10298 * matches */ 10299 if ( UNLIKELY(PL_in_utf8_turkic_locale) 10300 && ! match 10301 && (flags & ANYOFL_FOLD) 10302 && utf8_target) 10303 { 10304 if (c == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) { 10305 if (ANYOF_BITMAP_TEST(n, 'i')) { 10306 match = TRUE; 10307 } 10308 } 10309 else if (c == LATIN_SMALL_LETTER_DOTLESS_I) { 10310 if (ANYOF_BITMAP_TEST(n, 'I')) { 10311 match = TRUE; 10312 } 10313 } 10314 } 10315 10316 if (UNICODE_IS_SUPER(c) 10317 && (flags 10318 & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER) 10319 && OP(n) != ANYOFD 10320 && ckWARN_d(WARN_NON_UNICODE)) 10321 { 10322 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE), 10323 "Matched non-Unicode code point 0x%04" UVXf " against Unicode property; may not be portable", c); 10324 } 10325 } 10326 10327 #if ANYOF_INVERT != 1 10328 /* Depending on compiler optimization cBOOL takes time, so if don't have to 10329 * use it, don't */ 10330 # error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below, 10331 #endif 10332 10333 /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */ 10334 return (flags & ANYOF_INVERT) ^ match; 10335 } 10336 10337 STATIC U8 * 10338 S_reghop3(U8 *s, SSize_t off, const U8* lim) 10339 { 10340 /* return the position 'off' UTF-8 characters away from 's', forward if 10341 * 'off' >= 0, backwards if negative. But don't go outside of position 10342 * 'lim', which better be < s if off < 0 */ 10343 10344 PERL_ARGS_ASSERT_REGHOP3; 10345 10346 if (off >= 0) { 10347 while (off-- && s < lim) { 10348 /* XXX could check well-formedness here */ 10349 U8 *new_s = s + UTF8SKIP(s); 10350 if (new_s > lim) /* lim may be in the middle of a long character */ 10351 return s; 10352 s = new_s; 10353 } 10354 } 10355 else { 10356 while (off++ && s > lim) { 10357 s--; 10358 if (UTF8_IS_CONTINUED(*s)) { 10359 while (s > lim && UTF8_IS_CONTINUATION(*s)) 10360 s--; 10361 if (! UTF8_IS_START(*s)) { 10362 Perl_croak_nocontext("Malformed UTF-8 character (fatal)"); 10363 } 10364 } 10365 /* XXX could check well-formedness here */ 10366 } 10367 } 10368 return s; 10369 } 10370 10371 STATIC U8 * 10372 S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim) 10373 { 10374 PERL_ARGS_ASSERT_REGHOP4; 10375 10376 if (off >= 0) { 10377 while (off-- && s < rlim) { 10378 /* XXX could check well-formedness here */ 10379 s += UTF8SKIP(s); 10380 } 10381 } 10382 else { 10383 while (off++ && s > llim) { 10384 s--; 10385 if (UTF8_IS_CONTINUED(*s)) { 10386 while (s > llim && UTF8_IS_CONTINUATION(*s)) 10387 s--; 10388 if (! UTF8_IS_START(*s)) { 10389 Perl_croak_nocontext("Malformed UTF-8 character (fatal)"); 10390 } 10391 } 10392 /* XXX could check well-formedness here */ 10393 } 10394 } 10395 return s; 10396 } 10397 10398 /* like reghop3, but returns NULL on overrun, rather than returning last 10399 * char pos */ 10400 10401 STATIC U8 * 10402 S_reghopmaybe3(U8* s, SSize_t off, const U8* const lim) 10403 { 10404 PERL_ARGS_ASSERT_REGHOPMAYBE3; 10405 10406 if (off >= 0) { 10407 while (off-- && s < lim) { 10408 /* XXX could check well-formedness here */ 10409 s += UTF8SKIP(s); 10410 } 10411 if (off >= 0) 10412 return NULL; 10413 } 10414 else { 10415 while (off++ && s > lim) { 10416 s--; 10417 if (UTF8_IS_CONTINUED(*s)) { 10418 while (s > lim && UTF8_IS_CONTINUATION(*s)) 10419 s--; 10420 if (! UTF8_IS_START(*s)) { 10421 Perl_croak_nocontext("Malformed UTF-8 character (fatal)"); 10422 } 10423 } 10424 /* XXX could check well-formedness here */ 10425 } 10426 if (off <= 0) 10427 return NULL; 10428 } 10429 return s; 10430 } 10431 10432 10433 /* when executing a regex that may have (?{}), extra stuff needs setting 10434 up that will be visible to the called code, even before the current 10435 match has finished. In particular: 10436 10437 * $_ is localised to the SV currently being matched; 10438 * pos($_) is created if necessary, ready to be updated on each call-out 10439 to code; 10440 * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm 10441 isn't set until the current pattern is successfully finished), so that 10442 $1 etc of the match-so-far can be seen; 10443 * save the old values of subbeg etc of the current regex, and set then 10444 to the current string (again, this is normally only done at the end 10445 of execution) 10446 */ 10447 10448 static void 10449 S_setup_eval_state(pTHX_ regmatch_info *const reginfo) 10450 { 10451 MAGIC *mg; 10452 regexp *const rex = ReANY(reginfo->prog); 10453 regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval; 10454 10455 eval_state->rex = rex; 10456 eval_state->sv = reginfo->sv; 10457 10458 if (reginfo->sv) { 10459 /* Make $_ available to executed code. */ 10460 if (reginfo->sv != DEFSV) { 10461 SAVE_DEFSV; 10462 DEFSV_set(reginfo->sv); 10463 } 10464 /* will be dec'd by S_cleanup_regmatch_info_aux */ 10465 SvREFCNT_inc_NN(reginfo->sv); 10466 10467 if (!(mg = mg_find_mglob(reginfo->sv))) { 10468 /* prepare for quick setting of pos */ 10469 mg = sv_magicext_mglob(reginfo->sv); 10470 mg->mg_len = -1; 10471 } 10472 eval_state->pos_magic = mg; 10473 eval_state->pos = mg->mg_len; 10474 eval_state->pos_flags = mg->mg_flags; 10475 } 10476 else 10477 eval_state->pos_magic = NULL; 10478 10479 if (!PL_reg_curpm) { 10480 /* PL_reg_curpm is a fake PMOP that we can attach the current 10481 * regex to and point PL_curpm at, so that $1 et al are visible 10482 * within a /(?{})/. It's just allocated once per interpreter the 10483 * first time its needed */ 10484 Newxz(PL_reg_curpm, 1, PMOP); 10485 #ifdef USE_ITHREADS 10486 { 10487 SV* const repointer = &PL_sv_undef; 10488 /* this regexp is also owned by the new PL_reg_curpm, which 10489 will try to free it. */ 10490 av_push(PL_regex_padav, repointer); 10491 PL_reg_curpm->op_pmoffset = av_tindex(PL_regex_padav); 10492 PL_regex_pad = AvARRAY(PL_regex_padav); 10493 } 10494 #endif 10495 } 10496 SET_reg_curpm(reginfo->prog); 10497 eval_state->curpm = PL_curpm; 10498 PL_curpm_under = PL_curpm; 10499 PL_curpm = PL_reg_curpm; 10500 if (RXp_MATCH_COPIED(rex)) { 10501 /* Here is a serious problem: we cannot rewrite subbeg, 10502 since it may be needed if this match fails. Thus 10503 $` inside (?{}) could fail... */ 10504 eval_state->subbeg = rex->subbeg; 10505 eval_state->sublen = rex->sublen; 10506 eval_state->suboffset = rex->suboffset; 10507 eval_state->subcoffset = rex->subcoffset; 10508 #ifdef PERL_ANY_COW 10509 eval_state->saved_copy = rex->saved_copy; 10510 #endif 10511 RXp_MATCH_COPIED_off(rex); 10512 } 10513 else 10514 eval_state->subbeg = NULL; 10515 rex->subbeg = (char *)reginfo->strbeg; 10516 rex->suboffset = 0; 10517 rex->subcoffset = 0; 10518 rex->sublen = reginfo->strend - reginfo->strbeg; 10519 } 10520 10521 10522 /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */ 10523 10524 static void 10525 S_cleanup_regmatch_info_aux(pTHX_ void *arg) 10526 { 10527 regmatch_info_aux *aux = (regmatch_info_aux *) arg; 10528 regmatch_info_aux_eval *eval_state = aux->info_aux_eval; 10529 regmatch_slab *s; 10530 10531 Safefree(aux->poscache); 10532 10533 if (eval_state) { 10534 10535 /* undo the effects of S_setup_eval_state() */ 10536 10537 if (eval_state->subbeg) { 10538 regexp * const rex = eval_state->rex; 10539 rex->subbeg = eval_state->subbeg; 10540 rex->sublen = eval_state->sublen; 10541 rex->suboffset = eval_state->suboffset; 10542 rex->subcoffset = eval_state->subcoffset; 10543 #ifdef PERL_ANY_COW 10544 rex->saved_copy = eval_state->saved_copy; 10545 #endif 10546 RXp_MATCH_COPIED_on(rex); 10547 } 10548 if (eval_state->pos_magic) 10549 { 10550 eval_state->pos_magic->mg_len = eval_state->pos; 10551 eval_state->pos_magic->mg_flags = 10552 (eval_state->pos_magic->mg_flags & ~MGf_BYTES) 10553 | (eval_state->pos_flags & MGf_BYTES); 10554 } 10555 10556 PL_curpm = eval_state->curpm; 10557 SvREFCNT_dec(eval_state->sv); 10558 } 10559 10560 PL_regmatch_state = aux->old_regmatch_state; 10561 PL_regmatch_slab = aux->old_regmatch_slab; 10562 10563 /* free all slabs above current one - this must be the last action 10564 * of this function, as aux and eval_state are allocated within 10565 * slabs and may be freed here */ 10566 10567 s = PL_regmatch_slab->next; 10568 if (s) { 10569 PL_regmatch_slab->next = NULL; 10570 while (s) { 10571 regmatch_slab * const osl = s; 10572 s = s->next; 10573 Safefree(osl); 10574 } 10575 } 10576 } 10577 10578 10579 STATIC void 10580 S_to_utf8_substr(pTHX_ regexp *prog) 10581 { 10582 /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile 10583 * on the converted value */ 10584 10585 int i = 1; 10586 10587 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR; 10588 10589 do { 10590 if (prog->substrs->data[i].substr 10591 && !prog->substrs->data[i].utf8_substr) { 10592 SV* const sv = newSVsv(prog->substrs->data[i].substr); 10593 prog->substrs->data[i].utf8_substr = sv; 10594 sv_utf8_upgrade(sv); 10595 if (SvVALID(prog->substrs->data[i].substr)) { 10596 if (SvTAIL(prog->substrs->data[i].substr)) { 10597 /* Trim the trailing \n that fbm_compile added last 10598 time. */ 10599 SvCUR_set(sv, SvCUR(sv) - 1); 10600 /* Whilst this makes the SV technically "invalid" (as its 10601 buffer is no longer followed by "\0") when fbm_compile() 10602 adds the "\n" back, a "\0" is restored. */ 10603 fbm_compile(sv, FBMcf_TAIL); 10604 } else 10605 fbm_compile(sv, 0); 10606 } 10607 if (prog->substrs->data[i].substr == prog->check_substr) 10608 prog->check_utf8 = sv; 10609 } 10610 } while (i--); 10611 } 10612 10613 STATIC bool 10614 S_to_byte_substr(pTHX_ regexp *prog) 10615 { 10616 /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile 10617 * on the converted value; returns FALSE if can't be converted. */ 10618 10619 int i = 1; 10620 10621 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR; 10622 10623 do { 10624 if (prog->substrs->data[i].utf8_substr 10625 && !prog->substrs->data[i].substr) { 10626 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr); 10627 if (! sv_utf8_downgrade(sv, TRUE)) { 10628 SvREFCNT_dec_NN(sv); 10629 return FALSE; 10630 } 10631 if (SvVALID(prog->substrs->data[i].utf8_substr)) { 10632 if (SvTAIL(prog->substrs->data[i].utf8_substr)) { 10633 /* Trim the trailing \n that fbm_compile added last 10634 time. */ 10635 SvCUR_set(sv, SvCUR(sv) - 1); 10636 fbm_compile(sv, FBMcf_TAIL); 10637 } else 10638 fbm_compile(sv, 0); 10639 } 10640 prog->substrs->data[i].substr = sv; 10641 if (prog->substrs->data[i].utf8_substr == prog->check_utf8) 10642 prog->check_substr = sv; 10643 } 10644 } while (i--); 10645 10646 return TRUE; 10647 } 10648 10649 #ifndef PERL_IN_XSUB_RE 10650 10651 bool 10652 Perl_is_grapheme(pTHX_ const U8 * strbeg, const U8 * s, const U8 * strend, const UV cp) 10653 { 10654 /* Temporary helper function for toke.c. Verify that the code point 'cp' 10655 * is a stand-alone grapheme. The UTF-8 for 'cp' begins at position 's' in 10656 * the larger string bounded by 'strbeg' and 'strend'. 10657 * 10658 * 'cp' needs to be assigned (if not, a future version of the Unicode 10659 * Standard could make it something that combines with adjacent characters, 10660 * so code using it would then break), and there has to be a GCB break 10661 * before and after the character. */ 10662 10663 dVAR; 10664 10665 GCB_enum cp_gcb_val, prev_cp_gcb_val, next_cp_gcb_val; 10666 const U8 * prev_cp_start; 10667 10668 PERL_ARGS_ASSERT_IS_GRAPHEME; 10669 10670 if ( UNLIKELY(UNICODE_IS_SUPER(cp)) 10671 || UNLIKELY(UNICODE_IS_NONCHAR(cp))) 10672 { 10673 /* These are considered graphemes */ 10674 return TRUE; 10675 } 10676 10677 /* Otherwise, unassigned code points are forbidden */ 10678 if (UNLIKELY(! ELEMENT_RANGE_MATCHES_INVLIST( 10679 _invlist_search(PL_Assigned_invlist, cp)))) 10680 { 10681 return FALSE; 10682 } 10683 10684 cp_gcb_val = getGCB_VAL_CP(cp); 10685 10686 /* Find the GCB value of the previous code point in the input */ 10687 prev_cp_start = utf8_hop_back(s, -1, strbeg); 10688 if (UNLIKELY(prev_cp_start == s)) { 10689 prev_cp_gcb_val = GCB_EDGE; 10690 } 10691 else { 10692 prev_cp_gcb_val = getGCB_VAL_UTF8(prev_cp_start, strend); 10693 } 10694 10695 /* And check that is a grapheme boundary */ 10696 if (! isGCB(prev_cp_gcb_val, cp_gcb_val, strbeg, s, 10697 TRUE /* is UTF-8 encoded */ )) 10698 { 10699 return FALSE; 10700 } 10701 10702 /* Similarly verify there is a break between the current character and the 10703 * following one */ 10704 s += UTF8SKIP(s); 10705 if (s >= strend) { 10706 next_cp_gcb_val = GCB_EDGE; 10707 } 10708 else { 10709 next_cp_gcb_val = getGCB_VAL_UTF8(s, strend); 10710 } 10711 10712 return isGCB(cp_gcb_val, next_cp_gcb_val, strbeg, s, TRUE); 10713 } 10714 10715 /* 10716 =head1 Unicode Support 10717 10718 =for apidoc isSCRIPT_RUN 10719 10720 Returns a bool as to whether or not the sequence of bytes from C<s> up to but 10721 not including C<send> form a "script run". C<utf8_target> is TRUE iff the 10722 sequence starting at C<s> is to be treated as UTF-8. To be precise, except for 10723 two degenerate cases given below, this function returns TRUE iff all code 10724 points in it come from any combination of three "scripts" given by the Unicode 10725 "Script Extensions" property: Common, Inherited, and possibly one other. 10726 Additionally all decimal digits must come from the same consecutive sequence of 10727 10. 10728 10729 For example, if all the characters in the sequence are Greek, or Common, or 10730 Inherited, this function will return TRUE, provided any decimal digits in it 10731 are from the same block of digits in Common. (These are the ASCII digits 10732 "0".."9" and additionally a block for full width forms of these, and several 10733 others used in mathematical notation.) For scripts (unlike Greek) that have 10734 their own digits defined this will accept either digits from that set or from 10735 one of the Common digit sets, but not a combination of the two. Some scripts, 10736 such as Arabic, have more than one set of digits. All digits must come from 10737 the same set for this function to return TRUE. 10738 10739 C<*ret_script>, if C<ret_script> is not NULL, will on return of TRUE 10740 contain the script found, using the C<SCX_enum> typedef. Its value will be 10741 C<SCX_INVALID> if the function returns FALSE. 10742 10743 If the sequence is empty, TRUE is returned, but C<*ret_script> (if asked for) 10744 will be C<SCX_INVALID>. 10745 10746 If the sequence contains a single code point which is unassigned to a character 10747 in the version of Unicode being used, the function will return TRUE, and the 10748 script will be C<SCX_Unknown>. Any other combination of unassigned code points 10749 in the input sequence will result in the function treating the input as not 10750 being a script run. 10751 10752 The returned script will be C<SCX_Inherited> iff all the code points in it are 10753 from the Inherited script. 10754 10755 Otherwise, the returned script will be C<SCX_Common> iff all the code points in 10756 it are from the Inherited or Common scripts. 10757 10758 =cut 10759 10760 */ 10761 10762 bool 10763 Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target) 10764 { 10765 /* Basically, it looks at each character in the sequence to see if the 10766 * above conditions are met; if not it fails. It uses an inversion map to 10767 * find the enum corresponding to the script of each character. But this 10768 * is complicated by the fact that a few code points can be in any of 10769 * several scripts. The data has been constructed so that there are 10770 * additional enum values (all negative) for these situations. The 10771 * absolute value of those is an index into another table which contains 10772 * pointers to auxiliary tables for each such situation. Each aux array 10773 * lists all the scripts for the given situation. There is another, 10774 * parallel, table that gives the number of entries in each aux table. 10775 * These are all defined in charclass_invlists.h */ 10776 10777 /* XXX Here are the additional things UTS 39 says could be done: 10778 * 10779 * Forbid sequences of the same nonspacing mark 10780 * 10781 * Check to see that all the characters are in the sets of exemplar 10782 * characters for at least one language in the Unicode Common Locale Data 10783 * Repository [CLDR]. */ 10784 10785 dVAR; 10786 10787 /* Things that match /\d/u */ 10788 SV * decimals_invlist = PL_XPosix_ptrs[_CC_DIGIT]; 10789 UV * decimals_array = invlist_array(decimals_invlist); 10790 10791 /* What code point is the digit '0' of the script run? (0 meaning FALSE if 10792 * not currently known) */ 10793 UV zero_of_run = 0; 10794 10795 SCX_enum script_of_run = SCX_INVALID; /* Illegal value */ 10796 SCX_enum script_of_char = SCX_INVALID; 10797 10798 /* If the script remains not fully determined from iteration to iteration, 10799 * this is the current intersection of the possiblities. */ 10800 SCX_enum * intersection = NULL; 10801 PERL_UINT_FAST8_T intersection_len = 0; 10802 10803 bool retval = TRUE; 10804 SCX_enum * ret_script = NULL; 10805 10806 assert(send >= s); 10807 10808 PERL_ARGS_ASSERT_ISSCRIPT_RUN; 10809 10810 /* All code points in 0..255 are either Common or Latin, so must be a 10811 * script run. We can return immediately unless we need to know which 10812 * script it is. */ 10813 if (! utf8_target && LIKELY(send > s)) { 10814 if (ret_script == NULL) { 10815 return TRUE; 10816 } 10817 10818 /* If any character is Latin, the run is Latin */ 10819 while (s < send) { 10820 if (isALPHA_L1(*s) && LIKELY(*s != MICRO_SIGN_NATIVE)) { 10821 *ret_script = SCX_Latin; 10822 return TRUE; 10823 } 10824 } 10825 10826 /* Here, all are Common */ 10827 *ret_script = SCX_Common; 10828 return TRUE; 10829 } 10830 10831 /* Look at each character in the sequence */ 10832 while (s < send) { 10833 /* If the current character being examined is a digit, this is the code 10834 * point of the zero for its sequence of 10 */ 10835 UV zero_of_char; 10836 10837 UV cp; 10838 10839 /* The code allows all scripts to use the ASCII digits. This is 10840 * because they are in the Common script. Hence any ASCII ones found 10841 * are ok, unless and until a digit from another set has already been 10842 * encountered. digit ranges in Common are not similarly blessed) */ 10843 if (UNLIKELY(isDIGIT(*s))) { 10844 if (UNLIKELY(script_of_run == SCX_Unknown)) { 10845 retval = FALSE; 10846 break; 10847 } 10848 if (zero_of_run) { 10849 if (zero_of_run != '0') { 10850 retval = FALSE; 10851 break; 10852 } 10853 } 10854 else { 10855 zero_of_run = '0'; 10856 } 10857 s++; 10858 continue; 10859 } 10860 10861 /* Here, isn't an ASCII digit. Find the code point of the character */ 10862 if (! UTF8_IS_INVARIANT(*s)) { 10863 Size_t len; 10864 cp = valid_utf8_to_uvchr((U8 *) s, &len); 10865 s += len; 10866 } 10867 else { 10868 cp = *(s++); 10869 } 10870 10871 /* If is within the range [+0 .. +9] of the script's zero, it also is a 10872 * digit in that script. We can skip the rest of this code for this 10873 * character. */ 10874 if (UNLIKELY( zero_of_run 10875 && cp >= zero_of_run 10876 && cp - zero_of_run <= 9)) 10877 { 10878 continue; 10879 } 10880 10881 /* Find the character's script. The correct values are hard-coded here 10882 * for small-enough code points. */ 10883 if (cp < 0x2B9) { /* From inspection of Unicode db; extremely 10884 unlikely to change */ 10885 if ( cp > 255 10886 || ( isALPHA_L1(cp) 10887 && LIKELY(cp != MICRO_SIGN_NATIVE))) 10888 { 10889 script_of_char = SCX_Latin; 10890 } 10891 else { 10892 script_of_char = SCX_Common; 10893 } 10894 } 10895 else { 10896 script_of_char = _Perl_SCX_invmap[ 10897 _invlist_search(PL_SCX_invlist, cp)]; 10898 } 10899 10900 /* We arbitrarily accept a single unassigned character, but not in 10901 * combination with anything else, and not a run of them. */ 10902 if ( UNLIKELY(script_of_run == SCX_Unknown) 10903 || UNLIKELY( script_of_run != SCX_INVALID 10904 && script_of_char == SCX_Unknown)) 10905 { 10906 retval = FALSE; 10907 break; 10908 } 10909 10910 /* For the first character, or the run is inherited, the run's script 10911 * is set to the char's */ 10912 if ( UNLIKELY(script_of_run == SCX_INVALID) 10913 || UNLIKELY(script_of_run == SCX_Inherited)) 10914 { 10915 script_of_run = script_of_char; 10916 } 10917 10918 /* For the character's script to be Unknown, it must be the first 10919 * character in the sequence (for otherwise a test above would have 10920 * prevented us from reaching here), and we have set the run's script 10921 * to it. Nothing further to be done for this character */ 10922 if (UNLIKELY(script_of_char == SCX_Unknown)) { 10923 continue; 10924 } 10925 10926 /* We accept 'inherited' script characters currently even at the 10927 * beginning. (We know that no characters in Inherited are digits, or 10928 * we'd have to check for that) */ 10929 if (UNLIKELY(script_of_char == SCX_Inherited)) { 10930 continue; 10931 } 10932 10933 /* If the run so far is Common, and the new character isn't, change the 10934 * run's script to that of this character */ 10935 if (script_of_run == SCX_Common && script_of_char != SCX_Common) { 10936 script_of_run = script_of_char; 10937 } 10938 10939 /* Now we can see if the script of the new character is the same as 10940 * that of the run */ 10941 if (LIKELY(script_of_char == script_of_run)) { 10942 /* By far the most common case */ 10943 goto scripts_match; 10944 } 10945 10946 /* Here, the script of the run isn't Common. But characters in Common 10947 * match any script */ 10948 if (script_of_char == SCX_Common) { 10949 goto scripts_match; 10950 } 10951 10952 #ifndef HAS_SCX_AUX_TABLES 10953 10954 /* Too early a Unicode version to have a code point belonging to more 10955 * than one script, so, if the scripts don't exactly match, fail */ 10956 PERL_UNUSED_VAR(intersection_len); 10957 retval = FALSE; 10958 break; 10959 10960 #else 10961 10962 /* Here there is no exact match between the character's script and the 10963 * run's. And we've handled the special cases of scripts Unknown, 10964 * Inherited, and Common. 10965 * 10966 * Negative script numbers signify that the value may be any of several 10967 * scripts, and we need to look at auxiliary information to make our 10968 * deterimination. But if both are non-negative, we can fail now */ 10969 if (LIKELY(script_of_char >= 0)) { 10970 const SCX_enum * search_in; 10971 PERL_UINT_FAST8_T search_in_len; 10972 PERL_UINT_FAST8_T i; 10973 10974 if (LIKELY(script_of_run >= 0)) { 10975 retval = FALSE; 10976 break; 10977 } 10978 10979 /* Use the previously constructed set of possible scripts, if any. 10980 * */ 10981 if (intersection) { 10982 search_in = intersection; 10983 search_in_len = intersection_len; 10984 } 10985 else { 10986 search_in = SCX_AUX_TABLE_ptrs[-script_of_run]; 10987 search_in_len = SCX_AUX_TABLE_lengths[-script_of_run]; 10988 } 10989 10990 for (i = 0; i < search_in_len; i++) { 10991 if (search_in[i] == script_of_char) { 10992 script_of_run = script_of_char; 10993 goto scripts_match; 10994 } 10995 } 10996 10997 retval = FALSE; 10998 break; 10999 } 11000 else if (LIKELY(script_of_run >= 0)) { 11001 /* script of character could be one of several, but run is a single 11002 * script */ 11003 const SCX_enum * search_in = SCX_AUX_TABLE_ptrs[-script_of_char]; 11004 const PERL_UINT_FAST8_T search_in_len 11005 = SCX_AUX_TABLE_lengths[-script_of_char]; 11006 PERL_UINT_FAST8_T i; 11007 11008 for (i = 0; i < search_in_len; i++) { 11009 if (search_in[i] == script_of_run) { 11010 script_of_char = script_of_run; 11011 goto scripts_match; 11012 } 11013 } 11014 11015 retval = FALSE; 11016 break; 11017 } 11018 else { 11019 /* Both run and char could be in one of several scripts. If the 11020 * intersection is empty, then this character isn't in this script 11021 * run. Otherwise, we need to calculate the intersection to use 11022 * for future iterations of the loop, unless we are already at the 11023 * final character */ 11024 const SCX_enum * search_char = SCX_AUX_TABLE_ptrs[-script_of_char]; 11025 const PERL_UINT_FAST8_T char_len 11026 = SCX_AUX_TABLE_lengths[-script_of_char]; 11027 const SCX_enum * search_run; 11028 PERL_UINT_FAST8_T run_len; 11029 11030 SCX_enum * new_overlap = NULL; 11031 PERL_UINT_FAST8_T i, j; 11032 11033 if (intersection) { 11034 search_run = intersection; 11035 run_len = intersection_len; 11036 } 11037 else { 11038 search_run = SCX_AUX_TABLE_ptrs[-script_of_run]; 11039 run_len = SCX_AUX_TABLE_lengths[-script_of_run]; 11040 } 11041 11042 intersection_len = 0; 11043 11044 for (i = 0; i < run_len; i++) { 11045 for (j = 0; j < char_len; j++) { 11046 if (search_run[i] == search_char[j]) { 11047 11048 /* Here, the script at i,j matches. That means this 11049 * character is in the run. But continue on to find 11050 * the complete intersection, for the next loop 11051 * iteration, and for the digit check after it. 11052 * 11053 * On the first found common script, we malloc space 11054 * for the intersection list for the worst case of the 11055 * intersection, which is the minimum of the number of 11056 * scripts remaining in each set. */ 11057 if (intersection_len == 0) { 11058 Newx(new_overlap, 11059 MIN(run_len - i, char_len - j), 11060 SCX_enum); 11061 } 11062 new_overlap[intersection_len++] = search_run[i]; 11063 } 11064 } 11065 } 11066 11067 /* Here we've looked through everything. If they have no scripts 11068 * in common, not a run */ 11069 if (intersection_len == 0) { 11070 retval = FALSE; 11071 break; 11072 } 11073 11074 /* If there is only a single script in common, set to that. 11075 * Otherwise, use the intersection going forward */ 11076 Safefree(intersection); 11077 intersection = NULL; 11078 if (intersection_len == 1) { 11079 script_of_run = script_of_char = new_overlap[0]; 11080 Safefree(new_overlap); 11081 new_overlap = NULL; 11082 } 11083 else { 11084 intersection = new_overlap; 11085 } 11086 } 11087 11088 #endif 11089 11090 scripts_match: 11091 11092 /* Here, the script of the character is compatible with that of the 11093 * run. That means that in most cases, it continues the script run. 11094 * Either it and the run match exactly, or one or both can be in any of 11095 * several scripts, and the intersection is not empty. However, if the 11096 * character is a decimal digit, it could still mean failure if it is 11097 * from the wrong sequence of 10. So, we need to look at if it's a 11098 * digit. We've already handled the 10 decimal digits, and the next 11099 * lowest one is this one: */ 11100 if (cp < FIRST_NON_ASCII_DECIMAL_DIGIT) { 11101 continue; /* Not a digit; this character is part of the run */ 11102 } 11103 11104 /* If we have a definitive '0' for the script of this character, we 11105 * know that for this to be a digit, it must be in the range of +0..+9 11106 * of that zero. */ 11107 if ( script_of_char >= 0 11108 && (zero_of_char = script_zeros[script_of_char])) 11109 { 11110 if ( cp < zero_of_char 11111 || cp > zero_of_char + 9) 11112 { 11113 continue; /* Not a digit; this character is part of the run 11114 */ 11115 } 11116 11117 } 11118 else { /* Need to look up if this character is a digit or not */ 11119 SSize_t index_of_zero_of_char; 11120 index_of_zero_of_char = _invlist_search(decimals_invlist, cp); 11121 if ( UNLIKELY(index_of_zero_of_char < 0) 11122 || ! ELEMENT_RANGE_MATCHES_INVLIST(index_of_zero_of_char)) 11123 { 11124 continue; /* Not a digit; this character is part of the run. 11125 */ 11126 } 11127 11128 zero_of_char = decimals_array[index_of_zero_of_char]; 11129 } 11130 11131 /* Here, the character is a decimal digit, and the zero of its sequence 11132 * of 10 is in 'zero_of_char'. If we already have a zero for this run, 11133 * they better be the same. */ 11134 if (zero_of_run) { 11135 if (zero_of_run != zero_of_char) { 11136 retval = FALSE; 11137 break; 11138 } 11139 } 11140 else { /* Otherwise we now have a zero for this run */ 11141 zero_of_run = zero_of_char; 11142 } 11143 } /* end of looping through CLOSESR text */ 11144 11145 Safefree(intersection); 11146 11147 if (ret_script != NULL) { 11148 if (retval) { 11149 *ret_script = script_of_run; 11150 } 11151 else { 11152 *ret_script = SCX_INVALID; 11153 } 11154 } 11155 11156 return retval; 11157 } 11158 11159 #endif /* ifndef PERL_IN_XSUB_RE */ 11160 11161 /* 11162 * ex: set ts=8 sts=4 sw=4 et: 11163 */ 11164