1 /* regexec.c 2 */ 3 4 /* 5 * One Ring to rule them all, One Ring to find them 6 & 7 * [p.v of _The Lord of the Rings_, opening poem] 8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"] 9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"] 10 */ 11 12 /* This file contains functions for executing a regular expression. See 13 * also regcomp.c which funnily enough, contains functions for compiling 14 * a regular expression. 15 * 16 * This file is also copied at build time to ext/re/re_exec.c, where 17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT. 18 * This causes the main functions to be compiled under new names and with 19 * debugging support added, which makes "use re 'debug'" work. 20 */ 21 22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not 23 * confused with the original package (see point 3 below). Thanks, Henry! 24 */ 25 26 /* Additional note: this code is very heavily munged from Henry's version 27 * in places. In some spots I've traded clarity for efficiency, so don't 28 * blame Henry for some of the lack of readability. 29 */ 30 31 /* The names of the functions have been changed from regcomp and 32 * regexec to pregcomp and pregexec in order to avoid conflicts 33 * with the POSIX routines of the same names. 34 */ 35 36 #ifdef PERL_EXT_RE_BUILD 37 #include "re_top.h" 38 #endif 39 40 /* 41 * pregcomp and pregexec -- regsub and regerror are not used in perl 42 * 43 * Copyright (c) 1986 by University of Toronto. 44 * Written by Henry Spencer. Not derived from licensed software. 45 * 46 * Permission is granted to anyone to use this software for any 47 * purpose on any computer system, and to redistribute it freely, 48 * subject to the following restrictions: 49 * 50 * 1. The author is not responsible for the consequences of use of 51 * this software, no matter how awful, even if they arise 52 * from defects in it. 53 * 54 * 2. The origin of this software must not be misrepresented, either 55 * by explicit claim or by omission. 56 * 57 * 3. Altered versions must be plainly marked as such, and must not 58 * be misrepresented as being the original software. 59 * 60 **** Alterations to Henry's code are... 61 **** 62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 64 **** by Larry Wall and others 65 **** 66 **** You may distribute under the terms of either the GNU General Public 67 **** License or the Artistic License, as specified in the README file. 68 * 69 * Beware that some of this code is subtly aware of the way operator 70 * precedence is structured in regular expressions. Serious changes in 71 * regular-expression syntax might require a total rethink. 72 */ 73 #include "EXTERN.h" 74 #define PERL_IN_REGEXEC_C 75 #include "perl.h" 76 77 #ifdef PERL_IN_XSUB_RE 78 # include "re_comp.h" 79 #else 80 # include "regcomp.h" 81 #endif 82 83 #define RF_tainted 1 /* tainted information used? */ 84 #define RF_warned 2 /* warned about big count? */ 85 86 #define RF_utf8 8 /* Pattern contains multibyte chars? */ 87 88 #define UTF ((PL_reg_flags & RF_utf8) != 0) 89 90 #define RS_init 1 /* eval environment created */ 91 #define RS_set 2 /* replsv value is set */ 92 93 #ifndef STATIC 94 #define STATIC static 95 #endif 96 97 #define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c))) 98 99 /* 100 * Forwards. 101 */ 102 103 #define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv)) 104 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b) 105 106 #define HOPc(pos,off) \ 107 (char *)(PL_reg_match_utf8 \ 108 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \ 109 : (U8*)(pos + off)) 110 #define HOPBACKc(pos, off) \ 111 (char*)(PL_reg_match_utf8\ 112 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \ 113 : (pos - off >= PL_bostr) \ 114 ? (U8*)pos - off \ 115 : NULL) 116 117 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off)) 118 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim)) 119 120 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \ 121 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END 122 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a") 123 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0") 124 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ") 125 #define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86") 126 127 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */ 128 129 /* for use after a quantifier and before an EXACT-like node -- japhy */ 130 /* it would be nice to rework regcomp.sym to generate this stuff. sigh */ 131 #define JUMPABLE(rn) ( \ 132 OP(rn) == OPEN || \ 133 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \ 134 OP(rn) == EVAL || \ 135 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \ 136 OP(rn) == PLUS || OP(rn) == MINMOD || \ 137 OP(rn) == KEEPS || (PL_regkind[OP(rn)] == VERB) || \ 138 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \ 139 ) 140 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT) 141 142 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF ) 143 144 #if 0 145 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so 146 we don't need this definition. */ 147 #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF ) 148 #define IS_TEXTF(rn) ( OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF ) 149 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL ) 150 151 #else 152 /* ... so we use this as its faster. */ 153 #define IS_TEXT(rn) ( OP(rn)==EXACT ) 154 #define IS_TEXTF(rn) ( OP(rn)==EXACTF ) 155 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL ) 156 157 #endif 158 159 /* 160 Search for mandatory following text node; for lookahead, the text must 161 follow but for lookbehind (rn->flags != 0) we skip to the next step. 162 */ 163 #define FIND_NEXT_IMPT(rn) STMT_START { \ 164 while (JUMPABLE(rn)) { \ 165 const OPCODE type = OP(rn); \ 166 if (type == SUSPEND || PL_regkind[type] == CURLY) \ 167 rn = NEXTOPER(NEXTOPER(rn)); \ 168 else if (type == PLUS) \ 169 rn = NEXTOPER(rn); \ 170 else if (type == IFMATCH) \ 171 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \ 172 else rn += NEXT_OFF(rn); \ 173 } \ 174 } STMT_END 175 176 177 static void restore_pos(pTHX_ void *arg); 178 179 STATIC CHECKPOINT 180 S_regcppush(pTHX_ I32 parenfloor) 181 { 182 dVAR; 183 const int retval = PL_savestack_ix; 184 #define REGCP_PAREN_ELEMS 4 185 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS; 186 int p; 187 GET_RE_DEBUG_FLAGS_DECL; 188 189 if (paren_elems_to_push < 0) 190 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0"); 191 192 #define REGCP_OTHER_ELEMS 7 193 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS); 194 195 for (p = PL_regsize; p > parenfloor; p--) { 196 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */ 197 SSPUSHINT(PL_regoffs[p].end); 198 SSPUSHINT(PL_regoffs[p].start); 199 SSPUSHPTR(PL_reg_start_tmp[p]); 200 SSPUSHINT(p); 201 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, 202 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n", 203 (UV)p, (IV)PL_regoffs[p].start, 204 (IV)(PL_reg_start_tmp[p] - PL_bostr), 205 (IV)PL_regoffs[p].end 206 )); 207 } 208 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */ 209 SSPUSHPTR(PL_regoffs); 210 SSPUSHINT(PL_regsize); 211 SSPUSHINT(*PL_reglastparen); 212 SSPUSHINT(*PL_reglastcloseparen); 213 SSPUSHPTR(PL_reginput); 214 #define REGCP_FRAME_ELEMS 2 215 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and 216 * are needed for the regexp context stack bookkeeping. */ 217 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS); 218 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */ 219 220 return retval; 221 } 222 223 /* These are needed since we do not localize EVAL nodes: */ 224 #define REGCP_SET(cp) \ 225 DEBUG_STATE_r( \ 226 PerlIO_printf(Perl_debug_log, \ 227 " Setting an EVAL scope, savestack=%"IVdf"\n", \ 228 (IV)PL_savestack_ix)); \ 229 cp = PL_savestack_ix 230 231 #define REGCP_UNWIND(cp) \ 232 DEBUG_STATE_r( \ 233 if (cp != PL_savestack_ix) \ 234 PerlIO_printf(Perl_debug_log, \ 235 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \ 236 (IV)(cp), (IV)PL_savestack_ix)); \ 237 regcpblow(cp) 238 239 STATIC char * 240 S_regcppop(pTHX_ const regexp *rex) 241 { 242 dVAR; 243 U32 i; 244 char *input; 245 GET_RE_DEBUG_FLAGS_DECL; 246 247 PERL_ARGS_ASSERT_REGCPPOP; 248 249 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */ 250 i = SSPOPINT; 251 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */ 252 i = SSPOPINT; /* Parentheses elements to pop. */ 253 input = (char *) SSPOPPTR; 254 *PL_reglastcloseparen = SSPOPINT; 255 *PL_reglastparen = SSPOPINT; 256 PL_regsize = SSPOPINT; 257 PL_regoffs=(regexp_paren_pair *) SSPOPPTR; 258 259 260 /* Now restore the parentheses context. */ 261 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS); 262 i > 0; i -= REGCP_PAREN_ELEMS) { 263 I32 tmps; 264 U32 paren = (U32)SSPOPINT; 265 PL_reg_start_tmp[paren] = (char *) SSPOPPTR; 266 PL_regoffs[paren].start = SSPOPINT; 267 tmps = SSPOPINT; 268 if (paren <= *PL_reglastparen) 269 PL_regoffs[paren].end = tmps; 270 DEBUG_BUFFERS_r( 271 PerlIO_printf(Perl_debug_log, 272 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n", 273 (UV)paren, (IV)PL_regoffs[paren].start, 274 (IV)(PL_reg_start_tmp[paren] - PL_bostr), 275 (IV)PL_regoffs[paren].end, 276 (paren > *PL_reglastparen ? "(no)" : "")); 277 ); 278 } 279 DEBUG_BUFFERS_r( 280 if (*PL_reglastparen + 1 <= rex->nparens) { 281 PerlIO_printf(Perl_debug_log, 282 " restoring \\%"IVdf"..\\%"IVdf" to undef\n", 283 (IV)(*PL_reglastparen + 1), (IV)rex->nparens); 284 } 285 ); 286 #if 1 287 /* It would seem that the similar code in regtry() 288 * already takes care of this, and in fact it is in 289 * a better location to since this code can #if 0-ed out 290 * but the code in regtry() is needed or otherwise tests 291 * requiring null fields (pat.t#187 and split.t#{13,14} 292 * (as of patchlevel 7877) will fail. Then again, 293 * this code seems to be necessary or otherwise 294 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/ 295 * --jhi updated by dapm */ 296 for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) { 297 if (i > PL_regsize) 298 PL_regoffs[i].start = -1; 299 PL_regoffs[i].end = -1; 300 } 301 #endif 302 return input; 303 } 304 305 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */ 306 307 /* 308 * pregexec and friends 309 */ 310 311 #ifndef PERL_IN_XSUB_RE 312 /* 313 - pregexec - match a regexp against a string 314 */ 315 I32 316 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend, 317 char *strbeg, I32 minend, SV *screamer, U32 nosave) 318 /* strend: pointer to null at end of string */ 319 /* strbeg: real beginning of string */ 320 /* minend: end of match must be >=minend after stringarg. */ 321 /* nosave: For optimizations. */ 322 { 323 PERL_ARGS_ASSERT_PREGEXEC; 324 325 return 326 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL, 327 nosave ? 0 : REXEC_COPY_STR); 328 } 329 #endif 330 331 /* 332 * Need to implement the following flags for reg_anch: 333 * 334 * USE_INTUIT_NOML - Useful to call re_intuit_start() first 335 * USE_INTUIT_ML 336 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer 337 * INTUIT_AUTORITATIVE_ML 338 * INTUIT_ONCE_NOML - Intuit can match in one location only. 339 * INTUIT_ONCE_ML 340 * 341 * Another flag for this function: SECOND_TIME (so that float substrs 342 * with giant delta may be not rechecked). 343 */ 344 345 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */ 346 347 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend. 348 Otherwise, only SvCUR(sv) is used to get strbeg. */ 349 350 /* XXXX We assume that strpos is strbeg unless sv. */ 351 352 /* XXXX Some places assume that there is a fixed substring. 353 An update may be needed if optimizer marks as "INTUITable" 354 RExen without fixed substrings. Similarly, it is assumed that 355 lengths of all the strings are no more than minlen, thus they 356 cannot come from lookahead. 357 (Or minlen should take into account lookahead.) 358 NOTE: Some of this comment is not correct. minlen does now take account 359 of lookahead/behind. Further research is required. -- demerphq 360 361 */ 362 363 /* A failure to find a constant substring means that there is no need to make 364 an expensive call to REx engine, thus we celebrate a failure. Similarly, 365 finding a substring too deep into the string means that less calls to 366 regtry() should be needed. 367 368 REx compiler's optimizer found 4 possible hints: 369 a) Anchored substring; 370 b) Fixed substring; 371 c) Whether we are anchored (beginning-of-line or \G); 372 d) First node (of those at offset 0) which may distingush positions; 373 We use a)b)d) and multiline-part of c), and try to find a position in the 374 string which does not contradict any of them. 375 */ 376 377 /* Most of decisions we do here should have been done at compile time. 378 The nodes of the REx which we used for the search should have been 379 deleted from the finite automaton. */ 380 381 char * 382 Perl_re_intuit_start(pTHX_ REGEXP * const prog, SV *sv, char *strpos, 383 char *strend, const U32 flags, re_scream_pos_data *data) 384 { 385 dVAR; 386 register I32 start_shift = 0; 387 /* Should be nonnegative! */ 388 register I32 end_shift = 0; 389 register char *s; 390 register SV *check; 391 char *strbeg; 392 char *t; 393 const bool do_utf8 = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */ 394 I32 ml_anch; 395 register char *other_last = NULL; /* other substr checked before this */ 396 char *check_at = NULL; /* check substr found at this pos */ 397 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE; 398 RXi_GET_DECL(prog,progi); 399 #ifdef DEBUGGING 400 const char * const i_strpos = strpos; 401 #endif 402 GET_RE_DEBUG_FLAGS_DECL; 403 404 PERL_ARGS_ASSERT_RE_INTUIT_START; 405 406 RX_MATCH_UTF8_set(prog,do_utf8); 407 408 if (RX_UTF8(prog)) { 409 PL_reg_flags |= RF_utf8; 410 } 411 DEBUG_EXECUTE_r( 412 debug_start_match(prog, do_utf8, strpos, strend, 413 sv ? "Guessing start of match in sv for" 414 : "Guessing start of match in string for"); 415 ); 416 417 /* CHR_DIST() would be more correct here but it makes things slow. */ 418 if (prog->minlen > strend - strpos) { 419 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, 420 "String too short... [re_intuit_start]\n")); 421 goto fail; 422 } 423 424 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos; 425 PL_regeol = strend; 426 if (do_utf8) { 427 if (!prog->check_utf8 && prog->check_substr) 428 to_utf8_substr(prog); 429 check = prog->check_utf8; 430 } else { 431 if (!prog->check_substr && prog->check_utf8) 432 to_byte_substr(prog); 433 check = prog->check_substr; 434 } 435 if (check == &PL_sv_undef) { 436 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, 437 "Non-utf8 string cannot match utf8 check string\n")); 438 goto fail; 439 } 440 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */ 441 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE) 442 || ( (prog->extflags & RXf_ANCH_BOL) 443 && !multiline ) ); /* Check after \n? */ 444 445 if (!ml_anch) { 446 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */ 447 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */ 448 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */ 449 && sv && !SvROK(sv) 450 && (strpos != strbeg)) { 451 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n")); 452 goto fail; 453 } 454 if (prog->check_offset_min == prog->check_offset_max && 455 !(prog->extflags & RXf_CANY_SEEN)) { 456 /* Substring at constant offset from beg-of-str... */ 457 I32 slen; 458 459 s = HOP3c(strpos, prog->check_offset_min, strend); 460 461 if (SvTAIL(check)) { 462 slen = SvCUR(check); /* >= 1 */ 463 464 if ( strend - s > slen || strend - s < slen - 1 465 || (strend - s == slen && strend[-1] != '\n')) { 466 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n")); 467 goto fail_finish; 468 } 469 /* Now should match s[0..slen-2] */ 470 slen--; 471 if (slen && (*SvPVX_const(check) != *s 472 || (slen > 1 473 && memNE(SvPVX_const(check), s, slen)))) { 474 report_neq: 475 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n")); 476 goto fail_finish; 477 } 478 } 479 else if (*SvPVX_const(check) != *s 480 || ((slen = SvCUR(check)) > 1 481 && memNE(SvPVX_const(check), s, slen))) 482 goto report_neq; 483 check_at = s; 484 goto success_at_start; 485 } 486 } 487 /* Match is anchored, but substr is not anchored wrt beg-of-str. */ 488 s = strpos; 489 start_shift = prog->check_offset_min; /* okay to underestimate on CC */ 490 end_shift = prog->check_end_shift; 491 492 if (!ml_anch) { 493 const I32 end = prog->check_offset_max + CHR_SVLEN(check) 494 - (SvTAIL(check) != 0); 495 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end; 496 497 if (end_shift < eshift) 498 end_shift = eshift; 499 } 500 } 501 else { /* Can match at random position */ 502 ml_anch = 0; 503 s = strpos; 504 start_shift = prog->check_offset_min; /* okay to underestimate on CC */ 505 end_shift = prog->check_end_shift; 506 507 /* end shift should be non negative here */ 508 } 509 510 #ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */ 511 if (end_shift < 0) 512 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ", 513 (IV)end_shift, RX_PRECOMP(prog)); 514 #endif 515 516 restart: 517 /* Find a possible match in the region s..strend by looking for 518 the "check" substring in the region corrected by start/end_shift. */ 519 520 { 521 I32 srch_start_shift = start_shift; 522 I32 srch_end_shift = end_shift; 523 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) { 524 srch_end_shift -= ((strbeg - s) - srch_start_shift); 525 srch_start_shift = strbeg - s; 526 } 527 DEBUG_OPTIMISE_MORE_r({ 528 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n", 529 (IV)prog->check_offset_min, 530 (IV)srch_start_shift, 531 (IV)srch_end_shift, 532 (IV)prog->check_end_shift); 533 }); 534 535 if (flags & REXEC_SCREAM) { 536 I32 p = -1; /* Internal iterator of scream. */ 537 I32 * const pp = data ? data->scream_pos : &p; 538 539 if (PL_screamfirst[BmRARE(check)] >= 0 540 || ( BmRARE(check) == '\n' 541 && (BmPREVIOUS(check) == SvCUR(check) - 1) 542 && SvTAIL(check) )) 543 s = screaminstr(sv, check, 544 srch_start_shift + (s - strbeg), srch_end_shift, pp, 0); 545 else 546 goto fail_finish; 547 /* we may be pointing at the wrong string */ 548 if (s && RXp_MATCH_COPIED(prog)) 549 s = strbeg + (s - SvPVX_const(sv)); 550 if (data) 551 *data->scream_olds = s; 552 } 553 else { 554 U8* start_point; 555 U8* end_point; 556 if (prog->extflags & RXf_CANY_SEEN) { 557 start_point= (U8*)(s + srch_start_shift); 558 end_point= (U8*)(strend - srch_end_shift); 559 } else { 560 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend); 561 end_point= HOP3(strend, -srch_end_shift, strbeg); 562 } 563 DEBUG_OPTIMISE_MORE_r({ 564 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n", 565 (int)(end_point - start_point), 566 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point), 567 start_point); 568 }); 569 570 s = fbm_instr( start_point, end_point, 571 check, multiline ? FBMrf_MULTILINE : 0); 572 } 573 } 574 /* Update the count-of-usability, remove useless subpatterns, 575 unshift s. */ 576 577 DEBUG_EXECUTE_r({ 578 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0), 579 SvPVX_const(check), RE_SV_DUMPLEN(check), 30); 580 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s", 581 (s ? "Found" : "Did not find"), 582 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) 583 ? "anchored" : "floating"), 584 quoted, 585 RE_SV_TAIL(check), 586 (s ? " at offset " : "...\n") ); 587 }); 588 589 if (!s) 590 goto fail_finish; 591 /* Finish the diagnostic message */ 592 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) ); 593 594 /* XXX dmq: first branch is for positive lookbehind... 595 Our check string is offset from the beginning of the pattern. 596 So we need to do any stclass tests offset forward from that 597 point. I think. :-( 598 */ 599 600 601 602 check_at=s; 603 604 605 /* Got a candidate. Check MBOL anchoring, and the *other* substr. 606 Start with the other substr. 607 XXXX no SCREAM optimization yet - and a very coarse implementation 608 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will 609 *always* match. Probably should be marked during compile... 610 Probably it is right to do no SCREAM here... 611 */ 612 613 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) 614 : (prog->float_substr && prog->anchored_substr)) 615 { 616 /* Take into account the "other" substring. */ 617 /* XXXX May be hopelessly wrong for UTF... */ 618 if (!other_last) 619 other_last = strpos; 620 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) { 621 do_other_anchored: 622 { 623 char * const last = HOP3c(s, -start_shift, strbeg); 624 char *last1, *last2; 625 char * const saved_s = s; 626 SV* must; 627 628 t = s - prog->check_offset_max; 629 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */ 630 && (!do_utf8 631 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos)) 632 && t > strpos))) 633 NOOP; 634 else 635 t = strpos; 636 t = HOP3c(t, prog->anchored_offset, strend); 637 if (t < other_last) /* These positions already checked */ 638 t = other_last; 639 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg); 640 if (last < last1) 641 last1 = last; 642 /* XXXX It is not documented what units *_offsets are in. 643 We assume bytes, but this is clearly wrong. 644 Meaning this code needs to be carefully reviewed for errors. 645 dmq. 646 */ 647 648 /* On end-of-str: see comment below. */ 649 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr; 650 if (must == &PL_sv_undef) { 651 s = (char*)NULL; 652 DEBUG_r(must = prog->anchored_utf8); /* for debug */ 653 } 654 else 655 s = fbm_instr( 656 (unsigned char*)t, 657 HOP3(HOP3(last1, prog->anchored_offset, strend) 658 + SvCUR(must), -(SvTAIL(must)!=0), strbeg), 659 must, 660 multiline ? FBMrf_MULTILINE : 0 661 ); 662 DEBUG_EXECUTE_r({ 663 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0), 664 SvPVX_const(must), RE_SV_DUMPLEN(must), 30); 665 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s", 666 (s ? "Found" : "Contradicts"), 667 quoted, RE_SV_TAIL(must)); 668 }); 669 670 671 if (!s) { 672 if (last1 >= last2) { 673 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, 674 ", giving up...\n")); 675 goto fail_finish; 676 } 677 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, 678 ", trying floating at offset %ld...\n", 679 (long)(HOP3c(saved_s, 1, strend) - i_strpos))); 680 other_last = HOP3c(last1, prog->anchored_offset+1, strend); 681 s = HOP3c(last, 1, strend); 682 goto restart; 683 } 684 else { 685 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n", 686 (long)(s - i_strpos))); 687 t = HOP3c(s, -prog->anchored_offset, strbeg); 688 other_last = HOP3c(s, 1, strend); 689 s = saved_s; 690 if (t == strpos) 691 goto try_at_start; 692 goto try_at_offset; 693 } 694 } 695 } 696 else { /* Take into account the floating substring. */ 697 char *last, *last1; 698 char * const saved_s = s; 699 SV* must; 700 701 t = HOP3c(s, -start_shift, strbeg); 702 last1 = last = 703 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg); 704 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset) 705 last = HOP3c(t, prog->float_max_offset, strend); 706 s = HOP3c(t, prog->float_min_offset, strend); 707 if (s < other_last) 708 s = other_last; 709 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */ 710 must = do_utf8 ? prog->float_utf8 : prog->float_substr; 711 /* fbm_instr() takes into account exact value of end-of-str 712 if the check is SvTAIL(ed). Since false positives are OK, 713 and end-of-str is not later than strend we are OK. */ 714 if (must == &PL_sv_undef) { 715 s = (char*)NULL; 716 DEBUG_r(must = prog->float_utf8); /* for debug message */ 717 } 718 else 719 s = fbm_instr((unsigned char*)s, 720 (unsigned char*)last + SvCUR(must) 721 - (SvTAIL(must)!=0), 722 must, multiline ? FBMrf_MULTILINE : 0); 723 DEBUG_EXECUTE_r({ 724 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0), 725 SvPVX_const(must), RE_SV_DUMPLEN(must), 30); 726 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s", 727 (s ? "Found" : "Contradicts"), 728 quoted, RE_SV_TAIL(must)); 729 }); 730 if (!s) { 731 if (last1 == last) { 732 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, 733 ", giving up...\n")); 734 goto fail_finish; 735 } 736 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, 737 ", trying anchored starting at offset %ld...\n", 738 (long)(saved_s + 1 - i_strpos))); 739 other_last = last; 740 s = HOP3c(t, 1, strend); 741 goto restart; 742 } 743 else { 744 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n", 745 (long)(s - i_strpos))); 746 other_last = s; /* Fix this later. --Hugo */ 747 s = saved_s; 748 if (t == strpos) 749 goto try_at_start; 750 goto try_at_offset; 751 } 752 } 753 } 754 755 756 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos); 757 758 DEBUG_OPTIMISE_MORE_r( 759 PerlIO_printf(Perl_debug_log, 760 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n", 761 (IV)prog->check_offset_min, 762 (IV)prog->check_offset_max, 763 (IV)(s-strpos), 764 (IV)(t-strpos), 765 (IV)(t-s), 766 (IV)(strend-strpos) 767 ) 768 ); 769 770 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */ 771 && (!do_utf8 772 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos))) 773 && t > strpos))) 774 { 775 /* Fixed substring is found far enough so that the match 776 cannot start at strpos. */ 777 try_at_offset: 778 if (ml_anch && t[-1] != '\n') { 779 /* Eventually fbm_*() should handle this, but often 780 anchored_offset is not 0, so this check will not be wasted. */ 781 /* XXXX In the code below we prefer to look for "^" even in 782 presence of anchored substrings. And we search even 783 beyond the found float position. These pessimizations 784 are historical artefacts only. */ 785 find_anchor: 786 while (t < strend - prog->minlen) { 787 if (*t == '\n') { 788 if (t < check_at - prog->check_offset_min) { 789 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) { 790 /* Since we moved from the found position, 791 we definitely contradict the found anchored 792 substr. Due to the above check we do not 793 contradict "check" substr. 794 Thus we can arrive here only if check substr 795 is float. Redo checking for "other"=="fixed". 796 */ 797 strpos = t + 1; 798 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n", 799 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset))); 800 goto do_other_anchored; 801 } 802 /* We don't contradict the found floating substring. */ 803 /* XXXX Why not check for STCLASS? */ 804 s = t + 1; 805 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n", 806 PL_colors[0], PL_colors[1], (long)(s - i_strpos))); 807 goto set_useful; 808 } 809 /* Position contradicts check-string */ 810 /* XXXX probably better to look for check-string 811 than for "\n", so one should lower the limit for t? */ 812 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n", 813 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos))); 814 other_last = strpos = s = t + 1; 815 goto restart; 816 } 817 t++; 818 } 819 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n", 820 PL_colors[0], PL_colors[1])); 821 goto fail_finish; 822 } 823 else { 824 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n", 825 PL_colors[0], PL_colors[1])); 826 } 827 s = t; 828 set_useful: 829 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */ 830 } 831 else { 832 /* The found string does not prohibit matching at strpos, 833 - no optimization of calling REx engine can be performed, 834 unless it was an MBOL and we are not after MBOL, 835 or a future STCLASS check will fail this. */ 836 try_at_start: 837 /* Even in this situation we may use MBOL flag if strpos is offset 838 wrt the start of the string. */ 839 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */ 840 && (strpos != strbeg) && strpos[-1] != '\n' 841 /* May be due to an implicit anchor of m{.*foo} */ 842 && !(prog->intflags & PREGf_IMPLICIT)) 843 { 844 t = strpos; 845 goto find_anchor; 846 } 847 DEBUG_EXECUTE_r( if (ml_anch) 848 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n", 849 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]); 850 ); 851 success_at_start: 852 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */ 853 && (do_utf8 ? ( 854 prog->check_utf8 /* Could be deleted already */ 855 && --BmUSEFUL(prog->check_utf8) < 0 856 && (prog->check_utf8 == prog->float_utf8) 857 ) : ( 858 prog->check_substr /* Could be deleted already */ 859 && --BmUSEFUL(prog->check_substr) < 0 860 && (prog->check_substr == prog->float_substr) 861 ))) 862 { 863 /* If flags & SOMETHING - do not do it many times on the same match */ 864 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n")); 865 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr); 866 if (do_utf8 ? prog->check_substr : prog->check_utf8) 867 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8); 868 prog->check_substr = prog->check_utf8 = NULL; /* disable */ 869 prog->float_substr = prog->float_utf8 = NULL; /* clear */ 870 check = NULL; /* abort */ 871 s = strpos; 872 /* XXXX This is a remnant of the old implementation. It 873 looks wasteful, since now INTUIT can use many 874 other heuristics. */ 875 prog->extflags &= ~RXf_USE_INTUIT; 876 } 877 else 878 s = strpos; 879 } 880 881 /* Last resort... */ 882 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */ 883 /* trie stclasses are too expensive to use here, we are better off to 884 leave it to regmatch itself */ 885 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) { 886 /* minlen == 0 is possible if regstclass is \b or \B, 887 and the fixed substr is ''$. 888 Since minlen is already taken into account, s+1 is before strend; 889 accidentally, minlen >= 1 guaranties no false positives at s + 1 890 even for \b or \B. But (minlen? 1 : 0) below assumes that 891 regstclass does not come from lookahead... */ 892 /* If regstclass takes bytelength more than 1: If charlength==1, OK. 893 This leaves EXACTF only, which is dealt with in find_byclass(). */ 894 const U8* const str = (U8*)STRING(progi->regstclass); 895 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT 896 ? CHR_DIST(str+STR_LEN(progi->regstclass), str) 897 : 1); 898 char * endpos; 899 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch) 900 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend); 901 else if (prog->float_substr || prog->float_utf8) 902 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend); 903 else 904 endpos= strend; 905 906 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n", 907 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg))); 908 909 t = s; 910 s = find_byclass(prog, progi->regstclass, s, endpos, NULL); 911 if (!s) { 912 #ifdef DEBUGGING 913 const char *what = NULL; 914 #endif 915 if (endpos == strend) { 916 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, 917 "Could not match STCLASS...\n") ); 918 goto fail; 919 } 920 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, 921 "This position contradicts STCLASS...\n") ); 922 if ((prog->extflags & RXf_ANCH) && !ml_anch) 923 goto fail; 924 /* Contradict one of substrings */ 925 if (prog->anchored_substr || prog->anchored_utf8) { 926 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) { 927 DEBUG_EXECUTE_r( what = "anchored" ); 928 hop_and_restart: 929 s = HOP3c(t, 1, strend); 930 if (s + start_shift + end_shift > strend) { 931 /* XXXX Should be taken into account earlier? */ 932 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, 933 "Could not match STCLASS...\n") ); 934 goto fail; 935 } 936 if (!check) 937 goto giveup; 938 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, 939 "Looking for %s substr starting at offset %ld...\n", 940 what, (long)(s + start_shift - i_strpos)) ); 941 goto restart; 942 } 943 /* Have both, check_string is floating */ 944 if (t + start_shift >= check_at) /* Contradicts floating=check */ 945 goto retry_floating_check; 946 /* Recheck anchored substring, but not floating... */ 947 s = check_at; 948 if (!check) 949 goto giveup; 950 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, 951 "Looking for anchored substr starting at offset %ld...\n", 952 (long)(other_last - i_strpos)) ); 953 goto do_other_anchored; 954 } 955 /* Another way we could have checked stclass at the 956 current position only: */ 957 if (ml_anch) { 958 s = t = t + 1; 959 if (!check) 960 goto giveup; 961 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, 962 "Looking for /%s^%s/m starting at offset %ld...\n", 963 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) ); 964 goto try_at_offset; 965 } 966 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */ 967 goto fail; 968 /* Check is floating subtring. */ 969 retry_floating_check: 970 t = check_at - start_shift; 971 DEBUG_EXECUTE_r( what = "floating" ); 972 goto hop_and_restart; 973 } 974 if (t != s) { 975 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, 976 "By STCLASS: moving %ld --> %ld\n", 977 (long)(t - i_strpos), (long)(s - i_strpos)) 978 ); 979 } 980 else { 981 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, 982 "Does not contradict STCLASS...\n"); 983 ); 984 } 985 } 986 giveup: 987 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n", 988 PL_colors[4], (check ? "Guessed" : "Giving up"), 989 PL_colors[5], (long)(s - i_strpos)) ); 990 return s; 991 992 fail_finish: /* Substring not found */ 993 if (prog->check_substr || prog->check_utf8) /* could be removed already */ 994 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */ 995 fail: 996 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n", 997 PL_colors[4], PL_colors[5])); 998 return NULL; 999 } 1000 1001 #define DECL_TRIE_TYPE(scan) \ 1002 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \ 1003 trie_type = (scan->flags != EXACT) \ 1004 ? (do_utf8 ? trie_utf8_fold : (UTF ? trie_latin_utf8_fold : trie_plain)) \ 1005 : (do_utf8 ? trie_utf8 : trie_plain) 1006 1007 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \ 1008 uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \ 1009 switch (trie_type) { \ 1010 case trie_utf8_fold: \ 1011 if ( foldlen>0 ) { \ 1012 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \ 1013 foldlen -= len; \ 1014 uscan += len; \ 1015 len=0; \ 1016 } else { \ 1017 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \ 1018 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \ 1019 foldlen -= UNISKIP( uvc ); \ 1020 uscan = foldbuf + UNISKIP( uvc ); \ 1021 } \ 1022 break; \ 1023 case trie_latin_utf8_fold: \ 1024 if ( foldlen>0 ) { \ 1025 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \ 1026 foldlen -= len; \ 1027 uscan += len; \ 1028 len=0; \ 1029 } else { \ 1030 len = 1; \ 1031 uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen ); \ 1032 foldlen -= UNISKIP( uvc ); \ 1033 uscan = foldbuf + UNISKIP( uvc ); \ 1034 } \ 1035 break; \ 1036 case trie_utf8: \ 1037 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \ 1038 break; \ 1039 case trie_plain: \ 1040 uvc = (UV)*uc; \ 1041 len = 1; \ 1042 } \ 1043 if (uvc < 256) { \ 1044 charid = trie->charmap[ uvc ]; \ 1045 } \ 1046 else { \ 1047 charid = 0; \ 1048 if (widecharmap) { \ 1049 SV** const svpp = hv_fetch(widecharmap, \ 1050 (char*)&uvc, sizeof(UV), 0); \ 1051 if (svpp) \ 1052 charid = (U16)SvIV(*svpp); \ 1053 } \ 1054 } \ 1055 } STMT_END 1056 1057 #define REXEC_FBC_EXACTISH_CHECK(CoNd) \ 1058 { \ 1059 char *my_strend= (char *)strend; \ 1060 if ( (CoNd) \ 1061 && (ln == len || \ 1062 !ibcmp_utf8(s, &my_strend, 0, do_utf8, \ 1063 m, NULL, ln, (bool)UTF)) \ 1064 && (!reginfo || regtry(reginfo, &s)) ) \ 1065 goto got_it; \ 1066 else { \ 1067 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \ 1068 uvchr_to_utf8(tmpbuf, c); \ 1069 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \ 1070 if ( f != c \ 1071 && (f == c1 || f == c2) \ 1072 && (ln == len || \ 1073 !ibcmp_utf8(s, &my_strend, 0, do_utf8,\ 1074 m, NULL, ln, (bool)UTF)) \ 1075 && (!reginfo || regtry(reginfo, &s)) ) \ 1076 goto got_it; \ 1077 } \ 1078 } \ 1079 s += len 1080 1081 #define REXEC_FBC_EXACTISH_SCAN(CoNd) \ 1082 STMT_START { \ 1083 while (s <= e) { \ 1084 if ( (CoNd) \ 1085 && (ln == 1 || !(OP(c) == EXACTF \ 1086 ? ibcmp(s, m, ln) \ 1087 : ibcmp_locale(s, m, ln))) \ 1088 && (!reginfo || regtry(reginfo, &s)) ) \ 1089 goto got_it; \ 1090 s++; \ 1091 } \ 1092 } STMT_END 1093 1094 #define REXEC_FBC_UTF8_SCAN(CoDe) \ 1095 STMT_START { \ 1096 while (s + (uskip = UTF8SKIP(s)) <= strend) { \ 1097 CoDe \ 1098 s += uskip; \ 1099 } \ 1100 } STMT_END 1101 1102 #define REXEC_FBC_SCAN(CoDe) \ 1103 STMT_START { \ 1104 while (s < strend) { \ 1105 CoDe \ 1106 s++; \ 1107 } \ 1108 } STMT_END 1109 1110 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \ 1111 REXEC_FBC_UTF8_SCAN( \ 1112 if (CoNd) { \ 1113 if (tmp && (!reginfo || regtry(reginfo, &s))) \ 1114 goto got_it; \ 1115 else \ 1116 tmp = doevery; \ 1117 } \ 1118 else \ 1119 tmp = 1; \ 1120 ) 1121 1122 #define REXEC_FBC_CLASS_SCAN(CoNd) \ 1123 REXEC_FBC_SCAN( \ 1124 if (CoNd) { \ 1125 if (tmp && (!reginfo || regtry(reginfo, &s))) \ 1126 goto got_it; \ 1127 else \ 1128 tmp = doevery; \ 1129 } \ 1130 else \ 1131 tmp = 1; \ 1132 ) 1133 1134 #define REXEC_FBC_TRYIT \ 1135 if ((!reginfo || regtry(reginfo, &s))) \ 1136 goto got_it 1137 1138 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \ 1139 if (do_utf8) { \ 1140 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \ 1141 } \ 1142 else { \ 1143 REXEC_FBC_CLASS_SCAN(CoNd); \ 1144 } \ 1145 break 1146 1147 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \ 1148 if (do_utf8) { \ 1149 UtFpReLoAd; \ 1150 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \ 1151 } \ 1152 else { \ 1153 REXEC_FBC_CLASS_SCAN(CoNd); \ 1154 } \ 1155 break 1156 1157 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \ 1158 PL_reg_flags |= RF_tainted; \ 1159 if (do_utf8) { \ 1160 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \ 1161 } \ 1162 else { \ 1163 REXEC_FBC_CLASS_SCAN(CoNd); \ 1164 } \ 1165 break 1166 1167 #define DUMP_EXEC_POS(li,s,doutf8) \ 1168 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8) 1169 1170 /* We know what class REx starts with. Try to find this position... */ 1171 /* if reginfo is NULL, its a dryrun */ 1172 /* annoyingly all the vars in this routine have different names from their counterparts 1173 in regmatch. /grrr */ 1174 1175 STATIC char * 1176 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, 1177 const char *strend, regmatch_info *reginfo) 1178 { 1179 dVAR; 1180 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0; 1181 char *m; 1182 STRLEN ln; 1183 STRLEN lnc; 1184 register STRLEN uskip; 1185 unsigned int c1; 1186 unsigned int c2; 1187 char *e; 1188 register I32 tmp = 1; /* Scratch variable? */ 1189 register const bool do_utf8 = PL_reg_match_utf8; 1190 RXi_GET_DECL(prog,progi); 1191 1192 PERL_ARGS_ASSERT_FIND_BYCLASS; 1193 1194 /* We know what class it must start with. */ 1195 switch (OP(c)) { 1196 case ANYOF: 1197 if (do_utf8) { 1198 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) || 1199 !UTF8_IS_INVARIANT((U8)s[0]) ? 1200 reginclass(prog, c, (U8*)s, 0, do_utf8) : 1201 REGINCLASS(prog, c, (U8*)s)); 1202 } 1203 else { 1204 while (s < strend) { 1205 STRLEN skip = 1; 1206 1207 if (REGINCLASS(prog, c, (U8*)s) || 1208 (ANYOF_FOLD_SHARP_S(c, s, strend) && 1209 /* The assignment of 2 is intentional: 1210 * for the folded sharp s, the skip is 2. */ 1211 (skip = SHARP_S_SKIP))) { 1212 if (tmp && (!reginfo || regtry(reginfo, &s))) 1213 goto got_it; 1214 else 1215 tmp = doevery; 1216 } 1217 else 1218 tmp = 1; 1219 s += skip; 1220 } 1221 } 1222 break; 1223 case CANY: 1224 REXEC_FBC_SCAN( 1225 if (tmp && (!reginfo || regtry(reginfo, &s))) 1226 goto got_it; 1227 else 1228 tmp = doevery; 1229 ); 1230 break; 1231 case EXACTF: 1232 m = STRING(c); 1233 ln = STR_LEN(c); /* length to match in octets/bytes */ 1234 lnc = (I32) ln; /* length to match in characters */ 1235 if (UTF) { 1236 STRLEN ulen1, ulen2; 1237 U8 *sm = (U8 *) m; 1238 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1]; 1239 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1]; 1240 /* used by commented-out code below */ 1241 /*const U32 uniflags = UTF8_ALLOW_DEFAULT;*/ 1242 1243 /* XXX: Since the node will be case folded at compile 1244 time this logic is a little odd, although im not 1245 sure that its actually wrong. --dmq */ 1246 1247 c1 = to_utf8_lower((U8*)m, tmpbuf1, &ulen1); 1248 c2 = to_utf8_upper((U8*)m, tmpbuf2, &ulen2); 1249 1250 /* XXX: This is kinda strange. to_utf8_XYZ returns the 1251 codepoint of the first character in the converted 1252 form, yet originally we did the extra step. 1253 No tests fail by commenting this code out however 1254 so Ive left it out. -- dmq. 1255 1256 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE, 1257 0, uniflags); 1258 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE, 1259 0, uniflags); 1260 */ 1261 1262 lnc = 0; 1263 while (sm < ((U8 *) m + ln)) { 1264 lnc++; 1265 sm += UTF8SKIP(sm); 1266 } 1267 } 1268 else { 1269 c1 = *(U8*)m; 1270 c2 = PL_fold[c1]; 1271 } 1272 goto do_exactf; 1273 case EXACTFL: 1274 m = STRING(c); 1275 ln = STR_LEN(c); 1276 lnc = (I32) ln; 1277 c1 = *(U8*)m; 1278 c2 = PL_fold_locale[c1]; 1279 do_exactf: 1280 e = HOP3c(strend, -((I32)lnc), s); 1281 1282 if (!reginfo && e < s) 1283 e = s; /* Due to minlen logic of intuit() */ 1284 1285 /* The idea in the EXACTF* cases is to first find the 1286 * first character of the EXACTF* node and then, if 1287 * necessary, case-insensitively compare the full 1288 * text of the node. The c1 and c2 are the first 1289 * characters (though in Unicode it gets a bit 1290 * more complicated because there are more cases 1291 * than just upper and lower: one needs to use 1292 * the so-called folding case for case-insensitive 1293 * matching (called "loose matching" in Unicode). 1294 * ibcmp_utf8() will do just that. */ 1295 1296 if (do_utf8 || UTF) { 1297 UV c, f; 1298 U8 tmpbuf [UTF8_MAXBYTES+1]; 1299 STRLEN len = 1; 1300 STRLEN foldlen; 1301 const U32 uniflags = UTF8_ALLOW_DEFAULT; 1302 if (c1 == c2) { 1303 /* Upper and lower of 1st char are equal - 1304 * probably not a "letter". */ 1305 while (s <= e) { 1306 if (do_utf8) { 1307 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len, 1308 uniflags); 1309 } else { 1310 c = *((U8*)s); 1311 } 1312 REXEC_FBC_EXACTISH_CHECK(c == c1); 1313 } 1314 } 1315 else { 1316 while (s <= e) { 1317 if (do_utf8) { 1318 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len, 1319 uniflags); 1320 } else { 1321 c = *((U8*)s); 1322 } 1323 1324 /* Handle some of the three Greek sigmas cases. 1325 * Note that not all the possible combinations 1326 * are handled here: some of them are handled 1327 * by the standard folding rules, and some of 1328 * them (the character class or ANYOF cases) 1329 * are handled during compiletime in 1330 * regexec.c:S_regclass(). */ 1331 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA || 1332 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) 1333 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA; 1334 1335 REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2); 1336 } 1337 } 1338 } 1339 else { 1340 /* Neither pattern nor string are UTF8 */ 1341 if (c1 == c2) 1342 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1); 1343 else 1344 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2); 1345 } 1346 break; 1347 case BOUNDL: 1348 PL_reg_flags |= RF_tainted; 1349 /* FALL THROUGH */ 1350 case BOUND: 1351 if (do_utf8) { 1352 if (s == PL_bostr) 1353 tmp = '\n'; 1354 else { 1355 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); 1356 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); 1357 } 1358 tmp = ((OP(c) == BOUND ? 1359 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0); 1360 LOAD_UTF8_CHARCLASS_ALNUM(); 1361 REXEC_FBC_UTF8_SCAN( 1362 if (tmp == !(OP(c) == BOUND ? 1363 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) : 1364 isALNUM_LC_utf8((U8*)s))) 1365 { 1366 tmp = !tmp; 1367 REXEC_FBC_TRYIT; 1368 } 1369 ); 1370 } 1371 else { 1372 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; 1373 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0); 1374 REXEC_FBC_SCAN( 1375 if (tmp == 1376 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) { 1377 tmp = !tmp; 1378 REXEC_FBC_TRYIT; 1379 } 1380 ); 1381 } 1382 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s))) 1383 goto got_it; 1384 break; 1385 case NBOUNDL: 1386 PL_reg_flags |= RF_tainted; 1387 /* FALL THROUGH */ 1388 case NBOUND: 1389 if (do_utf8) { 1390 if (s == PL_bostr) 1391 tmp = '\n'; 1392 else { 1393 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); 1394 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); 1395 } 1396 tmp = ((OP(c) == NBOUND ? 1397 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0); 1398 LOAD_UTF8_CHARCLASS_ALNUM(); 1399 REXEC_FBC_UTF8_SCAN( 1400 if (tmp == !(OP(c) == NBOUND ? 1401 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) : 1402 isALNUM_LC_utf8((U8*)s))) 1403 tmp = !tmp; 1404 else REXEC_FBC_TRYIT; 1405 ); 1406 } 1407 else { 1408 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; 1409 tmp = ((OP(c) == NBOUND ? 1410 isALNUM(tmp) : isALNUM_LC(tmp)) != 0); 1411 REXEC_FBC_SCAN( 1412 if (tmp == 1413 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s))) 1414 tmp = !tmp; 1415 else REXEC_FBC_TRYIT; 1416 ); 1417 } 1418 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, &s))) 1419 goto got_it; 1420 break; 1421 case ALNUM: 1422 REXEC_FBC_CSCAN_PRELOAD( 1423 LOAD_UTF8_CHARCLASS_ALNUM(), 1424 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8), 1425 isALNUM(*s) 1426 ); 1427 case ALNUML: 1428 REXEC_FBC_CSCAN_TAINT( 1429 isALNUM_LC_utf8((U8*)s), 1430 isALNUM_LC(*s) 1431 ); 1432 case NALNUM: 1433 REXEC_FBC_CSCAN_PRELOAD( 1434 LOAD_UTF8_CHARCLASS_ALNUM(), 1435 !swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8), 1436 !isALNUM(*s) 1437 ); 1438 case NALNUML: 1439 REXEC_FBC_CSCAN_TAINT( 1440 !isALNUM_LC_utf8((U8*)s), 1441 !isALNUM_LC(*s) 1442 ); 1443 case SPACE: 1444 REXEC_FBC_CSCAN_PRELOAD( 1445 LOAD_UTF8_CHARCLASS_SPACE(), 1446 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8), 1447 isSPACE(*s) 1448 ); 1449 case SPACEL: 1450 REXEC_FBC_CSCAN_TAINT( 1451 *s == ' ' || isSPACE_LC_utf8((U8*)s), 1452 isSPACE_LC(*s) 1453 ); 1454 case NSPACE: 1455 REXEC_FBC_CSCAN_PRELOAD( 1456 LOAD_UTF8_CHARCLASS_SPACE(), 1457 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)), 1458 !isSPACE(*s) 1459 ); 1460 case NSPACEL: 1461 REXEC_FBC_CSCAN_TAINT( 1462 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)), 1463 !isSPACE_LC(*s) 1464 ); 1465 case DIGIT: 1466 REXEC_FBC_CSCAN_PRELOAD( 1467 LOAD_UTF8_CHARCLASS_DIGIT(), 1468 swash_fetch(PL_utf8_digit,(U8*)s, do_utf8), 1469 isDIGIT(*s) 1470 ); 1471 case DIGITL: 1472 REXEC_FBC_CSCAN_TAINT( 1473 isDIGIT_LC_utf8((U8*)s), 1474 isDIGIT_LC(*s) 1475 ); 1476 case NDIGIT: 1477 REXEC_FBC_CSCAN_PRELOAD( 1478 LOAD_UTF8_CHARCLASS_DIGIT(), 1479 !swash_fetch(PL_utf8_digit,(U8*)s, do_utf8), 1480 !isDIGIT(*s) 1481 ); 1482 case NDIGITL: 1483 REXEC_FBC_CSCAN_TAINT( 1484 !isDIGIT_LC_utf8((U8*)s), 1485 !isDIGIT_LC(*s) 1486 ); 1487 case LNBREAK: 1488 REXEC_FBC_CSCAN( 1489 is_LNBREAK_utf8(s), 1490 is_LNBREAK_latin1(s) 1491 ); 1492 case VERTWS: 1493 REXEC_FBC_CSCAN( 1494 is_VERTWS_utf8(s), 1495 is_VERTWS_latin1(s) 1496 ); 1497 case NVERTWS: 1498 REXEC_FBC_CSCAN( 1499 !is_VERTWS_utf8(s), 1500 !is_VERTWS_latin1(s) 1501 ); 1502 case HORIZWS: 1503 REXEC_FBC_CSCAN( 1504 is_HORIZWS_utf8(s), 1505 is_HORIZWS_latin1(s) 1506 ); 1507 case NHORIZWS: 1508 REXEC_FBC_CSCAN( 1509 !is_HORIZWS_utf8(s), 1510 !is_HORIZWS_latin1(s) 1511 ); 1512 case AHOCORASICKC: 1513 case AHOCORASICK: 1514 { 1515 DECL_TRIE_TYPE(c); 1516 /* what trie are we using right now */ 1517 reg_ac_data *aho 1518 = (reg_ac_data*)progi->data->data[ ARG( c ) ]; 1519 reg_trie_data *trie 1520 = (reg_trie_data*)progi->data->data[ aho->trie ]; 1521 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]); 1522 1523 const char *last_start = strend - trie->minlen; 1524 #ifdef DEBUGGING 1525 const char *real_start = s; 1526 #endif 1527 STRLEN maxlen = trie->maxlen; 1528 SV *sv_points; 1529 U8 **points; /* map of where we were in the input string 1530 when reading a given char. For ASCII this 1531 is unnecessary overhead as the relationship 1532 is always 1:1, but for Unicode, especially 1533 case folded Unicode this is not true. */ 1534 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; 1535 U8 *bitmap=NULL; 1536 1537 1538 GET_RE_DEBUG_FLAGS_DECL; 1539 1540 /* We can't just allocate points here. We need to wrap it in 1541 * an SV so it gets freed properly if there is a croak while 1542 * running the match */ 1543 ENTER; 1544 SAVETMPS; 1545 sv_points=newSV(maxlen * sizeof(U8 *)); 1546 SvCUR_set(sv_points, 1547 maxlen * sizeof(U8 *)); 1548 SvPOK_on(sv_points); 1549 sv_2mortal(sv_points); 1550 points=(U8**)SvPV_nolen(sv_points ); 1551 if ( trie_type != trie_utf8_fold 1552 && (trie->bitmap || OP(c)==AHOCORASICKC) ) 1553 { 1554 if (trie->bitmap) 1555 bitmap=(U8*)trie->bitmap; 1556 else 1557 bitmap=(U8*)ANYOF_BITMAP(c); 1558 } 1559 /* this is the Aho-Corasick algorithm modified a touch 1560 to include special handling for long "unknown char" 1561 sequences. The basic idea being that we use AC as long 1562 as we are dealing with a possible matching char, when 1563 we encounter an unknown char (and we have not encountered 1564 an accepting state) we scan forward until we find a legal 1565 starting char. 1566 AC matching is basically that of trie matching, except 1567 that when we encounter a failing transition, we fall back 1568 to the current states "fail state", and try the current char 1569 again, a process we repeat until we reach the root state, 1570 state 1, or a legal transition. If we fail on the root state 1571 then we can either terminate if we have reached an accepting 1572 state previously, or restart the entire process from the beginning 1573 if we have not. 1574 1575 */ 1576 while (s <= last_start) { 1577 const U32 uniflags = UTF8_ALLOW_DEFAULT; 1578 U8 *uc = (U8*)s; 1579 U16 charid = 0; 1580 U32 base = 1; 1581 U32 state = 1; 1582 UV uvc = 0; 1583 STRLEN len = 0; 1584 STRLEN foldlen = 0; 1585 U8 *uscan = (U8*)NULL; 1586 U8 *leftmost = NULL; 1587 #ifdef DEBUGGING 1588 U32 accepted_word= 0; 1589 #endif 1590 U32 pointpos = 0; 1591 1592 while ( state && uc <= (U8*)strend ) { 1593 int failed=0; 1594 U32 word = aho->states[ state ].wordnum; 1595 1596 if( state==1 ) { 1597 if ( bitmap ) { 1598 DEBUG_TRIE_EXECUTE_r( 1599 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) { 1600 dump_exec_pos( (char *)uc, c, strend, real_start, 1601 (char *)uc, do_utf8 ); 1602 PerlIO_printf( Perl_debug_log, 1603 " Scanning for legal start char...\n"); 1604 } 1605 ); 1606 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) { 1607 uc++; 1608 } 1609 s= (char *)uc; 1610 } 1611 if (uc >(U8*)last_start) break; 1612 } 1613 1614 if ( word ) { 1615 U8 *lpos= points[ (pointpos - trie->wordlen[word-1] ) % maxlen ]; 1616 if (!leftmost || lpos < leftmost) { 1617 DEBUG_r(accepted_word=word); 1618 leftmost= lpos; 1619 } 1620 if (base==0) break; 1621 1622 } 1623 points[pointpos++ % maxlen]= uc; 1624 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, 1625 uscan, len, uvc, charid, foldlen, 1626 foldbuf, uniflags); 1627 DEBUG_TRIE_EXECUTE_r({ 1628 dump_exec_pos( (char *)uc, c, strend, real_start, 1629 s, do_utf8 ); 1630 PerlIO_printf(Perl_debug_log, 1631 " Charid:%3u CP:%4"UVxf" ", 1632 charid, uvc); 1633 }); 1634 1635 do { 1636 #ifdef DEBUGGING 1637 word = aho->states[ state ].wordnum; 1638 #endif 1639 base = aho->states[ state ].trans.base; 1640 1641 DEBUG_TRIE_EXECUTE_r({ 1642 if (failed) 1643 dump_exec_pos( (char *)uc, c, strend, real_start, 1644 s, do_utf8 ); 1645 PerlIO_printf( Perl_debug_log, 1646 "%sState: %4"UVxf", word=%"UVxf, 1647 failed ? " Fail transition to " : "", 1648 (UV)state, (UV)word); 1649 }); 1650 if ( base ) { 1651 U32 tmp; 1652 if (charid && 1653 (base + charid > trie->uniquecharcount ) 1654 && (base + charid - 1 - trie->uniquecharcount 1655 < trie->lasttrans) 1656 && trie->trans[base + charid - 1 - 1657 trie->uniquecharcount].check == state 1658 && (tmp=trie->trans[base + charid - 1 - 1659 trie->uniquecharcount ].next)) 1660 { 1661 DEBUG_TRIE_EXECUTE_r( 1662 PerlIO_printf( Perl_debug_log," - legal\n")); 1663 state = tmp; 1664 break; 1665 } 1666 else { 1667 DEBUG_TRIE_EXECUTE_r( 1668 PerlIO_printf( Perl_debug_log," - fail\n")); 1669 failed = 1; 1670 state = aho->fail[state]; 1671 } 1672 } 1673 else { 1674 /* we must be accepting here */ 1675 DEBUG_TRIE_EXECUTE_r( 1676 PerlIO_printf( Perl_debug_log," - accepting\n")); 1677 failed = 1; 1678 break; 1679 } 1680 } while(state); 1681 uc += len; 1682 if (failed) { 1683 if (leftmost) 1684 break; 1685 if (!state) state = 1; 1686 } 1687 } 1688 if ( aho->states[ state ].wordnum ) { 1689 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ]; 1690 if (!leftmost || lpos < leftmost) { 1691 DEBUG_r(accepted_word=aho->states[ state ].wordnum); 1692 leftmost = lpos; 1693 } 1694 } 1695 if (leftmost) { 1696 s = (char*)leftmost; 1697 DEBUG_TRIE_EXECUTE_r({ 1698 PerlIO_printf( 1699 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n", 1700 (UV)accepted_word, (IV)(s - real_start) 1701 ); 1702 }); 1703 if (!reginfo || regtry(reginfo, &s)) { 1704 FREETMPS; 1705 LEAVE; 1706 goto got_it; 1707 } 1708 s = HOPc(s,1); 1709 DEBUG_TRIE_EXECUTE_r({ 1710 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n"); 1711 }); 1712 } else { 1713 DEBUG_TRIE_EXECUTE_r( 1714 PerlIO_printf( Perl_debug_log,"No match.\n")); 1715 break; 1716 } 1717 } 1718 FREETMPS; 1719 LEAVE; 1720 } 1721 break; 1722 default: 1723 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c)); 1724 break; 1725 } 1726 return 0; 1727 got_it: 1728 return s; 1729 } 1730 1731 static void 1732 S_swap_match_buff (pTHX_ regexp *prog) 1733 { 1734 regexp_paren_pair *t; 1735 1736 PERL_ARGS_ASSERT_SWAP_MATCH_BUFF; 1737 1738 if (!prog->swap) { 1739 /* We have to be careful. If the previous successful match 1740 was from this regex we don't want a subsequent paritally 1741 successful match to clobber the old results. 1742 So when we detect this possibility we add a swap buffer 1743 to the re, and switch the buffer each match. If we fail 1744 we switch it back, otherwise we leave it swapped. 1745 */ 1746 Newxz(prog->swap, (prog->nparens + 1), regexp_paren_pair); 1747 } 1748 t = prog->swap; 1749 prog->swap = prog->offs; 1750 prog->offs = t; 1751 } 1752 1753 1754 /* 1755 - regexec_flags - match a regexp against a string 1756 */ 1757 I32 1758 Perl_regexec_flags(pTHX_ REGEXP * const prog, char *stringarg, register char *strend, 1759 char *strbeg, I32 minend, SV *sv, void *data, U32 flags) 1760 /* strend: pointer to null at end of string */ 1761 /* strbeg: real beginning of string */ 1762 /* minend: end of match must be >=minend after stringarg. */ 1763 /* data: May be used for some additional optimizations. 1764 Currently its only used, with a U32 cast, for transmitting 1765 the ganch offset when doing a /g match. This will change */ 1766 /* nosave: For optimizations. */ 1767 { 1768 dVAR; 1769 /*register*/ char *s; 1770 register regnode *c; 1771 /*register*/ char *startpos = stringarg; 1772 I32 minlen; /* must match at least this many chars */ 1773 I32 dontbother = 0; /* how many characters not to try at end */ 1774 I32 end_shift = 0; /* Same for the end. */ /* CC */ 1775 I32 scream_pos = -1; /* Internal iterator of scream. */ 1776 char *scream_olds = NULL; 1777 const bool do_utf8 = (bool)DO_UTF8(sv); 1778 I32 multiline; 1779 RXi_GET_DECL(prog,progi); 1780 regmatch_info reginfo; /* create some info to pass to regtry etc */ 1781 bool swap_on_fail = 0; 1782 GET_RE_DEBUG_FLAGS_DECL; 1783 1784 PERL_ARGS_ASSERT_REGEXEC_FLAGS; 1785 PERL_UNUSED_ARG(data); 1786 1787 /* Be paranoid... */ 1788 if (prog == NULL || startpos == NULL) { 1789 Perl_croak(aTHX_ "NULL regexp parameter"); 1790 return 0; 1791 } 1792 1793 multiline = prog->extflags & RXf_PMf_MULTILINE; 1794 reginfo.prog = prog; 1795 1796 RX_MATCH_UTF8_set(prog, do_utf8); 1797 DEBUG_EXECUTE_r( 1798 debug_start_match(prog, do_utf8, startpos, strend, 1799 "Matching"); 1800 ); 1801 1802 minlen = prog->minlen; 1803 1804 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) { 1805 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, 1806 "String too short [regexec_flags]...\n")); 1807 goto phooey; 1808 } 1809 1810 1811 /* Check validity of program. */ 1812 if (UCHARAT(progi->program) != REG_MAGIC) { 1813 Perl_croak(aTHX_ "corrupted regexp program"); 1814 } 1815 1816 PL_reg_flags = 0; 1817 PL_reg_eval_set = 0; 1818 PL_reg_maxiter = 0; 1819 1820 if (RX_UTF8(prog)) 1821 PL_reg_flags |= RF_utf8; 1822 1823 /* Mark beginning of line for ^ and lookbehind. */ 1824 reginfo.bol = startpos; /* XXX not used ??? */ 1825 PL_bostr = strbeg; 1826 reginfo.sv = sv; 1827 1828 /* Mark end of line for $ (and such) */ 1829 PL_regeol = strend; 1830 1831 /* see how far we have to get to not match where we matched before */ 1832 reginfo.till = startpos+minend; 1833 1834 /* If there is a "must appear" string, look for it. */ 1835 s = startpos; 1836 1837 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */ 1838 MAGIC *mg; 1839 1840 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */ 1841 reginfo.ganch = startpos + prog->gofs; 1842 else if (sv && SvTYPE(sv) >= SVt_PVMG 1843 && SvMAGIC(sv) 1844 && (mg = mg_find(sv, PERL_MAGIC_regex_global)) 1845 && mg->mg_len >= 0) { 1846 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */ 1847 if (prog->extflags & RXf_ANCH_GPOS) { 1848 if (s > reginfo.ganch) 1849 goto phooey; 1850 s = reginfo.ganch - prog->gofs; 1851 } 1852 } 1853 else if (data) { 1854 reginfo.ganch = strbeg + PTR2UV(data); 1855 } else /* pos() not defined */ 1856 reginfo.ganch = strbeg; 1857 } 1858 if (PL_curpm && (PM_GETRE(PL_curpm) == prog)) { 1859 swap_on_fail = 1; 1860 swap_match_buff(prog); /* do we need a save destructor here for 1861 eval dies? */ 1862 } 1863 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) { 1864 re_scream_pos_data d; 1865 1866 d.scream_olds = &scream_olds; 1867 d.scream_pos = &scream_pos; 1868 s = re_intuit_start(prog, sv, s, strend, flags, &d); 1869 if (!s) { 1870 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n")); 1871 goto phooey; /* not present */ 1872 } 1873 } 1874 1875 1876 1877 /* Simplest case: anchored match need be tried only once. */ 1878 /* [unless only anchor is BOL and multiline is set] */ 1879 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) { 1880 if (s == startpos && regtry(®info, &startpos)) 1881 goto got_it; 1882 else if (multiline || (prog->intflags & PREGf_IMPLICIT) 1883 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */ 1884 { 1885 char *end; 1886 1887 if (minlen) 1888 dontbother = minlen - 1; 1889 end = HOP3c(strend, -dontbother, strbeg) - 1; 1890 /* for multiline we only have to try after newlines */ 1891 if (prog->check_substr || prog->check_utf8) { 1892 if (s == startpos) 1893 goto after_try; 1894 while (1) { 1895 if (regtry(®info, &s)) 1896 goto got_it; 1897 after_try: 1898 if (s > end) 1899 goto phooey; 1900 if (prog->extflags & RXf_USE_INTUIT) { 1901 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL); 1902 if (!s) 1903 goto phooey; 1904 } 1905 else 1906 s++; 1907 } 1908 } else { 1909 if (s > startpos) 1910 s--; 1911 while (s < end) { 1912 if (*s++ == '\n') { /* don't need PL_utf8skip here */ 1913 if (regtry(®info, &s)) 1914 goto got_it; 1915 } 1916 } 1917 } 1918 } 1919 goto phooey; 1920 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK)) 1921 { 1922 /* the warning about reginfo.ganch being used without intialization 1923 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN 1924 and we only enter this block when the same bit is set. */ 1925 char *tmp_s = reginfo.ganch - prog->gofs; 1926 if (regtry(®info, &tmp_s)) 1927 goto got_it; 1928 goto phooey; 1929 } 1930 1931 /* Messy cases: unanchored match. */ 1932 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) { 1933 /* we have /x+whatever/ */ 1934 /* it must be a one character string (XXXX Except UTF?) */ 1935 char ch; 1936 #ifdef DEBUGGING 1937 int did_match = 0; 1938 #endif 1939 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)) 1940 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog); 1941 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0]; 1942 1943 if (do_utf8) { 1944 REXEC_FBC_SCAN( 1945 if (*s == ch) { 1946 DEBUG_EXECUTE_r( did_match = 1 ); 1947 if (regtry(®info, &s)) goto got_it; 1948 s += UTF8SKIP(s); 1949 while (s < strend && *s == ch) 1950 s += UTF8SKIP(s); 1951 } 1952 ); 1953 } 1954 else { 1955 REXEC_FBC_SCAN( 1956 if (*s == ch) { 1957 DEBUG_EXECUTE_r( did_match = 1 ); 1958 if (regtry(®info, &s)) goto got_it; 1959 s++; 1960 while (s < strend && *s == ch) 1961 s++; 1962 } 1963 ); 1964 } 1965 DEBUG_EXECUTE_r(if (!did_match) 1966 PerlIO_printf(Perl_debug_log, 1967 "Did not find anchored character...\n") 1968 ); 1969 } 1970 else if (prog->anchored_substr != NULL 1971 || prog->anchored_utf8 != NULL 1972 || ((prog->float_substr != NULL || prog->float_utf8 != NULL) 1973 && prog->float_max_offset < strend - s)) { 1974 SV *must; 1975 I32 back_max; 1976 I32 back_min; 1977 char *last; 1978 char *last1; /* Last position checked before */ 1979 #ifdef DEBUGGING 1980 int did_match = 0; 1981 #endif 1982 if (prog->anchored_substr || prog->anchored_utf8) { 1983 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)) 1984 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog); 1985 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr; 1986 back_max = back_min = prog->anchored_offset; 1987 } else { 1988 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) 1989 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog); 1990 must = do_utf8 ? prog->float_utf8 : prog->float_substr; 1991 back_max = prog->float_max_offset; 1992 back_min = prog->float_min_offset; 1993 } 1994 1995 1996 if (must == &PL_sv_undef) 1997 /* could not downgrade utf8 check substring, so must fail */ 1998 goto phooey; 1999 2000 if (back_min<0) { 2001 last = strend; 2002 } else { 2003 last = HOP3c(strend, /* Cannot start after this */ 2004 -(I32)(CHR_SVLEN(must) 2005 - (SvTAIL(must) != 0) + back_min), strbeg); 2006 } 2007 if (s > PL_bostr) 2008 last1 = HOPc(s, -1); 2009 else 2010 last1 = s - 1; /* bogus */ 2011 2012 /* XXXX check_substr already used to find "s", can optimize if 2013 check_substr==must. */ 2014 scream_pos = -1; 2015 dontbother = end_shift; 2016 strend = HOPc(strend, -dontbother); 2017 while ( (s <= last) && 2018 ((flags & REXEC_SCREAM) 2019 ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg, 2020 end_shift, &scream_pos, 0)) 2021 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)), 2022 (unsigned char*)strend, must, 2023 multiline ? FBMrf_MULTILINE : 0))) ) { 2024 /* we may be pointing at the wrong string */ 2025 if ((flags & REXEC_SCREAM) && RXp_MATCH_COPIED(prog)) 2026 s = strbeg + (s - SvPVX_const(sv)); 2027 DEBUG_EXECUTE_r( did_match = 1 ); 2028 if (HOPc(s, -back_max) > last1) { 2029 last1 = HOPc(s, -back_min); 2030 s = HOPc(s, -back_max); 2031 } 2032 else { 2033 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1; 2034 2035 last1 = HOPc(s, -back_min); 2036 s = t; 2037 } 2038 if (do_utf8) { 2039 while (s <= last1) { 2040 if (regtry(®info, &s)) 2041 goto got_it; 2042 s += UTF8SKIP(s); 2043 } 2044 } 2045 else { 2046 while (s <= last1) { 2047 if (regtry(®info, &s)) 2048 goto got_it; 2049 s++; 2050 } 2051 } 2052 } 2053 DEBUG_EXECUTE_r(if (!did_match) { 2054 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0), 2055 SvPVX_const(must), RE_SV_DUMPLEN(must), 30); 2056 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n", 2057 ((must == prog->anchored_substr || must == prog->anchored_utf8) 2058 ? "anchored" : "floating"), 2059 quoted, RE_SV_TAIL(must)); 2060 }); 2061 goto phooey; 2062 } 2063 else if ( (c = progi->regstclass) ) { 2064 if (minlen) { 2065 const OPCODE op = OP(progi->regstclass); 2066 /* don't bother with what can't match */ 2067 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE) 2068 strend = HOPc(strend, -(minlen - 1)); 2069 } 2070 DEBUG_EXECUTE_r({ 2071 SV * const prop = sv_newmortal(); 2072 regprop(prog, prop, c); 2073 { 2074 RE_PV_QUOTED_DECL(quoted,do_utf8,PERL_DEBUG_PAD_ZERO(1), 2075 s,strend-s,60); 2076 PerlIO_printf(Perl_debug_log, 2077 "Matching stclass %.*s against %s (%d chars)\n", 2078 (int)SvCUR(prop), SvPVX_const(prop), 2079 quoted, (int)(strend - s)); 2080 } 2081 }); 2082 if (find_byclass(prog, c, s, strend, ®info)) 2083 goto got_it; 2084 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n")); 2085 } 2086 else { 2087 dontbother = 0; 2088 if (prog->float_substr != NULL || prog->float_utf8 != NULL) { 2089 /* Trim the end. */ 2090 char *last; 2091 SV* float_real; 2092 2093 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) 2094 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog); 2095 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr; 2096 2097 if (flags & REXEC_SCREAM) { 2098 last = screaminstr(sv, float_real, s - strbeg, 2099 end_shift, &scream_pos, 1); /* last one */ 2100 if (!last) 2101 last = scream_olds; /* Only one occurrence. */ 2102 /* we may be pointing at the wrong string */ 2103 else if (RXp_MATCH_COPIED(prog)) 2104 s = strbeg + (s - SvPVX_const(sv)); 2105 } 2106 else { 2107 STRLEN len; 2108 const char * const little = SvPV_const(float_real, len); 2109 2110 if (SvTAIL(float_real)) { 2111 if (memEQ(strend - len + 1, little, len - 1)) 2112 last = strend - len + 1; 2113 else if (!multiline) 2114 last = memEQ(strend - len, little, len) 2115 ? strend - len : NULL; 2116 else 2117 goto find_last; 2118 } else { 2119 find_last: 2120 if (len) 2121 last = rninstr(s, strend, little, little + len); 2122 else 2123 last = strend; /* matching "$" */ 2124 } 2125 } 2126 if (last == NULL) { 2127 DEBUG_EXECUTE_r( 2128 PerlIO_printf(Perl_debug_log, 2129 "%sCan't trim the tail, match fails (should not happen)%s\n", 2130 PL_colors[4], PL_colors[5])); 2131 goto phooey; /* Should not happen! */ 2132 } 2133 dontbother = strend - last + prog->float_min_offset; 2134 } 2135 if (minlen && (dontbother < minlen)) 2136 dontbother = minlen - 1; 2137 strend -= dontbother; /* this one's always in bytes! */ 2138 /* We don't know much -- general case. */ 2139 if (do_utf8) { 2140 for (;;) { 2141 if (regtry(®info, &s)) 2142 goto got_it; 2143 if (s >= strend) 2144 break; 2145 s += UTF8SKIP(s); 2146 }; 2147 } 2148 else { 2149 do { 2150 if (regtry(®info, &s)) 2151 goto got_it; 2152 } while (s++ < strend); 2153 } 2154 } 2155 2156 /* Failure. */ 2157 goto phooey; 2158 2159 got_it: 2160 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted); 2161 2162 if (PL_reg_eval_set) 2163 restore_pos(aTHX_ prog); 2164 if (RXp_PAREN_NAMES(prog)) 2165 (void)hv_iterinit(RXp_PAREN_NAMES(prog)); 2166 2167 /* make sure $`, $&, $', and $digit will work later */ 2168 if ( !(flags & REXEC_NOT_FIRST) ) { 2169 RX_MATCH_COPY_FREE(prog); 2170 if (flags & REXEC_COPY_STR) { 2171 const I32 i = PL_regeol - startpos + (stringarg - strbeg); 2172 #ifdef PERL_OLD_COPY_ON_WRITE 2173 if ((SvIsCOW(sv) 2174 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) { 2175 if (DEBUG_C_TEST) { 2176 PerlIO_printf(Perl_debug_log, 2177 "Copy on write: regexp capture, type %d\n", 2178 (int) SvTYPE(sv)); 2179 } 2180 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv); 2181 prog->subbeg = (char *)SvPVX_const(prog->saved_copy); 2182 assert (SvPOKp(prog->saved_copy)); 2183 } else 2184 #endif 2185 { 2186 RX_MATCH_COPIED_on(prog); 2187 s = savepvn(strbeg, i); 2188 prog->subbeg = s; 2189 } 2190 prog->sublen = i; 2191 } 2192 else { 2193 prog->subbeg = strbeg; 2194 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */ 2195 } 2196 } 2197 2198 return 1; 2199 2200 phooey: 2201 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n", 2202 PL_colors[4], PL_colors[5])); 2203 if (PL_reg_eval_set) 2204 restore_pos(aTHX_ prog); 2205 if (swap_on_fail) 2206 /* we failed :-( roll it back */ 2207 swap_match_buff(prog); 2208 2209 return 0; 2210 } 2211 2212 2213 /* 2214 - regtry - try match at specific point 2215 */ 2216 STATIC I32 /* 0 failure, 1 success */ 2217 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos) 2218 { 2219 dVAR; 2220 CHECKPOINT lastcp; 2221 regexp *prog = reginfo->prog; 2222 RXi_GET_DECL(prog,progi); 2223 GET_RE_DEBUG_FLAGS_DECL; 2224 2225 PERL_ARGS_ASSERT_REGTRY; 2226 2227 reginfo->cutpoint=NULL; 2228 2229 if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) { 2230 MAGIC *mg; 2231 2232 PL_reg_eval_set = RS_init; 2233 DEBUG_EXECUTE_r(DEBUG_s( 2234 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n", 2235 (IV)(PL_stack_sp - PL_stack_base)); 2236 )); 2237 SAVESTACK_CXPOS(); 2238 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base; 2239 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */ 2240 SAVETMPS; 2241 /* Apparently this is not needed, judging by wantarray. */ 2242 /* SAVEI8(cxstack[cxstack_ix].blk_gimme); 2243 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */ 2244 2245 if (reginfo->sv) { 2246 /* Make $_ available to executed code. */ 2247 if (reginfo->sv != DEFSV) { 2248 SAVE_DEFSV; 2249 DEFSV_set(reginfo->sv); 2250 } 2251 2252 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv) 2253 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) { 2254 /* prepare for quick setting of pos */ 2255 #ifdef PERL_OLD_COPY_ON_WRITE 2256 if (SvIsCOW(reginfo->sv)) 2257 sv_force_normal_flags(reginfo->sv, 0); 2258 #endif 2259 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global, 2260 &PL_vtbl_mglob, NULL, 0); 2261 mg->mg_len = -1; 2262 } 2263 PL_reg_magic = mg; 2264 PL_reg_oldpos = mg->mg_len; 2265 SAVEDESTRUCTOR_X(restore_pos, prog); 2266 } 2267 if (!PL_reg_curpm) { 2268 Newxz(PL_reg_curpm, 1, PMOP); 2269 #ifdef USE_ITHREADS 2270 { 2271 SV* const repointer = newSViv(0); 2272 /* this regexp is also owned by the new PL_reg_curpm, which 2273 will try to free it. */ 2274 av_push(PL_regex_padav,repointer); 2275 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav); 2276 PL_regex_pad = AvARRAY(PL_regex_padav); 2277 } 2278 #endif 2279 } 2280 #ifdef USE_ITHREADS 2281 /* It seems that non-ithreads works both with and without this code. 2282 So for efficiency reasons it seems best not to have the code 2283 compiled when it is not needed. */ 2284 /* This is safe against NULLs: */ 2285 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); 2286 /* PM_reg_curpm owns a reference to this regexp. */ 2287 ReREFCNT_inc(prog); 2288 #endif 2289 PM_SETRE(PL_reg_curpm, prog); 2290 PL_reg_oldcurpm = PL_curpm; 2291 PL_curpm = PL_reg_curpm; 2292 if (RXp_MATCH_COPIED(prog)) { 2293 /* Here is a serious problem: we cannot rewrite subbeg, 2294 since it may be needed if this match fails. Thus 2295 $` inside (?{}) could fail... */ 2296 PL_reg_oldsaved = prog->subbeg; 2297 PL_reg_oldsavedlen = prog->sublen; 2298 #ifdef PERL_OLD_COPY_ON_WRITE 2299 PL_nrs = prog->saved_copy; 2300 #endif 2301 RXp_MATCH_COPIED_off(prog); 2302 } 2303 else 2304 PL_reg_oldsaved = NULL; 2305 prog->subbeg = PL_bostr; 2306 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */ 2307 } 2308 DEBUG_EXECUTE_r(PL_reg_starttry = *startpos); 2309 prog->offs[0].start = *startpos - PL_bostr; 2310 PL_reginput = *startpos; 2311 PL_reglastparen = &prog->lastparen; 2312 PL_reglastcloseparen = &prog->lastcloseparen; 2313 prog->lastparen = 0; 2314 prog->lastcloseparen = 0; 2315 PL_regsize = 0; 2316 PL_regoffs = prog->offs; 2317 if (PL_reg_start_tmpl <= prog->nparens) { 2318 PL_reg_start_tmpl = prog->nparens*3/2 + 3; 2319 if(PL_reg_start_tmp) 2320 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*); 2321 else 2322 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*); 2323 } 2324 2325 /* XXXX What this code is doing here?!!! There should be no need 2326 to do this again and again, PL_reglastparen should take care of 2327 this! --ilya*/ 2328 2329 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code. 2330 * Actually, the code in regcppop() (which Ilya may be meaning by 2331 * PL_reglastparen), is not needed at all by the test suite 2332 * (op/regexp, op/pat, op/split), but that code is needed otherwise 2333 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/ 2334 * Meanwhile, this code *is* needed for the 2335 * above-mentioned test suite tests to succeed. The common theme 2336 * on those tests seems to be returning null fields from matches. 2337 * --jhi updated by dapm */ 2338 #if 1 2339 if (prog->nparens) { 2340 regexp_paren_pair *pp = PL_regoffs; 2341 register I32 i; 2342 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) { 2343 ++pp; 2344 pp->start = -1; 2345 pp->end = -1; 2346 } 2347 } 2348 #endif 2349 REGCP_SET(lastcp); 2350 if (regmatch(reginfo, progi->program + 1)) { 2351 PL_regoffs[0].end = PL_reginput - PL_bostr; 2352 return 1; 2353 } 2354 if (reginfo->cutpoint) 2355 *startpos= reginfo->cutpoint; 2356 REGCP_UNWIND(lastcp); 2357 return 0; 2358 } 2359 2360 2361 #define sayYES goto yes 2362 #define sayNO goto no 2363 #define sayNO_SILENT goto no_silent 2364 2365 /* we dont use STMT_START/END here because it leads to 2366 "unreachable code" warnings, which are bogus, but distracting. */ 2367 #define CACHEsayNO \ 2368 if (ST.cache_mask) \ 2369 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \ 2370 sayNO 2371 2372 /* this is used to determine how far from the left messages like 2373 'failed...' are printed. It should be set such that messages 2374 are inline with the regop output that created them. 2375 */ 2376 #define REPORT_CODE_OFF 32 2377 2378 2379 /* Make sure there is a test for this +1 options in re_tests */ 2380 #define TRIE_INITAL_ACCEPT_BUFFLEN 4; 2381 2382 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */ 2383 #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */ 2384 2385 #define SLAB_FIRST(s) (&(s)->states[0]) 2386 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1]) 2387 2388 /* grab a new slab and return the first slot in it */ 2389 2390 STATIC regmatch_state * 2391 S_push_slab(pTHX) 2392 { 2393 #if PERL_VERSION < 9 && !defined(PERL_CORE) 2394 dMY_CXT; 2395 #endif 2396 regmatch_slab *s = PL_regmatch_slab->next; 2397 if (!s) { 2398 Newx(s, 1, regmatch_slab); 2399 s->prev = PL_regmatch_slab; 2400 s->next = NULL; 2401 PL_regmatch_slab->next = s; 2402 } 2403 PL_regmatch_slab = s; 2404 return SLAB_FIRST(s); 2405 } 2406 2407 2408 /* push a new state then goto it */ 2409 2410 #define PUSH_STATE_GOTO(state, node) \ 2411 scan = node; \ 2412 st->resume_state = state; \ 2413 goto push_state; 2414 2415 /* push a new state with success backtracking, then goto it */ 2416 2417 #define PUSH_YES_STATE_GOTO(state, node) \ 2418 scan = node; \ 2419 st->resume_state = state; \ 2420 goto push_yes_state; 2421 2422 2423 2424 /* 2425 2426 regmatch() - main matching routine 2427 2428 This is basically one big switch statement in a loop. We execute an op, 2429 set 'next' to point the next op, and continue. If we come to a point which 2430 we may need to backtrack to on failure such as (A|B|C), we push a 2431 backtrack state onto the backtrack stack. On failure, we pop the top 2432 state, and re-enter the loop at the state indicated. If there are no more 2433 states to pop, we return failure. 2434 2435 Sometimes we also need to backtrack on success; for example /A+/, where 2436 after successfully matching one A, we need to go back and try to 2437 match another one; similarly for lookahead assertions: if the assertion 2438 completes successfully, we backtrack to the state just before the assertion 2439 and then carry on. In these cases, the pushed state is marked as 2440 'backtrack on success too'. This marking is in fact done by a chain of 2441 pointers, each pointing to the previous 'yes' state. On success, we pop to 2442 the nearest yes state, discarding any intermediate failure-only states. 2443 Sometimes a yes state is pushed just to force some cleanup code to be 2444 called at the end of a successful match or submatch; e.g. (??{$re}) uses 2445 it to free the inner regex. 2446 2447 Note that failure backtracking rewinds the cursor position, while 2448 success backtracking leaves it alone. 2449 2450 A pattern is complete when the END op is executed, while a subpattern 2451 such as (?=foo) is complete when the SUCCESS op is executed. Both of these 2452 ops trigger the "pop to last yes state if any, otherwise return true" 2453 behaviour. 2454 2455 A common convention in this function is to use A and B to refer to the two 2456 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is 2457 the subpattern to be matched possibly multiple times, while B is the entire 2458 rest of the pattern. Variable and state names reflect this convention. 2459 2460 The states in the main switch are the union of ops and failure/success of 2461 substates associated with with that op. For example, IFMATCH is the op 2462 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means 2463 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just 2464 successfully matched A and IFMATCH_A_fail is a state saying that we have 2465 just failed to match A. Resume states always come in pairs. The backtrack 2466 state we push is marked as 'IFMATCH_A', but when that is popped, we resume 2467 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking 2468 on success or failure. 2469 2470 The struct that holds a backtracking state is actually a big union, with 2471 one variant for each major type of op. The variable st points to the 2472 top-most backtrack struct. To make the code clearer, within each 2473 block of code we #define ST to alias the relevant union. 2474 2475 Here's a concrete example of a (vastly oversimplified) IFMATCH 2476 implementation: 2477 2478 switch (state) { 2479 .... 2480 2481 #define ST st->u.ifmatch 2482 2483 case IFMATCH: // we are executing the IFMATCH op, (?=A)B 2484 ST.foo = ...; // some state we wish to save 2485 ... 2486 // push a yes backtrack state with a resume value of 2487 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the 2488 // first node of A: 2489 PUSH_YES_STATE_GOTO(IFMATCH_A, A); 2490 // NOTREACHED 2491 2492 case IFMATCH_A: // we have successfully executed A; now continue with B 2493 next = B; 2494 bar = ST.foo; // do something with the preserved value 2495 break; 2496 2497 case IFMATCH_A_fail: // A failed, so the assertion failed 2498 ...; // do some housekeeping, then ... 2499 sayNO; // propagate the failure 2500 2501 #undef ST 2502 2503 ... 2504 } 2505 2506 For any old-timers reading this who are familiar with the old recursive 2507 approach, the code above is equivalent to: 2508 2509 case IFMATCH: // we are executing the IFMATCH op, (?=A)B 2510 { 2511 int foo = ... 2512 ... 2513 if (regmatch(A)) { 2514 next = B; 2515 bar = foo; 2516 break; 2517 } 2518 ...; // do some housekeeping, then ... 2519 sayNO; // propagate the failure 2520 } 2521 2522 The topmost backtrack state, pointed to by st, is usually free. If you 2523 want to claim it, populate any ST.foo fields in it with values you wish to 2524 save, then do one of 2525 2526 PUSH_STATE_GOTO(resume_state, node); 2527 PUSH_YES_STATE_GOTO(resume_state, node); 2528 2529 which sets that backtrack state's resume value to 'resume_state', pushes a 2530 new free entry to the top of the backtrack stack, then goes to 'node'. 2531 On backtracking, the free slot is popped, and the saved state becomes the 2532 new free state. An ST.foo field in this new top state can be temporarily 2533 accessed to retrieve values, but once the main loop is re-entered, it 2534 becomes available for reuse. 2535 2536 Note that the depth of the backtrack stack constantly increases during the 2537 left-to-right execution of the pattern, rather than going up and down with 2538 the pattern nesting. For example the stack is at its maximum at Z at the 2539 end of the pattern, rather than at X in the following: 2540 2541 /(((X)+)+)+....(Y)+....Z/ 2542 2543 The only exceptions to this are lookahead/behind assertions and the cut, 2544 (?>A), which pop all the backtrack states associated with A before 2545 continuing. 2546 2547 Bascktrack state structs are allocated in slabs of about 4K in size. 2548 PL_regmatch_state and st always point to the currently active state, 2549 and PL_regmatch_slab points to the slab currently containing 2550 PL_regmatch_state. The first time regmatch() is called, the first slab is 2551 allocated, and is never freed until interpreter destruction. When the slab 2552 is full, a new one is allocated and chained to the end. At exit from 2553 regmatch(), slabs allocated since entry are freed. 2554 2555 */ 2556 2557 2558 #define DEBUG_STATE_pp(pp) \ 2559 DEBUG_STATE_r({ \ 2560 DUMP_EXEC_POS(locinput, scan, do_utf8); \ 2561 PerlIO_printf(Perl_debug_log, \ 2562 " %*s"pp" %s%s%s%s%s\n", \ 2563 depth*2, "", \ 2564 PL_reg_name[st->resume_state], \ 2565 ((st==yes_state||st==mark_state) ? "[" : ""), \ 2566 ((st==yes_state) ? "Y" : ""), \ 2567 ((st==mark_state) ? "M" : ""), \ 2568 ((st==yes_state||st==mark_state) ? "]" : "") \ 2569 ); \ 2570 }); 2571 2572 2573 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1) 2574 2575 #ifdef DEBUGGING 2576 2577 STATIC void 2578 S_debug_start_match(pTHX_ const REGEXP *prog, const bool do_utf8, 2579 const char *start, const char *end, const char *blurb) 2580 { 2581 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0; 2582 2583 PERL_ARGS_ASSERT_DEBUG_START_MATCH; 2584 2585 if (!PL_colorset) 2586 reginitcolors(); 2587 { 2588 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0), 2589 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60); 2590 2591 RE_PV_QUOTED_DECL(s1, do_utf8, PERL_DEBUG_PAD_ZERO(1), 2592 start, end - start, 60); 2593 2594 PerlIO_printf(Perl_debug_log, 2595 "%s%s REx%s %s against %s\n", 2596 PL_colors[4], blurb, PL_colors[5], s0, s1); 2597 2598 if (do_utf8||utf8_pat) 2599 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n", 2600 utf8_pat ? "pattern" : "", 2601 utf8_pat && do_utf8 ? " and " : "", 2602 do_utf8 ? "string" : "" 2603 ); 2604 } 2605 } 2606 2607 STATIC void 2608 S_dump_exec_pos(pTHX_ const char *locinput, 2609 const regnode *scan, 2610 const char *loc_regeol, 2611 const char *loc_bostr, 2612 const char *loc_reg_starttry, 2613 const bool do_utf8) 2614 { 2615 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4]; 2616 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */ 2617 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput); 2618 /* The part of the string before starttry has one color 2619 (pref0_len chars), between starttry and current 2620 position another one (pref_len - pref0_len chars), 2621 after the current position the third one. 2622 We assume that pref0_len <= pref_len, otherwise we 2623 decrease pref0_len. */ 2624 int pref_len = (locinput - loc_bostr) > (5 + taill) - l 2625 ? (5 + taill) - l : locinput - loc_bostr; 2626 int pref0_len; 2627 2628 PERL_ARGS_ASSERT_DUMP_EXEC_POS; 2629 2630 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len))) 2631 pref_len++; 2632 pref0_len = pref_len - (locinput - loc_reg_starttry); 2633 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput) 2634 l = ( loc_regeol - locinput > (5 + taill) - pref_len 2635 ? (5 + taill) - pref_len : loc_regeol - locinput); 2636 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l))) 2637 l--; 2638 if (pref0_len < 0) 2639 pref0_len = 0; 2640 if (pref0_len > pref_len) 2641 pref0_len = pref_len; 2642 { 2643 const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0; 2644 2645 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0), 2646 (locinput - pref_len),pref0_len, 60, 4, 5); 2647 2648 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1), 2649 (locinput - pref_len + pref0_len), 2650 pref_len - pref0_len, 60, 2, 3); 2651 2652 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2), 2653 locinput, loc_regeol - locinput, 10, 0, 1); 2654 2655 const STRLEN tlen=len0+len1+len2; 2656 PerlIO_printf(Perl_debug_log, 2657 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|", 2658 (IV)(locinput - loc_bostr), 2659 len0, s0, 2660 len1, s1, 2661 (docolor ? "" : "> <"), 2662 len2, s2, 2663 (int)(tlen > 19 ? 0 : 19 - tlen), 2664 ""); 2665 } 2666 } 2667 2668 #endif 2669 2670 /* reg_check_named_buff_matched() 2671 * Checks to see if a named buffer has matched. The data array of 2672 * buffer numbers corresponding to the buffer is expected to reside 2673 * in the regexp->data->data array in the slot stored in the ARG() of 2674 * node involved. Note that this routine doesn't actually care about the 2675 * name, that information is not preserved from compilation to execution. 2676 * Returns the index of the leftmost defined buffer with the given name 2677 * or 0 if non of the buffers matched. 2678 */ 2679 STATIC I32 2680 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan) 2681 { 2682 I32 n; 2683 RXi_GET_DECL(rex,rexi); 2684 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); 2685 I32 *nums=(I32*)SvPVX(sv_dat); 2686 2687 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED; 2688 2689 for ( n=0; n<SvIVX(sv_dat); n++ ) { 2690 if ((I32)*PL_reglastparen >= nums[n] && 2691 PL_regoffs[nums[n]].end != -1) 2692 { 2693 return nums[n]; 2694 } 2695 } 2696 return 0; 2697 } 2698 2699 2700 /* free all slabs above current one - called during LEAVE_SCOPE */ 2701 2702 STATIC void 2703 S_clear_backtrack_stack(pTHX_ void *p) 2704 { 2705 regmatch_slab *s = PL_regmatch_slab->next; 2706 PERL_UNUSED_ARG(p); 2707 2708 if (!s) 2709 return; 2710 PL_regmatch_slab->next = NULL; 2711 while (s) { 2712 regmatch_slab * const osl = s; 2713 s = s->next; 2714 Safefree(osl); 2715 } 2716 } 2717 2718 2719 #define SETREX(Re1,Re2) \ 2720 if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \ 2721 Re1 = (Re2) 2722 2723 STATIC I32 /* 0 failure, 1 success */ 2724 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) 2725 { 2726 #if PERL_VERSION < 9 && !defined(PERL_CORE) 2727 dMY_CXT; 2728 #endif 2729 dVAR; 2730 register const bool do_utf8 = PL_reg_match_utf8; 2731 const U32 uniflags = UTF8_ALLOW_DEFAULT; 2732 regexp *rex = reginfo->prog; 2733 RXi_GET_DECL(rex,rexi); 2734 I32 oldsave; 2735 /* the current state. This is a cached copy of PL_regmatch_state */ 2736 register regmatch_state *st; 2737 /* cache heavy used fields of st in registers */ 2738 register regnode *scan; 2739 register regnode *next; 2740 register U32 n = 0; /* general value; init to avoid compiler warning */ 2741 register I32 ln = 0; /* len or last; init to avoid compiler warning */ 2742 register char *locinput = PL_reginput; 2743 register I32 nextchr; /* is always set to UCHARAT(locinput) */ 2744 2745 bool result = 0; /* return value of S_regmatch */ 2746 int depth = 0; /* depth of backtrack stack */ 2747 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */ 2748 const U32 max_nochange_depth = 2749 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ? 2750 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH; 2751 regmatch_state *yes_state = NULL; /* state to pop to on success of 2752 subpattern */ 2753 /* mark_state piggy backs on the yes_state logic so that when we unwind 2754 the stack on success we can update the mark_state as we go */ 2755 regmatch_state *mark_state = NULL; /* last mark state we have seen */ 2756 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */ 2757 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */ 2758 U32 state_num; 2759 bool no_final = 0; /* prevent failure from backtracking? */ 2760 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */ 2761 char *startpoint = PL_reginput; 2762 SV *popmark = NULL; /* are we looking for a mark? */ 2763 SV *sv_commit = NULL; /* last mark name seen in failure */ 2764 SV *sv_yes_mark = NULL; /* last mark name we have seen 2765 during a successfull match */ 2766 U32 lastopen = 0; /* last open we saw */ 2767 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0; 2768 SV* const oreplsv = GvSV(PL_replgv); 2769 /* these three flags are set by various ops to signal information to 2770 * the very next op. They have a useful lifetime of exactly one loop 2771 * iteration, and are not preserved or restored by state pushes/pops 2772 */ 2773 bool sw = 0; /* the condition value in (?(cond)a|b) */ 2774 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */ 2775 int logical = 0; /* the following EVAL is: 2776 0: (?{...}) 2777 1: (?(?{...})X|Y) 2778 2: (??{...}) 2779 or the following IFMATCH/UNLESSM is: 2780 false: plain (?=foo) 2781 true: used as a condition: (?(?=foo)) 2782 */ 2783 #ifdef DEBUGGING 2784 GET_RE_DEBUG_FLAGS_DECL; 2785 #endif 2786 2787 PERL_ARGS_ASSERT_REGMATCH; 2788 2789 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({ 2790 PerlIO_printf(Perl_debug_log,"regmatch start\n"); 2791 })); 2792 /* on first ever call to regmatch, allocate first slab */ 2793 if (!PL_regmatch_slab) { 2794 Newx(PL_regmatch_slab, 1, regmatch_slab); 2795 PL_regmatch_slab->prev = NULL; 2796 PL_regmatch_slab->next = NULL; 2797 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab); 2798 } 2799 2800 oldsave = PL_savestack_ix; 2801 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL); 2802 SAVEVPTR(PL_regmatch_slab); 2803 SAVEVPTR(PL_regmatch_state); 2804 2805 /* grab next free state slot */ 2806 st = ++PL_regmatch_state; 2807 if (st > SLAB_LAST(PL_regmatch_slab)) 2808 st = PL_regmatch_state = S_push_slab(aTHX); 2809 2810 /* Note that nextchr is a byte even in UTF */ 2811 nextchr = UCHARAT(locinput); 2812 scan = prog; 2813 while (scan != NULL) { 2814 2815 DEBUG_EXECUTE_r( { 2816 SV * const prop = sv_newmortal(); 2817 regnode *rnext=regnext(scan); 2818 DUMP_EXEC_POS( locinput, scan, do_utf8 ); 2819 regprop(rex, prop, scan); 2820 2821 PerlIO_printf(Perl_debug_log, 2822 "%3"IVdf":%*s%s(%"IVdf")\n", 2823 (IV)(scan - rexi->program), depth*2, "", 2824 SvPVX_const(prop), 2825 (PL_regkind[OP(scan)] == END || !rnext) ? 2826 0 : (IV)(rnext - rexi->program)); 2827 }); 2828 2829 next = scan + NEXT_OFF(scan); 2830 if (next == scan) 2831 next = NULL; 2832 state_num = OP(scan); 2833 2834 reenter_switch: 2835 2836 assert(PL_reglastparen == &rex->lastparen); 2837 assert(PL_reglastcloseparen == &rex->lastcloseparen); 2838 assert(PL_regoffs == rex->offs); 2839 2840 switch (state_num) { 2841 case BOL: 2842 if (locinput == PL_bostr) 2843 { 2844 /* reginfo->till = reginfo->bol; */ 2845 break; 2846 } 2847 sayNO; 2848 case MBOL: 2849 if (locinput == PL_bostr || 2850 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n')) 2851 { 2852 break; 2853 } 2854 sayNO; 2855 case SBOL: 2856 if (locinput == PL_bostr) 2857 break; 2858 sayNO; 2859 case GPOS: 2860 if (locinput == reginfo->ganch) 2861 break; 2862 sayNO; 2863 2864 case KEEPS: 2865 /* update the startpoint */ 2866 st->u.keeper.val = PL_regoffs[0].start; 2867 PL_reginput = locinput; 2868 PL_regoffs[0].start = locinput - PL_bostr; 2869 PUSH_STATE_GOTO(KEEPS_next, next); 2870 /*NOT-REACHED*/ 2871 case KEEPS_next_fail: 2872 /* rollback the start point change */ 2873 PL_regoffs[0].start = st->u.keeper.val; 2874 sayNO_SILENT; 2875 /*NOT-REACHED*/ 2876 case EOL: 2877 goto seol; 2878 case MEOL: 2879 if ((nextchr || locinput < PL_regeol) && nextchr != '\n') 2880 sayNO; 2881 break; 2882 case SEOL: 2883 seol: 2884 if ((nextchr || locinput < PL_regeol) && nextchr != '\n') 2885 sayNO; 2886 if (PL_regeol - locinput > 1) 2887 sayNO; 2888 break; 2889 case EOS: 2890 if (PL_regeol != locinput) 2891 sayNO; 2892 break; 2893 case SANY: 2894 if (!nextchr && locinput >= PL_regeol) 2895 sayNO; 2896 if (do_utf8) { 2897 locinput += PL_utf8skip[nextchr]; 2898 if (locinput > PL_regeol) 2899 sayNO; 2900 nextchr = UCHARAT(locinput); 2901 } 2902 else 2903 nextchr = UCHARAT(++locinput); 2904 break; 2905 case CANY: 2906 if (!nextchr && locinput >= PL_regeol) 2907 sayNO; 2908 nextchr = UCHARAT(++locinput); 2909 break; 2910 case REG_ANY: 2911 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n') 2912 sayNO; 2913 if (do_utf8) { 2914 locinput += PL_utf8skip[nextchr]; 2915 if (locinput > PL_regeol) 2916 sayNO; 2917 nextchr = UCHARAT(locinput); 2918 } 2919 else 2920 nextchr = UCHARAT(++locinput); 2921 break; 2922 2923 #undef ST 2924 #define ST st->u.trie 2925 case TRIEC: 2926 /* In this case the charclass data is available inline so 2927 we can fail fast without a lot of extra overhead. 2928 */ 2929 if (scan->flags == EXACT || !do_utf8) { 2930 if(!ANYOF_BITMAP_TEST(scan, *locinput)) { 2931 DEBUG_EXECUTE_r( 2932 PerlIO_printf(Perl_debug_log, 2933 "%*s %sfailed to match trie start class...%s\n", 2934 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]) 2935 ); 2936 sayNO_SILENT; 2937 /* NOTREACHED */ 2938 } 2939 } 2940 /* FALL THROUGH */ 2941 case TRIE: 2942 { 2943 /* what type of TRIE am I? (utf8 makes this contextual) */ 2944 DECL_TRIE_TYPE(scan); 2945 2946 /* what trie are we using right now */ 2947 reg_trie_data * const trie 2948 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ]; 2949 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]); 2950 U32 state = trie->startstate; 2951 2952 if (trie->bitmap && trie_type != trie_utf8_fold && 2953 !TRIE_BITMAP_TEST(trie,*locinput) 2954 ) { 2955 if (trie->states[ state ].wordnum) { 2956 DEBUG_EXECUTE_r( 2957 PerlIO_printf(Perl_debug_log, 2958 "%*s %smatched empty string...%s\n", 2959 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]) 2960 ); 2961 break; 2962 } else { 2963 DEBUG_EXECUTE_r( 2964 PerlIO_printf(Perl_debug_log, 2965 "%*s %sfailed to match trie start class...%s\n", 2966 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]) 2967 ); 2968 sayNO_SILENT; 2969 } 2970 } 2971 2972 { 2973 U8 *uc = ( U8* )locinput; 2974 2975 STRLEN len = 0; 2976 STRLEN foldlen = 0; 2977 U8 *uscan = (U8*)NULL; 2978 STRLEN bufflen=0; 2979 SV *sv_accept_buff = NULL; 2980 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; 2981 2982 ST.accepted = 0; /* how many accepting states we have seen */ 2983 ST.B = next; 2984 ST.jump = trie->jump; 2985 ST.me = scan; 2986 /* 2987 traverse the TRIE keeping track of all accepting states 2988 we transition through until we get to a failing node. 2989 */ 2990 2991 while ( state && uc <= (U8*)PL_regeol ) { 2992 U32 base = trie->states[ state ].trans.base; 2993 UV uvc = 0; 2994 U16 charid; 2995 /* We use charid to hold the wordnum as we don't use it 2996 for charid until after we have done the wordnum logic. 2997 We define an alias just so that the wordnum logic reads 2998 more naturally. */ 2999 3000 #define got_wordnum charid 3001 got_wordnum = trie->states[ state ].wordnum; 3002 3003 if ( got_wordnum ) { 3004 if ( ! ST.accepted ) { 3005 ENTER; 3006 SAVETMPS; /* XXX is this necessary? dmq */ 3007 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN; 3008 sv_accept_buff=newSV(bufflen * 3009 sizeof(reg_trie_accepted) - 1); 3010 SvCUR_set(sv_accept_buff, 0); 3011 SvPOK_on(sv_accept_buff); 3012 sv_2mortal(sv_accept_buff); 3013 SAVETMPS; 3014 ST.accept_buff = 3015 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff ); 3016 } 3017 do { 3018 if (ST.accepted >= bufflen) { 3019 bufflen *= 2; 3020 ST.accept_buff =(reg_trie_accepted*) 3021 SvGROW(sv_accept_buff, 3022 bufflen * sizeof(reg_trie_accepted)); 3023 } 3024 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff) 3025 + sizeof(reg_trie_accepted)); 3026 3027 3028 ST.accept_buff[ST.accepted].wordnum = got_wordnum; 3029 ST.accept_buff[ST.accepted].endpos = uc; 3030 ++ST.accepted; 3031 } while (trie->nextword && (got_wordnum= trie->nextword[got_wordnum])); 3032 } 3033 #undef got_wordnum 3034 3035 DEBUG_TRIE_EXECUTE_r({ 3036 DUMP_EXEC_POS( (char *)uc, scan, do_utf8 ); 3037 PerlIO_printf( Perl_debug_log, 3038 "%*s %sState: %4"UVxf" Accepted: %4"UVxf" ", 3039 2+depth * 2, "", PL_colors[4], 3040 (UV)state, (UV)ST.accepted ); 3041 }); 3042 3043 if ( base ) { 3044 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, 3045 uscan, len, uvc, charid, foldlen, 3046 foldbuf, uniflags); 3047 3048 if (charid && 3049 (base + charid > trie->uniquecharcount ) 3050 && (base + charid - 1 - trie->uniquecharcount 3051 < trie->lasttrans) 3052 && trie->trans[base + charid - 1 - 3053 trie->uniquecharcount].check == state) 3054 { 3055 state = trie->trans[base + charid - 1 - 3056 trie->uniquecharcount ].next; 3057 } 3058 else { 3059 state = 0; 3060 } 3061 uc += len; 3062 3063 } 3064 else { 3065 state = 0; 3066 } 3067 DEBUG_TRIE_EXECUTE_r( 3068 PerlIO_printf( Perl_debug_log, 3069 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n", 3070 charid, uvc, (UV)state, PL_colors[5] ); 3071 ); 3072 } 3073 if (!ST.accepted ) 3074 sayNO; 3075 3076 DEBUG_EXECUTE_r( 3077 PerlIO_printf( Perl_debug_log, 3078 "%*s %sgot %"IVdf" possible matches%s\n", 3079 REPORT_CODE_OFF + depth * 2, "", 3080 PL_colors[4], (IV)ST.accepted, PL_colors[5] ); 3081 ); 3082 }} 3083 goto trie_first_try; /* jump into the fail handler */ 3084 /* NOTREACHED */ 3085 case TRIE_next_fail: /* we failed - try next alterative */ 3086 if ( ST.jump) { 3087 REGCP_UNWIND(ST.cp); 3088 for (n = *PL_reglastparen; n > ST.lastparen; n--) 3089 PL_regoffs[n].end = -1; 3090 *PL_reglastparen = n; 3091 } 3092 trie_first_try: 3093 if (do_cutgroup) { 3094 do_cutgroup = 0; 3095 no_final = 0; 3096 } 3097 3098 if ( ST.jump) { 3099 ST.lastparen = *PL_reglastparen; 3100 REGCP_SET(ST.cp); 3101 } 3102 if ( ST.accepted == 1 ) { 3103 /* only one choice left - just continue */ 3104 DEBUG_EXECUTE_r({ 3105 AV *const trie_words 3106 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]); 3107 SV ** const tmp = av_fetch( trie_words, 3108 ST.accept_buff[ 0 ].wordnum-1, 0 ); 3109 SV *sv= tmp ? sv_newmortal() : NULL; 3110 3111 PerlIO_printf( Perl_debug_log, 3112 "%*s %sonly one match left: #%d <%s>%s\n", 3113 REPORT_CODE_OFF+depth*2, "", PL_colors[4], 3114 ST.accept_buff[ 0 ].wordnum, 3115 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0, 3116 PL_colors[0], PL_colors[1], 3117 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) 3118 ) 3119 : "not compiled under -Dr", 3120 PL_colors[5] ); 3121 }); 3122 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos; 3123 /* in this case we free tmps/leave before we call regmatch 3124 as we wont be using accept_buff again. */ 3125 3126 locinput = PL_reginput; 3127 nextchr = UCHARAT(locinput); 3128 if ( !ST.jump || !ST.jump[ST.accept_buff[0].wordnum]) 3129 scan = ST.B; 3130 else 3131 scan = ST.me + ST.jump[ST.accept_buff[0].wordnum]; 3132 if (!has_cutgroup) { 3133 FREETMPS; 3134 LEAVE; 3135 } else { 3136 ST.accepted--; 3137 PUSH_YES_STATE_GOTO(TRIE_next, scan); 3138 } 3139 3140 continue; /* execute rest of RE */ 3141 } 3142 3143 if ( !ST.accepted-- ) { 3144 DEBUG_EXECUTE_r({ 3145 PerlIO_printf( Perl_debug_log, 3146 "%*s %sTRIE failed...%s\n", 3147 REPORT_CODE_OFF+depth*2, "", 3148 PL_colors[4], 3149 PL_colors[5] ); 3150 }); 3151 FREETMPS; 3152 LEAVE; 3153 sayNO_SILENT; 3154 /*NOTREACHED*/ 3155 } 3156 3157 /* 3158 There are at least two accepting states left. Presumably 3159 the number of accepting states is going to be low, 3160 typically two. So we simply scan through to find the one 3161 with lowest wordnum. Once we find it, we swap the last 3162 state into its place and decrement the size. We then try to 3163 match the rest of the pattern at the point where the word 3164 ends. If we succeed, control just continues along the 3165 regex; if we fail we return here to try the next accepting 3166 state 3167 */ 3168 3169 { 3170 U32 best = 0; 3171 U32 cur; 3172 for( cur = 1 ; cur <= ST.accepted ; cur++ ) { 3173 DEBUG_TRIE_EXECUTE_r( 3174 PerlIO_printf( Perl_debug_log, 3175 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n", 3176 REPORT_CODE_OFF + depth * 2, "", PL_colors[4], 3177 (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur, 3178 ST.accept_buff[ cur ].wordnum, PL_colors[5] ); 3179 ); 3180 3181 if (ST.accept_buff[cur].wordnum < 3182 ST.accept_buff[best].wordnum) 3183 best = cur; 3184 } 3185 3186 DEBUG_EXECUTE_r({ 3187 AV *const trie_words 3188 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]); 3189 SV ** const tmp = av_fetch( trie_words, 3190 ST.accept_buff[ best ].wordnum - 1, 0 ); 3191 regnode *nextop=(!ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) ? 3192 ST.B : 3193 ST.me + ST.jump[ST.accept_buff[best].wordnum]; 3194 SV *sv= tmp ? sv_newmortal() : NULL; 3195 3196 PerlIO_printf( Perl_debug_log, 3197 "%*s %strying alternation #%d <%s> at node #%d %s\n", 3198 REPORT_CODE_OFF+depth*2, "", PL_colors[4], 3199 ST.accept_buff[best].wordnum, 3200 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0, 3201 PL_colors[0], PL_colors[1], 3202 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) 3203 ) : "not compiled under -Dr", 3204 REG_NODE_NUM(nextop), 3205 PL_colors[5] ); 3206 }); 3207 3208 if ( best<ST.accepted ) { 3209 reg_trie_accepted tmp = ST.accept_buff[ best ]; 3210 ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ]; 3211 ST.accept_buff[ ST.accepted ] = tmp; 3212 best = ST.accepted; 3213 } 3214 PL_reginput = (char *)ST.accept_buff[ best ].endpos; 3215 if ( !ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) { 3216 scan = ST.B; 3217 } else { 3218 scan = ST.me + ST.jump[ST.accept_buff[best].wordnum]; 3219 } 3220 PUSH_YES_STATE_GOTO(TRIE_next, scan); 3221 /* NOTREACHED */ 3222 } 3223 /* NOTREACHED */ 3224 case TRIE_next: 3225 /* we dont want to throw this away, see bug 57042*/ 3226 if (oreplsv != GvSV(PL_replgv)) 3227 sv_setsv(oreplsv, GvSV(PL_replgv)); 3228 FREETMPS; 3229 LEAVE; 3230 sayYES; 3231 #undef ST 3232 3233 case EXACT: { 3234 char *s = STRING(scan); 3235 ln = STR_LEN(scan); 3236 if (do_utf8 != UTF) { 3237 /* The target and the pattern have differing utf8ness. */ 3238 char *l = locinput; 3239 const char * const e = s + ln; 3240 3241 if (do_utf8) { 3242 /* The target is utf8, the pattern is not utf8. */ 3243 while (s < e) { 3244 STRLEN ulen; 3245 if (l >= PL_regeol) 3246 sayNO; 3247 if (NATIVE_TO_UNI(*(U8*)s) != 3248 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen, 3249 uniflags)) 3250 sayNO; 3251 l += ulen; 3252 s ++; 3253 } 3254 } 3255 else { 3256 /* The target is not utf8, the pattern is utf8. */ 3257 while (s < e) { 3258 STRLEN ulen; 3259 if (l >= PL_regeol) 3260 sayNO; 3261 if (NATIVE_TO_UNI(*((U8*)l)) != 3262 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen, 3263 uniflags)) 3264 sayNO; 3265 s += ulen; 3266 l ++; 3267 } 3268 } 3269 locinput = l; 3270 nextchr = UCHARAT(locinput); 3271 break; 3272 } 3273 /* The target and the pattern have the same utf8ness. */ 3274 /* Inline the first character, for speed. */ 3275 if (UCHARAT(s) != nextchr) 3276 sayNO; 3277 if (PL_regeol - locinput < ln) 3278 sayNO; 3279 if (ln > 1 && memNE(s, locinput, ln)) 3280 sayNO; 3281 locinput += ln; 3282 nextchr = UCHARAT(locinput); 3283 break; 3284 } 3285 case EXACTFL: 3286 PL_reg_flags |= RF_tainted; 3287 /* FALL THROUGH */ 3288 case EXACTF: { 3289 char * const s = STRING(scan); 3290 ln = STR_LEN(scan); 3291 3292 if (do_utf8 || UTF) { 3293 /* Either target or the pattern are utf8. */ 3294 const char * const l = locinput; 3295 char *e = PL_regeol; 3296 3297 if (ibcmp_utf8(s, 0, ln, (bool)UTF, 3298 l, &e, 0, do_utf8)) { 3299 /* One more case for the sharp s: 3300 * pack("U0U*", 0xDF) =~ /ss/i, 3301 * the 0xC3 0x9F are the UTF-8 3302 * byte sequence for the U+00DF. */ 3303 3304 if (!(do_utf8 && 3305 toLOWER(s[0]) == 's' && 3306 ln >= 2 && 3307 toLOWER(s[1]) == 's' && 3308 (U8)l[0] == 0xC3 && 3309 e - l >= 2 && 3310 (U8)l[1] == 0x9F)) 3311 sayNO; 3312 } 3313 locinput = e; 3314 nextchr = UCHARAT(locinput); 3315 break; 3316 } 3317 3318 /* Neither the target and the pattern are utf8. */ 3319 3320 /* Inline the first character, for speed. */ 3321 if (UCHARAT(s) != nextchr && 3322 UCHARAT(s) != ((OP(scan) == EXACTF) 3323 ? PL_fold : PL_fold_locale)[nextchr]) 3324 sayNO; 3325 if (PL_regeol - locinput < ln) 3326 sayNO; 3327 if (ln > 1 && (OP(scan) == EXACTF 3328 ? ibcmp(s, locinput, ln) 3329 : ibcmp_locale(s, locinput, ln))) 3330 sayNO; 3331 locinput += ln; 3332 nextchr = UCHARAT(locinput); 3333 break; 3334 } 3335 case ANYOF: 3336 if (do_utf8) { 3337 STRLEN inclasslen = PL_regeol - locinput; 3338 3339 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8)) 3340 goto anyof_fail; 3341 if (locinput >= PL_regeol) 3342 sayNO; 3343 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput); 3344 nextchr = UCHARAT(locinput); 3345 break; 3346 } 3347 else { 3348 if (nextchr < 0) 3349 nextchr = UCHARAT(locinput); 3350 if (!REGINCLASS(rex, scan, (U8*)locinput)) 3351 goto anyof_fail; 3352 if (!nextchr && locinput >= PL_regeol) 3353 sayNO; 3354 nextchr = UCHARAT(++locinput); 3355 break; 3356 } 3357 anyof_fail: 3358 /* If we might have the case of the German sharp s 3359 * in a casefolding Unicode character class. */ 3360 3361 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) { 3362 locinput += SHARP_S_SKIP; 3363 nextchr = UCHARAT(locinput); 3364 } 3365 else 3366 sayNO; 3367 break; 3368 case ALNUML: 3369 PL_reg_flags |= RF_tainted; 3370 /* FALL THROUGH */ 3371 case ALNUM: 3372 if (!nextchr) 3373 sayNO; 3374 if (do_utf8) { 3375 LOAD_UTF8_CHARCLASS_ALNUM(); 3376 if (!(OP(scan) == ALNUM 3377 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8) 3378 : isALNUM_LC_utf8((U8*)locinput))) 3379 { 3380 sayNO; 3381 } 3382 locinput += PL_utf8skip[nextchr]; 3383 nextchr = UCHARAT(locinput); 3384 break; 3385 } 3386 if (!(OP(scan) == ALNUM 3387 ? isALNUM(nextchr) : isALNUM_LC(nextchr))) 3388 sayNO; 3389 nextchr = UCHARAT(++locinput); 3390 break; 3391 case NALNUML: 3392 PL_reg_flags |= RF_tainted; 3393 /* FALL THROUGH */ 3394 case NALNUM: 3395 if (!nextchr && locinput >= PL_regeol) 3396 sayNO; 3397 if (do_utf8) { 3398 LOAD_UTF8_CHARCLASS_ALNUM(); 3399 if (OP(scan) == NALNUM 3400 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8) 3401 : isALNUM_LC_utf8((U8*)locinput)) 3402 { 3403 sayNO; 3404 } 3405 locinput += PL_utf8skip[nextchr]; 3406 nextchr = UCHARAT(locinput); 3407 break; 3408 } 3409 if (OP(scan) == NALNUM 3410 ? isALNUM(nextchr) : isALNUM_LC(nextchr)) 3411 sayNO; 3412 nextchr = UCHARAT(++locinput); 3413 break; 3414 case BOUNDL: 3415 case NBOUNDL: 3416 PL_reg_flags |= RF_tainted; 3417 /* FALL THROUGH */ 3418 case BOUND: 3419 case NBOUND: 3420 /* was last char in word? */ 3421 if (do_utf8) { 3422 if (locinput == PL_bostr) 3423 ln = '\n'; 3424 else { 3425 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr); 3426 3427 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags); 3428 } 3429 if (OP(scan) == BOUND || OP(scan) == NBOUND) { 3430 ln = isALNUM_uni(ln); 3431 LOAD_UTF8_CHARCLASS_ALNUM(); 3432 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8); 3433 } 3434 else { 3435 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln)); 3436 n = isALNUM_LC_utf8((U8*)locinput); 3437 } 3438 } 3439 else { 3440 ln = (locinput != PL_bostr) ? 3441 UCHARAT(locinput - 1) : '\n'; 3442 if (OP(scan) == BOUND || OP(scan) == NBOUND) { 3443 ln = isALNUM(ln); 3444 n = isALNUM(nextchr); 3445 } 3446 else { 3447 ln = isALNUM_LC(ln); 3448 n = isALNUM_LC(nextchr); 3449 } 3450 } 3451 if (((!ln) == (!n)) == (OP(scan) == BOUND || 3452 OP(scan) == BOUNDL)) 3453 sayNO; 3454 break; 3455 case SPACEL: 3456 PL_reg_flags |= RF_tainted; 3457 /* FALL THROUGH */ 3458 case SPACE: 3459 if (!nextchr) 3460 sayNO; 3461 if (do_utf8) { 3462 if (UTF8_IS_CONTINUED(nextchr)) { 3463 LOAD_UTF8_CHARCLASS_SPACE(); 3464 if (!(OP(scan) == SPACE 3465 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8) 3466 : isSPACE_LC_utf8((U8*)locinput))) 3467 { 3468 sayNO; 3469 } 3470 locinput += PL_utf8skip[nextchr]; 3471 nextchr = UCHARAT(locinput); 3472 break; 3473 } 3474 if (!(OP(scan) == SPACE 3475 ? isSPACE(nextchr) : isSPACE_LC(nextchr))) 3476 sayNO; 3477 nextchr = UCHARAT(++locinput); 3478 } 3479 else { 3480 if (!(OP(scan) == SPACE 3481 ? isSPACE(nextchr) : isSPACE_LC(nextchr))) 3482 sayNO; 3483 nextchr = UCHARAT(++locinput); 3484 } 3485 break; 3486 case NSPACEL: 3487 PL_reg_flags |= RF_tainted; 3488 /* FALL THROUGH */ 3489 case NSPACE: 3490 if (!nextchr && locinput >= PL_regeol) 3491 sayNO; 3492 if (do_utf8) { 3493 LOAD_UTF8_CHARCLASS_SPACE(); 3494 if (OP(scan) == NSPACE 3495 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8) 3496 : isSPACE_LC_utf8((U8*)locinput)) 3497 { 3498 sayNO; 3499 } 3500 locinput += PL_utf8skip[nextchr]; 3501 nextchr = UCHARAT(locinput); 3502 break; 3503 } 3504 if (OP(scan) == NSPACE 3505 ? isSPACE(nextchr) : isSPACE_LC(nextchr)) 3506 sayNO; 3507 nextchr = UCHARAT(++locinput); 3508 break; 3509 case DIGITL: 3510 PL_reg_flags |= RF_tainted; 3511 /* FALL THROUGH */ 3512 case DIGIT: 3513 if (!nextchr) 3514 sayNO; 3515 if (do_utf8) { 3516 LOAD_UTF8_CHARCLASS_DIGIT(); 3517 if (!(OP(scan) == DIGIT 3518 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8) 3519 : isDIGIT_LC_utf8((U8*)locinput))) 3520 { 3521 sayNO; 3522 } 3523 locinput += PL_utf8skip[nextchr]; 3524 nextchr = UCHARAT(locinput); 3525 break; 3526 } 3527 if (!(OP(scan) == DIGIT 3528 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))) 3529 sayNO; 3530 nextchr = UCHARAT(++locinput); 3531 break; 3532 case NDIGITL: 3533 PL_reg_flags |= RF_tainted; 3534 /* FALL THROUGH */ 3535 case NDIGIT: 3536 if (!nextchr && locinput >= PL_regeol) 3537 sayNO; 3538 if (do_utf8) { 3539 LOAD_UTF8_CHARCLASS_DIGIT(); 3540 if (OP(scan) == NDIGIT 3541 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8) 3542 : isDIGIT_LC_utf8((U8*)locinput)) 3543 { 3544 sayNO; 3545 } 3546 locinput += PL_utf8skip[nextchr]; 3547 nextchr = UCHARAT(locinput); 3548 break; 3549 } 3550 if (OP(scan) == NDIGIT 3551 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)) 3552 sayNO; 3553 nextchr = UCHARAT(++locinput); 3554 break; 3555 case CLUMP: 3556 if (locinput >= PL_regeol) 3557 sayNO; 3558 if (do_utf8) { 3559 LOAD_UTF8_CHARCLASS_MARK(); 3560 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8)) 3561 sayNO; 3562 locinput += PL_utf8skip[nextchr]; 3563 while (locinput < PL_regeol && 3564 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8)) 3565 locinput += UTF8SKIP(locinput); 3566 if (locinput > PL_regeol) 3567 sayNO; 3568 } 3569 else 3570 locinput++; 3571 nextchr = UCHARAT(locinput); 3572 break; 3573 3574 case NREFFL: 3575 { 3576 char *s; 3577 char type; 3578 PL_reg_flags |= RF_tainted; 3579 /* FALL THROUGH */ 3580 case NREF: 3581 case NREFF: 3582 type = OP(scan); 3583 n = reg_check_named_buff_matched(rex,scan); 3584 3585 if ( n ) { 3586 type = REF + ( type - NREF ); 3587 goto do_ref; 3588 } else { 3589 sayNO; 3590 } 3591 /* unreached */ 3592 case REFFL: 3593 PL_reg_flags |= RF_tainted; 3594 /* FALL THROUGH */ 3595 case REF: 3596 case REFF: 3597 n = ARG(scan); /* which paren pair */ 3598 type = OP(scan); 3599 do_ref: 3600 ln = PL_regoffs[n].start; 3601 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */ 3602 if (*PL_reglastparen < n || ln == -1) 3603 sayNO; /* Do not match unless seen CLOSEn. */ 3604 if (ln == PL_regoffs[n].end) 3605 break; 3606 3607 s = PL_bostr + ln; 3608 if (do_utf8 && type != REF) { /* REF can do byte comparison */ 3609 char *l = locinput; 3610 const char *e = PL_bostr + PL_regoffs[n].end; 3611 /* 3612 * Note that we can't do the "other character" lookup trick as 3613 * in the 8-bit case (no pun intended) because in Unicode we 3614 * have to map both upper and title case to lower case. 3615 */ 3616 if (type == REFF) { 3617 while (s < e) { 3618 STRLEN ulen1, ulen2; 3619 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1]; 3620 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1]; 3621 3622 if (l >= PL_regeol) 3623 sayNO; 3624 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1); 3625 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2); 3626 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1)) 3627 sayNO; 3628 s += ulen1; 3629 l += ulen2; 3630 } 3631 } 3632 locinput = l; 3633 nextchr = UCHARAT(locinput); 3634 break; 3635 } 3636 3637 /* Inline the first character, for speed. */ 3638 if (UCHARAT(s) != nextchr && 3639 (type == REF || 3640 (UCHARAT(s) != (type == REFF 3641 ? PL_fold : PL_fold_locale)[nextchr]))) 3642 sayNO; 3643 ln = PL_regoffs[n].end - ln; 3644 if (locinput + ln > PL_regeol) 3645 sayNO; 3646 if (ln > 1 && (type == REF 3647 ? memNE(s, locinput, ln) 3648 : (type == REFF 3649 ? ibcmp(s, locinput, ln) 3650 : ibcmp_locale(s, locinput, ln)))) 3651 sayNO; 3652 locinput += ln; 3653 nextchr = UCHARAT(locinput); 3654 break; 3655 } 3656 case NOTHING: 3657 case TAIL: 3658 break; 3659 case BACK: 3660 break; 3661 3662 #undef ST 3663 #define ST st->u.eval 3664 { 3665 SV *ret; 3666 regexp *re; 3667 regexp_internal *rei; 3668 regnode *startpoint; 3669 3670 case GOSTART: 3671 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */ 3672 if (cur_eval && cur_eval->locinput==locinput) { 3673 if (cur_eval->u.eval.close_paren == (U32)ARG(scan)) 3674 Perl_croak(aTHX_ "Infinite recursion in regex"); 3675 if ( ++nochange_depth > max_nochange_depth ) 3676 Perl_croak(aTHX_ 3677 "Pattern subroutine nesting without pos change" 3678 " exceeded limit in regex"); 3679 } else { 3680 nochange_depth = 0; 3681 } 3682 re = rex; 3683 rei = rexi; 3684 (void)ReREFCNT_inc(rex); 3685 if (OP(scan)==GOSUB) { 3686 startpoint = scan + ARG2L(scan); 3687 ST.close_paren = ARG(scan); 3688 } else { 3689 startpoint = rei->program+1; 3690 ST.close_paren = 0; 3691 } 3692 goto eval_recurse_doit; 3693 /* NOTREACHED */ 3694 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */ 3695 if (cur_eval && cur_eval->locinput==locinput) { 3696 if ( ++nochange_depth > max_nochange_depth ) 3697 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex"); 3698 } else { 3699 nochange_depth = 0; 3700 } 3701 { 3702 /* execute the code in the {...} */ 3703 dSP; 3704 SV ** const before = SP; 3705 OP_4tree * const oop = PL_op; 3706 COP * const ocurcop = PL_curcop; 3707 PAD *old_comppad; 3708 3709 n = ARG(scan); 3710 PL_op = (OP_4tree*)rexi->data->data[n]; 3711 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log, 3712 " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) ); 3713 PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]); 3714 PL_regoffs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr; 3715 3716 if (sv_yes_mark) { 3717 SV *sv_mrk = get_sv("REGMARK", 1); 3718 sv_setsv(sv_mrk, sv_yes_mark); 3719 } 3720 3721 CALLRUNOPS(aTHX); /* Scalar context. */ 3722 SPAGAIN; 3723 if (SP == before) 3724 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */ 3725 else { 3726 ret = POPs; 3727 PUTBACK; 3728 } 3729 3730 PL_op = oop; 3731 PAD_RESTORE_LOCAL(old_comppad); 3732 PL_curcop = ocurcop; 3733 if (!logical) { 3734 /* /(?{...})/ */ 3735 sv_setsv(save_scalar(PL_replgv), ret); 3736 break; 3737 } 3738 } 3739 if (logical == 2) { /* Postponed subexpression: /(??{...})/ */ 3740 logical = 0; 3741 { 3742 /* extract RE object from returned value; compiling if 3743 * necessary */ 3744 3745 MAGIC *mg = NULL; 3746 const SV *sv; 3747 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret))) 3748 mg = mg_find(sv, PERL_MAGIC_qr); 3749 else if (SvSMAGICAL(ret)) { 3750 if (SvGMAGICAL(ret)) 3751 sv_unmagic(ret, PERL_MAGIC_qr); 3752 else 3753 mg = mg_find(ret, PERL_MAGIC_qr); 3754 } 3755 3756 if (mg) { 3757 re = reg_temp_copy((regexp *)mg->mg_obj); /*XXX:dmq*/ 3758 } 3759 else { 3760 U32 pm_flags = 0; 3761 const I32 osize = PL_regsize; 3762 3763 if (DO_UTF8(ret)) pm_flags |= RXf_UTF8; 3764 re = CALLREGCOMP(ret, pm_flags); 3765 if (!(SvFLAGS(ret) 3766 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY 3767 | SVs_GMG))) 3768 sv_magic(ret,MUTABLE_SV(ReREFCNT_inc(re)), 3769 PERL_MAGIC_qr,0,0); 3770 PL_regsize = osize; 3771 } 3772 } 3773 RXp_MATCH_COPIED_off(re); 3774 re->subbeg = rex->subbeg; 3775 re->sublen = rex->sublen; 3776 rei = RXi_GET(re); 3777 DEBUG_EXECUTE_r( 3778 debug_start_match(re, do_utf8, locinput, PL_regeol, 3779 "Matching embedded"); 3780 ); 3781 startpoint = rei->program + 1; 3782 ST.close_paren = 0; /* only used for GOSUB */ 3783 /* borrowed from regtry */ 3784 if (PL_reg_start_tmpl <= re->nparens) { 3785 PL_reg_start_tmpl = re->nparens*3/2 + 3; 3786 if(PL_reg_start_tmp) 3787 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*); 3788 else 3789 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*); 3790 } 3791 3792 eval_recurse_doit: /* Share code with GOSUB below this line */ 3793 /* run the pattern returned from (??{...}) */ 3794 ST.cp = regcppush(0); /* Save *all* the positions. */ 3795 REGCP_SET(ST.lastcp); 3796 3797 PL_regoffs = re->offs; /* essentially NOOP on GOSUB */ 3798 3799 /* see regtry, specifically PL_reglast(?:close)?paren is a pointer! (i dont know why) :dmq */ 3800 PL_reglastparen = &re->lastparen; 3801 PL_reglastcloseparen = &re->lastcloseparen; 3802 re->lastparen = 0; 3803 re->lastcloseparen = 0; 3804 3805 PL_reginput = locinput; 3806 PL_regsize = 0; 3807 3808 /* XXXX This is too dramatic a measure... */ 3809 PL_reg_maxiter = 0; 3810 3811 ST.toggle_reg_flags = PL_reg_flags; 3812 if (RX_UTF8(re)) 3813 PL_reg_flags |= RF_utf8; 3814 else 3815 PL_reg_flags &= ~RF_utf8; 3816 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */ 3817 3818 ST.prev_rex = rex; 3819 ST.prev_curlyx = cur_curlyx; 3820 SETREX(rex,re); 3821 rexi = rei; 3822 cur_curlyx = NULL; 3823 ST.B = next; 3824 ST.prev_eval = cur_eval; 3825 cur_eval = st; 3826 /* now continue from first node in postoned RE */ 3827 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint); 3828 /* NOTREACHED */ 3829 } 3830 /* logical is 1, /(?(?{...})X|Y)/ */ 3831 sw = (bool)SvTRUE(ret); 3832 logical = 0; 3833 break; 3834 } 3835 3836 case EVAL_AB: /* cleanup after a successful (??{A})B */ 3837 /* note: this is called twice; first after popping B, then A */ 3838 PL_reg_flags ^= ST.toggle_reg_flags; 3839 ReREFCNT_dec(rex); 3840 SETREX(rex,ST.prev_rex); 3841 rexi = RXi_GET(rex); 3842 regcpblow(ST.cp); 3843 cur_eval = ST.prev_eval; 3844 cur_curlyx = ST.prev_curlyx; 3845 3846 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */ 3847 PL_reglastparen = &rex->lastparen; 3848 PL_reglastcloseparen = &rex->lastcloseparen; 3849 /* also update PL_regoffs */ 3850 PL_regoffs = rex->offs; 3851 3852 /* XXXX This is too dramatic a measure... */ 3853 PL_reg_maxiter = 0; 3854 if ( nochange_depth ) 3855 nochange_depth--; 3856 sayYES; 3857 3858 3859 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */ 3860 /* note: this is called twice; first after popping B, then A */ 3861 PL_reg_flags ^= ST.toggle_reg_flags; 3862 ReREFCNT_dec(rex); 3863 SETREX(rex,ST.prev_rex); 3864 rexi = RXi_GET(rex); 3865 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */ 3866 PL_reglastparen = &rex->lastparen; 3867 PL_reglastcloseparen = &rex->lastcloseparen; 3868 3869 PL_reginput = locinput; 3870 REGCP_UNWIND(ST.lastcp); 3871 regcppop(rex); 3872 cur_eval = ST.prev_eval; 3873 cur_curlyx = ST.prev_curlyx; 3874 /* XXXX This is too dramatic a measure... */ 3875 PL_reg_maxiter = 0; 3876 if ( nochange_depth ) 3877 nochange_depth--; 3878 sayNO_SILENT; 3879 #undef ST 3880 3881 case OPEN: 3882 n = ARG(scan); /* which paren pair */ 3883 PL_reg_start_tmp[n] = locinput; 3884 if (n > PL_regsize) 3885 PL_regsize = n; 3886 lastopen = n; 3887 break; 3888 case CLOSE: 3889 n = ARG(scan); /* which paren pair */ 3890 PL_regoffs[n].start = PL_reg_start_tmp[n] - PL_bostr; 3891 PL_regoffs[n].end = locinput - PL_bostr; 3892 /*if (n > PL_regsize) 3893 PL_regsize = n;*/ 3894 if (n > *PL_reglastparen) 3895 *PL_reglastparen = n; 3896 *PL_reglastcloseparen = n; 3897 if (cur_eval && cur_eval->u.eval.close_paren == n) { 3898 goto fake_end; 3899 } 3900 break; 3901 case ACCEPT: 3902 if (ARG(scan)){ 3903 regnode *cursor; 3904 for (cursor=scan; 3905 cursor && OP(cursor)!=END; 3906 cursor=regnext(cursor)) 3907 { 3908 if ( OP(cursor)==CLOSE ){ 3909 n = ARG(cursor); 3910 if ( n <= lastopen ) { 3911 PL_regoffs[n].start 3912 = PL_reg_start_tmp[n] - PL_bostr; 3913 PL_regoffs[n].end = locinput - PL_bostr; 3914 /*if (n > PL_regsize) 3915 PL_regsize = n;*/ 3916 if (n > *PL_reglastparen) 3917 *PL_reglastparen = n; 3918 *PL_reglastcloseparen = n; 3919 if ( n == ARG(scan) || (cur_eval && 3920 cur_eval->u.eval.close_paren == n)) 3921 break; 3922 } 3923 } 3924 } 3925 } 3926 goto fake_end; 3927 /*NOTREACHED*/ 3928 case GROUPP: 3929 n = ARG(scan); /* which paren pair */ 3930 sw = (bool)(*PL_reglastparen >= n && PL_regoffs[n].end != -1); 3931 break; 3932 case NGROUPP: 3933 /* reg_check_named_buff_matched returns 0 for no match */ 3934 sw = (bool)(0 < reg_check_named_buff_matched(rex,scan)); 3935 break; 3936 case INSUBP: 3937 n = ARG(scan); 3938 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n)); 3939 break; 3940 case DEFINEP: 3941 sw = 0; 3942 break; 3943 case IFTHEN: 3944 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */ 3945 if (sw) 3946 next = NEXTOPER(NEXTOPER(scan)); 3947 else { 3948 next = scan + ARG(scan); 3949 if (OP(next) == IFTHEN) /* Fake one. */ 3950 next = NEXTOPER(NEXTOPER(next)); 3951 } 3952 break; 3953 case LOGICAL: 3954 logical = scan->flags; 3955 break; 3956 3957 /******************************************************************* 3958 3959 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/ 3960 pattern, where A and B are subpatterns. (For simple A, CURLYM or 3961 STAR/PLUS/CURLY/CURLYN are used instead.) 3962 3963 A*B is compiled as <CURLYX><A><WHILEM><B> 3964 3965 On entry to the subpattern, CURLYX is called. This pushes a CURLYX 3966 state, which contains the current count, initialised to -1. It also sets 3967 cur_curlyx to point to this state, with any previous value saved in the 3968 state block. 3969 3970 CURLYX then jumps straight to the WHILEM op, rather than executing A, 3971 since the pattern may possibly match zero times (i.e. it's a while {} loop 3972 rather than a do {} while loop). 3973 3974 Each entry to WHILEM represents a successful match of A. The count in the 3975 CURLYX block is incremented, another WHILEM state is pushed, and execution 3976 passes to A or B depending on greediness and the current count. 3977 3978 For example, if matching against the string a1a2a3b (where the aN are 3979 substrings that match /A/), then the match progresses as follows: (the 3980 pushed states are interspersed with the bits of strings matched so far): 3981 3982 <CURLYX cnt=-1> 3983 <CURLYX cnt=0><WHILEM> 3984 <CURLYX cnt=1><WHILEM> a1 <WHILEM> 3985 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM> 3986 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> 3987 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b 3988 3989 (Contrast this with something like CURLYM, which maintains only a single 3990 backtrack state: 3991 3992 <CURLYM cnt=0> a1 3993 a1 <CURLYM cnt=1> a2 3994 a1 a2 <CURLYM cnt=2> a3 3995 a1 a2 a3 <CURLYM cnt=3> b 3996 ) 3997 3998 Each WHILEM state block marks a point to backtrack to upon partial failure 3999 of A or B, and also contains some minor state data related to that 4000 iteration. The CURLYX block, pointed to by cur_curlyx, contains the 4001 overall state, such as the count, and pointers to the A and B ops. 4002 4003 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx 4004 must always point to the *current* CURLYX block, the rules are: 4005 4006 When executing CURLYX, save the old cur_curlyx in the CURLYX state block, 4007 and set cur_curlyx to point the new block. 4008 4009 When popping the CURLYX block after a successful or unsuccessful match, 4010 restore the previous cur_curlyx. 4011 4012 When WHILEM is about to execute B, save the current cur_curlyx, and set it 4013 to the outer one saved in the CURLYX block. 4014 4015 When popping the WHILEM block after a successful or unsuccessful B match, 4016 restore the previous cur_curlyx. 4017 4018 Here's an example for the pattern (AI* BI)*BO 4019 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM: 4020 4021 cur_ 4022 curlyx backtrack stack 4023 ------ --------------- 4024 NULL 4025 CO <CO prev=NULL> <WO> 4026 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai 4027 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi 4028 NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo 4029 4030 At this point the pattern succeeds, and we work back down the stack to 4031 clean up, restoring as we go: 4032 4033 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi 4034 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai 4035 CO <CO prev=NULL> <WO> 4036 NULL 4037 4038 *******************************************************************/ 4039 4040 #define ST st->u.curlyx 4041 4042 case CURLYX: /* start of /A*B/ (for complex A) */ 4043 { 4044 /* No need to save/restore up to this paren */ 4045 I32 parenfloor = scan->flags; 4046 4047 assert(next); /* keep Coverity happy */ 4048 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */ 4049 next += ARG(next); 4050 4051 /* XXXX Probably it is better to teach regpush to support 4052 parenfloor > PL_regsize... */ 4053 if (parenfloor > (I32)*PL_reglastparen) 4054 parenfloor = *PL_reglastparen; /* Pessimization... */ 4055 4056 ST.prev_curlyx= cur_curlyx; 4057 cur_curlyx = st; 4058 ST.cp = PL_savestack_ix; 4059 4060 /* these fields contain the state of the current curly. 4061 * they are accessed by subsequent WHILEMs */ 4062 ST.parenfloor = parenfloor; 4063 ST.min = ARG1(scan); 4064 ST.max = ARG2(scan); 4065 ST.A = NEXTOPER(scan) + EXTRA_STEP_2ARGS; 4066 ST.B = next; 4067 ST.minmod = minmod; 4068 minmod = 0; 4069 ST.count = -1; /* this will be updated by WHILEM */ 4070 ST.lastloc = NULL; /* this will be updated by WHILEM */ 4071 4072 PL_reginput = locinput; 4073 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next)); 4074 /* NOTREACHED */ 4075 } 4076 4077 case CURLYX_end: /* just finished matching all of A*B */ 4078 cur_curlyx = ST.prev_curlyx; 4079 sayYES; 4080 /* NOTREACHED */ 4081 4082 case CURLYX_end_fail: /* just failed to match all of A*B */ 4083 regcpblow(ST.cp); 4084 cur_curlyx = ST.prev_curlyx; 4085 sayNO; 4086 /* NOTREACHED */ 4087 4088 4089 #undef ST 4090 #define ST st->u.whilem 4091 4092 case WHILEM: /* just matched an A in /A*B/ (for complex A) */ 4093 { 4094 /* see the discussion above about CURLYX/WHILEM */ 4095 I32 n; 4096 assert(cur_curlyx); /* keep Coverity happy */ 4097 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */ 4098 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc; 4099 ST.cache_offset = 0; 4100 ST.cache_mask = 0; 4101 4102 PL_reginput = locinput; 4103 4104 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, 4105 "%*s whilem: matched %ld out of %ld..%ld\n", 4106 REPORT_CODE_OFF+depth*2, "", (long)n, 4107 (long)cur_curlyx->u.curlyx.min, 4108 (long)cur_curlyx->u.curlyx.max) 4109 ); 4110 4111 /* First just match a string of min A's. */ 4112 4113 if (n < cur_curlyx->u.curlyx.min) { 4114 cur_curlyx->u.curlyx.lastloc = locinput; 4115 PUSH_STATE_GOTO(WHILEM_A_pre, cur_curlyx->u.curlyx.A); 4116 /* NOTREACHED */ 4117 } 4118 4119 /* If degenerate A matches "", assume A done. */ 4120 4121 if (locinput == cur_curlyx->u.curlyx.lastloc) { 4122 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, 4123 "%*s whilem: empty match detected, trying continuation...\n", 4124 REPORT_CODE_OFF+depth*2, "") 4125 ); 4126 goto do_whilem_B_max; 4127 } 4128 4129 /* super-linear cache processing */ 4130 4131 if (scan->flags) { 4132 4133 if (!PL_reg_maxiter) { 4134 /* start the countdown: Postpone detection until we 4135 * know the match is not *that* much linear. */ 4136 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4); 4137 /* possible overflow for long strings and many CURLYX's */ 4138 if (PL_reg_maxiter < 0) 4139 PL_reg_maxiter = I32_MAX; 4140 PL_reg_leftiter = PL_reg_maxiter; 4141 } 4142 4143 if (PL_reg_leftiter-- == 0) { 4144 /* initialise cache */ 4145 const I32 size = (PL_reg_maxiter + 7)/8; 4146 if (PL_reg_poscache) { 4147 if ((I32)PL_reg_poscache_size < size) { 4148 Renew(PL_reg_poscache, size, char); 4149 PL_reg_poscache_size = size; 4150 } 4151 Zero(PL_reg_poscache, size, char); 4152 } 4153 else { 4154 PL_reg_poscache_size = size; 4155 Newxz(PL_reg_poscache, size, char); 4156 } 4157 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, 4158 "%swhilem: Detected a super-linear match, switching on caching%s...\n", 4159 PL_colors[4], PL_colors[5]) 4160 ); 4161 } 4162 4163 if (PL_reg_leftiter < 0) { 4164 /* have we already failed at this position? */ 4165 I32 offset, mask; 4166 offset = (scan->flags & 0xf) - 1 4167 + (locinput - PL_bostr) * (scan->flags>>4); 4168 mask = 1 << (offset % 8); 4169 offset /= 8; 4170 if (PL_reg_poscache[offset] & mask) { 4171 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, 4172 "%*s whilem: (cache) already tried at this position...\n", 4173 REPORT_CODE_OFF+depth*2, "") 4174 ); 4175 sayNO; /* cache records failure */ 4176 } 4177 ST.cache_offset = offset; 4178 ST.cache_mask = mask; 4179 } 4180 } 4181 4182 /* Prefer B over A for minimal matching. */ 4183 4184 if (cur_curlyx->u.curlyx.minmod) { 4185 ST.save_curlyx = cur_curlyx; 4186 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx; 4187 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor); 4188 REGCP_SET(ST.lastcp); 4189 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B); 4190 /* NOTREACHED */ 4191 } 4192 4193 /* Prefer A over B for maximal matching. */ 4194 4195 if (n < cur_curlyx->u.curlyx.max) { /* More greed allowed? */ 4196 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor); 4197 cur_curlyx->u.curlyx.lastloc = locinput; 4198 REGCP_SET(ST.lastcp); 4199 PUSH_STATE_GOTO(WHILEM_A_max, cur_curlyx->u.curlyx.A); 4200 /* NOTREACHED */ 4201 } 4202 goto do_whilem_B_max; 4203 } 4204 /* NOTREACHED */ 4205 4206 case WHILEM_B_min: /* just matched B in a minimal match */ 4207 case WHILEM_B_max: /* just matched B in a maximal match */ 4208 cur_curlyx = ST.save_curlyx; 4209 sayYES; 4210 /* NOTREACHED */ 4211 4212 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */ 4213 cur_curlyx = ST.save_curlyx; 4214 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc; 4215 cur_curlyx->u.curlyx.count--; 4216 CACHEsayNO; 4217 /* NOTREACHED */ 4218 4219 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */ 4220 REGCP_UNWIND(ST.lastcp); 4221 regcppop(rex); 4222 /* FALL THROUGH */ 4223 case WHILEM_A_pre_fail: /* just failed to match even minimal A */ 4224 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc; 4225 cur_curlyx->u.curlyx.count--; 4226 CACHEsayNO; 4227 /* NOTREACHED */ 4228 4229 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */ 4230 REGCP_UNWIND(ST.lastcp); 4231 regcppop(rex); /* Restore some previous $<digit>s? */ 4232 PL_reginput = locinput; 4233 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, 4234 "%*s whilem: failed, trying continuation...\n", 4235 REPORT_CODE_OFF+depth*2, "") 4236 ); 4237 do_whilem_B_max: 4238 if (cur_curlyx->u.curlyx.count >= REG_INFTY 4239 && ckWARN(WARN_REGEXP) 4240 && !(PL_reg_flags & RF_warned)) 4241 { 4242 PL_reg_flags |= RF_warned; 4243 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded", 4244 "Complex regular subexpression recursion", 4245 REG_INFTY - 1); 4246 } 4247 4248 /* now try B */ 4249 ST.save_curlyx = cur_curlyx; 4250 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx; 4251 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B); 4252 /* NOTREACHED */ 4253 4254 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */ 4255 cur_curlyx = ST.save_curlyx; 4256 REGCP_UNWIND(ST.lastcp); 4257 regcppop(rex); 4258 4259 if (cur_curlyx->u.curlyx.count >= cur_curlyx->u.curlyx.max) { 4260 /* Maximum greed exceeded */ 4261 if (cur_curlyx->u.curlyx.count >= REG_INFTY 4262 && ckWARN(WARN_REGEXP) 4263 && !(PL_reg_flags & RF_warned)) 4264 { 4265 PL_reg_flags |= RF_warned; 4266 Perl_warner(aTHX_ packWARN(WARN_REGEXP), 4267 "%s limit (%d) exceeded", 4268 "Complex regular subexpression recursion", 4269 REG_INFTY - 1); 4270 } 4271 cur_curlyx->u.curlyx.count--; 4272 CACHEsayNO; 4273 } 4274 4275 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, 4276 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "") 4277 ); 4278 /* Try grabbing another A and see if it helps. */ 4279 PL_reginput = locinput; 4280 cur_curlyx->u.curlyx.lastloc = locinput; 4281 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor); 4282 REGCP_SET(ST.lastcp); 4283 PUSH_STATE_GOTO(WHILEM_A_min, ST.save_curlyx->u.curlyx.A); 4284 /* NOTREACHED */ 4285 4286 #undef ST 4287 #define ST st->u.branch 4288 4289 case BRANCHJ: /* /(...|A|...)/ with long next pointer */ 4290 next = scan + ARG(scan); 4291 if (next == scan) 4292 next = NULL; 4293 scan = NEXTOPER(scan); 4294 /* FALL THROUGH */ 4295 4296 case BRANCH: /* /(...|A|...)/ */ 4297 scan = NEXTOPER(scan); /* scan now points to inner node */ 4298 ST.lastparen = *PL_reglastparen; 4299 ST.next_branch = next; 4300 REGCP_SET(ST.cp); 4301 PL_reginput = locinput; 4302 4303 /* Now go into the branch */ 4304 if (has_cutgroup) { 4305 PUSH_YES_STATE_GOTO(BRANCH_next, scan); 4306 } else { 4307 PUSH_STATE_GOTO(BRANCH_next, scan); 4308 } 4309 /* NOTREACHED */ 4310 case CUTGROUP: 4311 PL_reginput = locinput; 4312 sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL : 4313 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); 4314 PUSH_STATE_GOTO(CUTGROUP_next,next); 4315 /* NOTREACHED */ 4316 case CUTGROUP_next_fail: 4317 do_cutgroup = 1; 4318 no_final = 1; 4319 if (st->u.mark.mark_name) 4320 sv_commit = st->u.mark.mark_name; 4321 sayNO; 4322 /* NOTREACHED */ 4323 case BRANCH_next: 4324 sayYES; 4325 /* NOTREACHED */ 4326 case BRANCH_next_fail: /* that branch failed; try the next, if any */ 4327 if (do_cutgroup) { 4328 do_cutgroup = 0; 4329 no_final = 0; 4330 } 4331 REGCP_UNWIND(ST.cp); 4332 for (n = *PL_reglastparen; n > ST.lastparen; n--) 4333 PL_regoffs[n].end = -1; 4334 *PL_reglastparen = n; 4335 /*dmq: *PL_reglastcloseparen = n; */ 4336 scan = ST.next_branch; 4337 /* no more branches? */ 4338 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) { 4339 DEBUG_EXECUTE_r({ 4340 PerlIO_printf( Perl_debug_log, 4341 "%*s %sBRANCH failed...%s\n", 4342 REPORT_CODE_OFF+depth*2, "", 4343 PL_colors[4], 4344 PL_colors[5] ); 4345 }); 4346 sayNO_SILENT; 4347 } 4348 continue; /* execute next BRANCH[J] op */ 4349 /* NOTREACHED */ 4350 4351 case MINMOD: 4352 minmod = 1; 4353 break; 4354 4355 #undef ST 4356 #define ST st->u.curlym 4357 4358 case CURLYM: /* /A{m,n}B/ where A is fixed-length */ 4359 4360 /* This is an optimisation of CURLYX that enables us to push 4361 * only a single backtracking state, no matter how many matches 4362 * there are in {m,n}. It relies on the pattern being constant 4363 * length, with no parens to influence future backrefs 4364 */ 4365 4366 ST.me = scan; 4367 scan = NEXTOPER(scan) + NODE_STEP_REGNODE; 4368 4369 /* if paren positive, emulate an OPEN/CLOSE around A */ 4370 if (ST.me->flags) { 4371 U32 paren = ST.me->flags; 4372 if (paren > PL_regsize) 4373 PL_regsize = paren; 4374 if (paren > *PL_reglastparen) 4375 *PL_reglastparen = paren; 4376 scan += NEXT_OFF(scan); /* Skip former OPEN. */ 4377 } 4378 ST.A = scan; 4379 ST.B = next; 4380 ST.alen = 0; 4381 ST.count = 0; 4382 ST.minmod = minmod; 4383 minmod = 0; 4384 ST.c1 = CHRTEST_UNINIT; 4385 REGCP_SET(ST.cp); 4386 4387 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */ 4388 goto curlym_do_B; 4389 4390 curlym_do_A: /* execute the A in /A{m,n}B/ */ 4391 PL_reginput = locinput; 4392 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */ 4393 /* NOTREACHED */ 4394 4395 case CURLYM_A: /* we've just matched an A */ 4396 locinput = st->locinput; 4397 nextchr = UCHARAT(locinput); 4398 4399 ST.count++; 4400 /* after first match, determine A's length: u.curlym.alen */ 4401 if (ST.count == 1) { 4402 if (PL_reg_match_utf8) { 4403 char *s = locinput; 4404 while (s < PL_reginput) { 4405 ST.alen++; 4406 s += UTF8SKIP(s); 4407 } 4408 } 4409 else { 4410 ST.alen = PL_reginput - locinput; 4411 } 4412 if (ST.alen == 0) 4413 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me); 4414 } 4415 DEBUG_EXECUTE_r( 4416 PerlIO_printf(Perl_debug_log, 4417 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n", 4418 (int)(REPORT_CODE_OFF+(depth*2)), "", 4419 (IV) ST.count, (IV)ST.alen) 4420 ); 4421 4422 locinput = PL_reginput; 4423 4424 if (cur_eval && cur_eval->u.eval.close_paren && 4425 cur_eval->u.eval.close_paren == (U32)ST.me->flags) 4426 goto fake_end; 4427 4428 { 4429 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me)); 4430 if ( max == REG_INFTY || ST.count < max ) 4431 goto curlym_do_A; /* try to match another A */ 4432 } 4433 goto curlym_do_B; /* try to match B */ 4434 4435 case CURLYM_A_fail: /* just failed to match an A */ 4436 REGCP_UNWIND(ST.cp); 4437 4438 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/ 4439 || (cur_eval && cur_eval->u.eval.close_paren && 4440 cur_eval->u.eval.close_paren == (U32)ST.me->flags)) 4441 sayNO; 4442 4443 curlym_do_B: /* execute the B in /A{m,n}B/ */ 4444 PL_reginput = locinput; 4445 if (ST.c1 == CHRTEST_UNINIT) { 4446 /* calculate c1 and c2 for possible match of 1st char 4447 * following curly */ 4448 ST.c1 = ST.c2 = CHRTEST_VOID; 4449 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) { 4450 regnode *text_node = ST.B; 4451 if (! HAS_TEXT(text_node)) 4452 FIND_NEXT_IMPT(text_node); 4453 /* this used to be 4454 4455 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT) 4456 4457 But the former is redundant in light of the latter. 4458 4459 if this changes back then the macro for 4460 IS_TEXT and friends need to change. 4461 */ 4462 if (PL_regkind[OP(text_node)] == EXACT) 4463 { 4464 4465 ST.c1 = (U8)*STRING(text_node); 4466 ST.c2 = 4467 (IS_TEXTF(text_node)) 4468 ? PL_fold[ST.c1] 4469 : (IS_TEXTFL(text_node)) 4470 ? PL_fold_locale[ST.c1] 4471 : ST.c1; 4472 } 4473 } 4474 } 4475 4476 DEBUG_EXECUTE_r( 4477 PerlIO_printf(Perl_debug_log, 4478 "%*s CURLYM trying tail with matches=%"IVdf"...\n", 4479 (int)(REPORT_CODE_OFF+(depth*2)), 4480 "", (IV)ST.count) 4481 ); 4482 if (ST.c1 != CHRTEST_VOID 4483 && UCHARAT(PL_reginput) != ST.c1 4484 && UCHARAT(PL_reginput) != ST.c2) 4485 { 4486 /* simulate B failing */ 4487 DEBUG_OPTIMISE_r( 4488 PerlIO_printf(Perl_debug_log, 4489 "%*s CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n", 4490 (int)(REPORT_CODE_OFF+(depth*2)),"", 4491 (IV)ST.c1,(IV)ST.c2 4492 )); 4493 state_num = CURLYM_B_fail; 4494 goto reenter_switch; 4495 } 4496 4497 if (ST.me->flags) { 4498 /* mark current A as captured */ 4499 I32 paren = ST.me->flags; 4500 if (ST.count) { 4501 PL_regoffs[paren].start 4502 = HOPc(PL_reginput, -ST.alen) - PL_bostr; 4503 PL_regoffs[paren].end = PL_reginput - PL_bostr; 4504 /*dmq: *PL_reglastcloseparen = paren; */ 4505 } 4506 else 4507 PL_regoffs[paren].end = -1; 4508 if (cur_eval && cur_eval->u.eval.close_paren && 4509 cur_eval->u.eval.close_paren == (U32)ST.me->flags) 4510 { 4511 if (ST.count) 4512 goto fake_end; 4513 else 4514 sayNO; 4515 } 4516 } 4517 4518 PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */ 4519 /* NOTREACHED */ 4520 4521 case CURLYM_B_fail: /* just failed to match a B */ 4522 REGCP_UNWIND(ST.cp); 4523 if (ST.minmod) { 4524 I32 max = ARG2(ST.me); 4525 if (max != REG_INFTY && ST.count == max) 4526 sayNO; 4527 goto curlym_do_A; /* try to match a further A */ 4528 } 4529 /* backtrack one A */ 4530 if (ST.count == ARG1(ST.me) /* min */) 4531 sayNO; 4532 ST.count--; 4533 locinput = HOPc(locinput, -ST.alen); 4534 goto curlym_do_B; /* try to match B */ 4535 4536 #undef ST 4537 #define ST st->u.curly 4538 4539 #define CURLY_SETPAREN(paren, success) \ 4540 if (paren) { \ 4541 if (success) { \ 4542 PL_regoffs[paren].start = HOPc(locinput, -1) - PL_bostr; \ 4543 PL_regoffs[paren].end = locinput - PL_bostr; \ 4544 *PL_reglastcloseparen = paren; \ 4545 } \ 4546 else \ 4547 PL_regoffs[paren].end = -1; \ 4548 } 4549 4550 case STAR: /* /A*B/ where A is width 1 */ 4551 ST.paren = 0; 4552 ST.min = 0; 4553 ST.max = REG_INFTY; 4554 scan = NEXTOPER(scan); 4555 goto repeat; 4556 case PLUS: /* /A+B/ where A is width 1 */ 4557 ST.paren = 0; 4558 ST.min = 1; 4559 ST.max = REG_INFTY; 4560 scan = NEXTOPER(scan); 4561 goto repeat; 4562 case CURLYN: /* /(A){m,n}B/ where A is width 1 */ 4563 ST.paren = scan->flags; /* Which paren to set */ 4564 if (ST.paren > PL_regsize) 4565 PL_regsize = ST.paren; 4566 if (ST.paren > *PL_reglastparen) 4567 *PL_reglastparen = ST.paren; 4568 ST.min = ARG1(scan); /* min to match */ 4569 ST.max = ARG2(scan); /* max to match */ 4570 if (cur_eval && cur_eval->u.eval.close_paren && 4571 cur_eval->u.eval.close_paren == (U32)ST.paren) { 4572 ST.min=1; 4573 ST.max=1; 4574 } 4575 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE); 4576 goto repeat; 4577 case CURLY: /* /A{m,n}B/ where A is width 1 */ 4578 ST.paren = 0; 4579 ST.min = ARG1(scan); /* min to match */ 4580 ST.max = ARG2(scan); /* max to match */ 4581 scan = NEXTOPER(scan) + NODE_STEP_REGNODE; 4582 repeat: 4583 /* 4584 * Lookahead to avoid useless match attempts 4585 * when we know what character comes next. 4586 * 4587 * Used to only do .*x and .*?x, but now it allows 4588 * for )'s, ('s and (?{ ... })'s to be in the way 4589 * of the quantifier and the EXACT-like node. -- japhy 4590 */ 4591 4592 if (ST.min > ST.max) /* XXX make this a compile-time check? */ 4593 sayNO; 4594 if (HAS_TEXT(next) || JUMPABLE(next)) { 4595 U8 *s; 4596 regnode *text_node = next; 4597 4598 if (! HAS_TEXT(text_node)) 4599 FIND_NEXT_IMPT(text_node); 4600 4601 if (! HAS_TEXT(text_node)) 4602 ST.c1 = ST.c2 = CHRTEST_VOID; 4603 else { 4604 if ( PL_regkind[OP(text_node)] != EXACT ) { 4605 ST.c1 = ST.c2 = CHRTEST_VOID; 4606 goto assume_ok_easy; 4607 } 4608 else 4609 s = (U8*)STRING(text_node); 4610 4611 /* Currently we only get here when 4612 4613 PL_rekind[OP(text_node)] == EXACT 4614 4615 if this changes back then the macro for IS_TEXT and 4616 friends need to change. */ 4617 if (!UTF) { 4618 ST.c2 = ST.c1 = *s; 4619 if (IS_TEXTF(text_node)) 4620 ST.c2 = PL_fold[ST.c1]; 4621 else if (IS_TEXTFL(text_node)) 4622 ST.c2 = PL_fold_locale[ST.c1]; 4623 } 4624 else { /* UTF */ 4625 if (IS_TEXTF(text_node)) { 4626 STRLEN ulen1, ulen2; 4627 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1]; 4628 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1]; 4629 4630 to_utf8_lower((U8*)s, tmpbuf1, &ulen1); 4631 to_utf8_upper((U8*)s, tmpbuf2, &ulen2); 4632 #ifdef EBCDIC 4633 ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0, 4634 ckWARN(WARN_UTF8) ? 4635 0 : UTF8_ALLOW_ANY); 4636 ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0, 4637 ckWARN(WARN_UTF8) ? 4638 0 : UTF8_ALLOW_ANY); 4639 #else 4640 ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0, 4641 uniflags); 4642 ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0, 4643 uniflags); 4644 #endif 4645 } 4646 else { 4647 ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0, 4648 uniflags); 4649 } 4650 } 4651 } 4652 } 4653 else 4654 ST.c1 = ST.c2 = CHRTEST_VOID; 4655 assume_ok_easy: 4656 4657 ST.A = scan; 4658 ST.B = next; 4659 PL_reginput = locinput; 4660 if (minmod) { 4661 minmod = 0; 4662 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min) 4663 sayNO; 4664 ST.count = ST.min; 4665 locinput = PL_reginput; 4666 REGCP_SET(ST.cp); 4667 if (ST.c1 == CHRTEST_VOID) 4668 goto curly_try_B_min; 4669 4670 ST.oldloc = locinput; 4671 4672 /* set ST.maxpos to the furthest point along the 4673 * string that could possibly match */ 4674 if (ST.max == REG_INFTY) { 4675 ST.maxpos = PL_regeol - 1; 4676 if (do_utf8) 4677 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos)) 4678 ST.maxpos--; 4679 } 4680 else if (do_utf8) { 4681 int m = ST.max - ST.min; 4682 for (ST.maxpos = locinput; 4683 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--) 4684 ST.maxpos += UTF8SKIP(ST.maxpos); 4685 } 4686 else { 4687 ST.maxpos = locinput + ST.max - ST.min; 4688 if (ST.maxpos >= PL_regeol) 4689 ST.maxpos = PL_regeol - 1; 4690 } 4691 goto curly_try_B_min_known; 4692 4693 } 4694 else { 4695 ST.count = regrepeat(rex, ST.A, ST.max, depth); 4696 locinput = PL_reginput; 4697 if (ST.count < ST.min) 4698 sayNO; 4699 if ((ST.count > ST.min) 4700 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL)) 4701 { 4702 /* A{m,n} must come at the end of the string, there's 4703 * no point in backing off ... */ 4704 ST.min = ST.count; 4705 /* ...except that $ and \Z can match before *and* after 4706 newline at the end. Consider "\n\n" =~ /\n+\Z\n/. 4707 We may back off by one in this case. */ 4708 if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS) 4709 ST.min--; 4710 } 4711 REGCP_SET(ST.cp); 4712 goto curly_try_B_max; 4713 } 4714 /* NOTREACHED */ 4715 4716 4717 case CURLY_B_min_known_fail: 4718 /* failed to find B in a non-greedy match where c1,c2 valid */ 4719 if (ST.paren && ST.count) 4720 PL_regoffs[ST.paren].end = -1; 4721 4722 PL_reginput = locinput; /* Could be reset... */ 4723 REGCP_UNWIND(ST.cp); 4724 /* Couldn't or didn't -- move forward. */ 4725 ST.oldloc = locinput; 4726 if (do_utf8) 4727 locinput += UTF8SKIP(locinput); 4728 else 4729 locinput++; 4730 ST.count++; 4731 curly_try_B_min_known: 4732 /* find the next place where 'B' could work, then call B */ 4733 { 4734 int n; 4735 if (do_utf8) { 4736 n = (ST.oldloc == locinput) ? 0 : 1; 4737 if (ST.c1 == ST.c2) { 4738 STRLEN len; 4739 /* set n to utf8_distance(oldloc, locinput) */ 4740 while (locinput <= ST.maxpos && 4741 utf8n_to_uvchr((U8*)locinput, 4742 UTF8_MAXBYTES, &len, 4743 uniflags) != (UV)ST.c1) { 4744 locinput += len; 4745 n++; 4746 } 4747 } 4748 else { 4749 /* set n to utf8_distance(oldloc, locinput) */ 4750 while (locinput <= ST.maxpos) { 4751 STRLEN len; 4752 const UV c = utf8n_to_uvchr((U8*)locinput, 4753 UTF8_MAXBYTES, &len, 4754 uniflags); 4755 if (c == (UV)ST.c1 || c == (UV)ST.c2) 4756 break; 4757 locinput += len; 4758 n++; 4759 } 4760 } 4761 } 4762 else { 4763 if (ST.c1 == ST.c2) { 4764 while (locinput <= ST.maxpos && 4765 UCHARAT(locinput) != ST.c1) 4766 locinput++; 4767 } 4768 else { 4769 while (locinput <= ST.maxpos 4770 && UCHARAT(locinput) != ST.c1 4771 && UCHARAT(locinput) != ST.c2) 4772 locinput++; 4773 } 4774 n = locinput - ST.oldloc; 4775 } 4776 if (locinput > ST.maxpos) 4777 sayNO; 4778 /* PL_reginput == oldloc now */ 4779 if (n) { 4780 ST.count += n; 4781 if (regrepeat(rex, ST.A, n, depth) < n) 4782 sayNO; 4783 } 4784 PL_reginput = locinput; 4785 CURLY_SETPAREN(ST.paren, ST.count); 4786 if (cur_eval && cur_eval->u.eval.close_paren && 4787 cur_eval->u.eval.close_paren == (U32)ST.paren) { 4788 goto fake_end; 4789 } 4790 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B); 4791 } 4792 /* NOTREACHED */ 4793 4794 4795 case CURLY_B_min_fail: 4796 /* failed to find B in a non-greedy match where c1,c2 invalid */ 4797 if (ST.paren && ST.count) 4798 PL_regoffs[ST.paren].end = -1; 4799 4800 REGCP_UNWIND(ST.cp); 4801 /* failed -- move forward one */ 4802 PL_reginput = locinput; 4803 if (regrepeat(rex, ST.A, 1, depth)) { 4804 ST.count++; 4805 locinput = PL_reginput; 4806 if (ST.count <= ST.max || (ST.max == REG_INFTY && 4807 ST.count > 0)) /* count overflow ? */ 4808 { 4809 curly_try_B_min: 4810 CURLY_SETPAREN(ST.paren, ST.count); 4811 if (cur_eval && cur_eval->u.eval.close_paren && 4812 cur_eval->u.eval.close_paren == (U32)ST.paren) { 4813 goto fake_end; 4814 } 4815 PUSH_STATE_GOTO(CURLY_B_min, ST.B); 4816 } 4817 } 4818 sayNO; 4819 /* NOTREACHED */ 4820 4821 4822 curly_try_B_max: 4823 /* a successful greedy match: now try to match B */ 4824 if (cur_eval && cur_eval->u.eval.close_paren && 4825 cur_eval->u.eval.close_paren == (U32)ST.paren) { 4826 goto fake_end; 4827 } 4828 { 4829 UV c = 0; 4830 if (ST.c1 != CHRTEST_VOID) 4831 c = do_utf8 ? utf8n_to_uvchr((U8*)PL_reginput, 4832 UTF8_MAXBYTES, 0, uniflags) 4833 : (UV) UCHARAT(PL_reginput); 4834 /* If it could work, try it. */ 4835 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) { 4836 CURLY_SETPAREN(ST.paren, ST.count); 4837 PUSH_STATE_GOTO(CURLY_B_max, ST.B); 4838 /* NOTREACHED */ 4839 } 4840 } 4841 /* FALL THROUGH */ 4842 case CURLY_B_max_fail: 4843 /* failed to find B in a greedy match */ 4844 if (ST.paren && ST.count) 4845 PL_regoffs[ST.paren].end = -1; 4846 4847 REGCP_UNWIND(ST.cp); 4848 /* back up. */ 4849 if (--ST.count < ST.min) 4850 sayNO; 4851 PL_reginput = locinput = HOPc(locinput, -1); 4852 goto curly_try_B_max; 4853 4854 #undef ST 4855 4856 case END: 4857 fake_end: 4858 if (cur_eval) { 4859 /* we've just finished A in /(??{A})B/; now continue with B */ 4860 I32 tmpix; 4861 st->u.eval.toggle_reg_flags 4862 = cur_eval->u.eval.toggle_reg_flags; 4863 PL_reg_flags ^= st->u.eval.toggle_reg_flags; 4864 4865 st->u.eval.prev_rex = rex; /* inner */ 4866 SETREX(rex,cur_eval->u.eval.prev_rex); 4867 rexi = RXi_GET(rex); 4868 cur_curlyx = cur_eval->u.eval.prev_curlyx; 4869 ReREFCNT_inc(rex); 4870 st->u.eval.cp = regcppush(0); /* Save *all* the positions. */ 4871 4872 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */ 4873 PL_reglastparen = &rex->lastparen; 4874 PL_reglastcloseparen = &rex->lastcloseparen; 4875 4876 REGCP_SET(st->u.eval.lastcp); 4877 PL_reginput = locinput; 4878 4879 /* Restore parens of the outer rex without popping the 4880 * savestack */ 4881 tmpix = PL_savestack_ix; 4882 PL_savestack_ix = cur_eval->u.eval.lastcp; 4883 regcppop(rex); 4884 PL_savestack_ix = tmpix; 4885 4886 st->u.eval.prev_eval = cur_eval; 4887 cur_eval = cur_eval->u.eval.prev_eval; 4888 DEBUG_EXECUTE_r( 4889 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n", 4890 REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval));); 4891 if ( nochange_depth ) 4892 nochange_depth--; 4893 4894 PUSH_YES_STATE_GOTO(EVAL_AB, 4895 st->u.eval.prev_eval->u.eval.B); /* match B */ 4896 } 4897 4898 if (locinput < reginfo->till) { 4899 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, 4900 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n", 4901 PL_colors[4], 4902 (long)(locinput - PL_reg_starttry), 4903 (long)(reginfo->till - PL_reg_starttry), 4904 PL_colors[5])); 4905 4906 sayNO_SILENT; /* Cannot match: too short. */ 4907 } 4908 PL_reginput = locinput; /* put where regtry can find it */ 4909 sayYES; /* Success! */ 4910 4911 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */ 4912 DEBUG_EXECUTE_r( 4913 PerlIO_printf(Perl_debug_log, 4914 "%*s %ssubpattern success...%s\n", 4915 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])); 4916 PL_reginput = locinput; /* put where regtry can find it */ 4917 sayYES; /* Success! */ 4918 4919 #undef ST 4920 #define ST st->u.ifmatch 4921 4922 case SUSPEND: /* (?>A) */ 4923 ST.wanted = 1; 4924 PL_reginput = locinput; 4925 goto do_ifmatch; 4926 4927 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */ 4928 ST.wanted = 0; 4929 goto ifmatch_trivial_fail_test; 4930 4931 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */ 4932 ST.wanted = 1; 4933 ifmatch_trivial_fail_test: 4934 if (scan->flags) { 4935 char * const s = HOPBACKc(locinput, scan->flags); 4936 if (!s) { 4937 /* trivial fail */ 4938 if (logical) { 4939 logical = 0; 4940 sw = 1 - (bool)ST.wanted; 4941 } 4942 else if (ST.wanted) 4943 sayNO; 4944 next = scan + ARG(scan); 4945 if (next == scan) 4946 next = NULL; 4947 break; 4948 } 4949 PL_reginput = s; 4950 } 4951 else 4952 PL_reginput = locinput; 4953 4954 do_ifmatch: 4955 ST.me = scan; 4956 ST.logical = logical; 4957 logical = 0; /* XXX: reset state of logical once it has been saved into ST */ 4958 4959 /* execute body of (?...A) */ 4960 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan))); 4961 /* NOTREACHED */ 4962 4963 case IFMATCH_A_fail: /* body of (?...A) failed */ 4964 ST.wanted = !ST.wanted; 4965 /* FALL THROUGH */ 4966 4967 case IFMATCH_A: /* body of (?...A) succeeded */ 4968 if (ST.logical) { 4969 sw = (bool)ST.wanted; 4970 } 4971 else if (!ST.wanted) 4972 sayNO; 4973 4974 if (OP(ST.me) == SUSPEND) 4975 locinput = PL_reginput; 4976 else { 4977 locinput = PL_reginput = st->locinput; 4978 nextchr = UCHARAT(locinput); 4979 } 4980 scan = ST.me + ARG(ST.me); 4981 if (scan == ST.me) 4982 scan = NULL; 4983 continue; /* execute B */ 4984 4985 #undef ST 4986 4987 case LONGJMP: 4988 next = scan + ARG(scan); 4989 if (next == scan) 4990 next = NULL; 4991 break; 4992 case COMMIT: 4993 reginfo->cutpoint = PL_regeol; 4994 /* FALLTHROUGH */ 4995 case PRUNE: 4996 PL_reginput = locinput; 4997 if (!scan->flags) 4998 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); 4999 PUSH_STATE_GOTO(COMMIT_next,next); 5000 /* NOTREACHED */ 5001 case COMMIT_next_fail: 5002 no_final = 1; 5003 /* FALLTHROUGH */ 5004 case OPFAIL: 5005 sayNO; 5006 /* NOTREACHED */ 5007 5008 #define ST st->u.mark 5009 case MARKPOINT: 5010 ST.prev_mark = mark_state; 5011 ST.mark_name = sv_commit = sv_yes_mark 5012 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); 5013 mark_state = st; 5014 ST.mark_loc = PL_reginput = locinput; 5015 PUSH_YES_STATE_GOTO(MARKPOINT_next,next); 5016 /* NOTREACHED */ 5017 case MARKPOINT_next: 5018 mark_state = ST.prev_mark; 5019 sayYES; 5020 /* NOTREACHED */ 5021 case MARKPOINT_next_fail: 5022 if (popmark && sv_eq(ST.mark_name,popmark)) 5023 { 5024 if (ST.mark_loc > startpoint) 5025 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1); 5026 popmark = NULL; /* we found our mark */ 5027 sv_commit = ST.mark_name; 5028 5029 DEBUG_EXECUTE_r({ 5030 PerlIO_printf(Perl_debug_log, 5031 "%*s %ssetting cutpoint to mark:%"SVf"...%s\n", 5032 REPORT_CODE_OFF+depth*2, "", 5033 PL_colors[4], SVfARG(sv_commit), PL_colors[5]); 5034 }); 5035 } 5036 mark_state = ST.prev_mark; 5037 sv_yes_mark = mark_state ? 5038 mark_state->u.mark.mark_name : NULL; 5039 sayNO; 5040 /* NOTREACHED */ 5041 case SKIP: 5042 PL_reginput = locinput; 5043 if (scan->flags) { 5044 /* (*SKIP) : if we fail we cut here*/ 5045 ST.mark_name = NULL; 5046 ST.mark_loc = locinput; 5047 PUSH_STATE_GOTO(SKIP_next,next); 5048 } else { 5049 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was, 5050 otherwise do nothing. Meaning we need to scan 5051 */ 5052 regmatch_state *cur = mark_state; 5053 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); 5054 5055 while (cur) { 5056 if ( sv_eq( cur->u.mark.mark_name, 5057 find ) ) 5058 { 5059 ST.mark_name = find; 5060 PUSH_STATE_GOTO( SKIP_next, next ); 5061 } 5062 cur = cur->u.mark.prev_mark; 5063 } 5064 } 5065 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */ 5066 break; 5067 case SKIP_next_fail: 5068 if (ST.mark_name) { 5069 /* (*CUT:NAME) - Set up to search for the name as we 5070 collapse the stack*/ 5071 popmark = ST.mark_name; 5072 } else { 5073 /* (*CUT) - No name, we cut here.*/ 5074 if (ST.mark_loc > startpoint) 5075 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1); 5076 /* but we set sv_commit to latest mark_name if there 5077 is one so they can test to see how things lead to this 5078 cut */ 5079 if (mark_state) 5080 sv_commit=mark_state->u.mark.mark_name; 5081 } 5082 no_final = 1; 5083 sayNO; 5084 /* NOTREACHED */ 5085 #undef ST 5086 case FOLDCHAR: 5087 n = ARG(scan); 5088 if ( n == (U32)what_len_TRICKYFOLD(locinput,do_utf8,ln) ) { 5089 locinput += ln; 5090 } else if ( 0xDF == n && !do_utf8 && !UTF ) { 5091 sayNO; 5092 } else { 5093 U8 folded[UTF8_MAXBYTES_CASE+1]; 5094 STRLEN foldlen; 5095 const char * const l = locinput; 5096 char *e = PL_regeol; 5097 to_uni_fold(n, folded, &foldlen); 5098 5099 if (ibcmp_utf8((const char*) folded, 0, foldlen, 1, 5100 l, &e, 0, do_utf8)) { 5101 sayNO; 5102 } 5103 locinput = e; 5104 } 5105 nextchr = UCHARAT(locinput); 5106 break; 5107 case LNBREAK: 5108 if ((n=is_LNBREAK(locinput,do_utf8))) { 5109 locinput += n; 5110 nextchr = UCHARAT(locinput); 5111 } else 5112 sayNO; 5113 break; 5114 5115 #define CASE_CLASS(nAmE) \ 5116 case nAmE: \ 5117 if ((n=is_##nAmE(locinput,do_utf8))) { \ 5118 locinput += n; \ 5119 nextchr = UCHARAT(locinput); \ 5120 } else \ 5121 sayNO; \ 5122 break; \ 5123 case N##nAmE: \ 5124 if ((n=is_##nAmE(locinput,do_utf8))) { \ 5125 sayNO; \ 5126 } else { \ 5127 locinput += UTF8SKIP(locinput); \ 5128 nextchr = UCHARAT(locinput); \ 5129 } \ 5130 break 5131 5132 CASE_CLASS(VERTWS); 5133 CASE_CLASS(HORIZWS); 5134 #undef CASE_CLASS 5135 5136 default: 5137 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n", 5138 PTR2UV(scan), OP(scan)); 5139 Perl_croak(aTHX_ "regexp memory corruption"); 5140 5141 } /* end switch */ 5142 5143 /* switch break jumps here */ 5144 scan = next; /* prepare to execute the next op and ... */ 5145 continue; /* ... jump back to the top, reusing st */ 5146 /* NOTREACHED */ 5147 5148 push_yes_state: 5149 /* push a state that backtracks on success */ 5150 st->u.yes.prev_yes_state = yes_state; 5151 yes_state = st; 5152 /* FALL THROUGH */ 5153 push_state: 5154 /* push a new regex state, then continue at scan */ 5155 { 5156 regmatch_state *newst; 5157 5158 DEBUG_STACK_r({ 5159 regmatch_state *cur = st; 5160 regmatch_state *curyes = yes_state; 5161 int curd = depth; 5162 regmatch_slab *slab = PL_regmatch_slab; 5163 for (;curd > -1;cur--,curd--) { 5164 if (cur < SLAB_FIRST(slab)) { 5165 slab = slab->prev; 5166 cur = SLAB_LAST(slab); 5167 } 5168 PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n", 5169 REPORT_CODE_OFF + 2 + depth * 2,"", 5170 curd, PL_reg_name[cur->resume_state], 5171 (curyes == cur) ? "yes" : "" 5172 ); 5173 if (curyes == cur) 5174 curyes = cur->u.yes.prev_yes_state; 5175 } 5176 } else 5177 DEBUG_STATE_pp("push") 5178 ); 5179 depth++; 5180 st->locinput = locinput; 5181 newst = st+1; 5182 if (newst > SLAB_LAST(PL_regmatch_slab)) 5183 newst = S_push_slab(aTHX); 5184 PL_regmatch_state = newst; 5185 5186 locinput = PL_reginput; 5187 nextchr = UCHARAT(locinput); 5188 st = newst; 5189 continue; 5190 /* NOTREACHED */ 5191 } 5192 } 5193 5194 /* 5195 * We get here only if there's trouble -- normally "case END" is 5196 * the terminating point. 5197 */ 5198 Perl_croak(aTHX_ "corrupted regexp pointers"); 5199 /*NOTREACHED*/ 5200 sayNO; 5201 5202 yes: 5203 if (yes_state) { 5204 /* we have successfully completed a subexpression, but we must now 5205 * pop to the state marked by yes_state and continue from there */ 5206 assert(st != yes_state); 5207 #ifdef DEBUGGING 5208 while (st != yes_state) { 5209 st--; 5210 if (st < SLAB_FIRST(PL_regmatch_slab)) { 5211 PL_regmatch_slab = PL_regmatch_slab->prev; 5212 st = SLAB_LAST(PL_regmatch_slab); 5213 } 5214 DEBUG_STATE_r({ 5215 if (no_final) { 5216 DEBUG_STATE_pp("pop (no final)"); 5217 } else { 5218 DEBUG_STATE_pp("pop (yes)"); 5219 } 5220 }); 5221 depth--; 5222 } 5223 #else 5224 while (yes_state < SLAB_FIRST(PL_regmatch_slab) 5225 || yes_state > SLAB_LAST(PL_regmatch_slab)) 5226 { 5227 /* not in this slab, pop slab */ 5228 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1); 5229 PL_regmatch_slab = PL_regmatch_slab->prev; 5230 st = SLAB_LAST(PL_regmatch_slab); 5231 } 5232 depth -= (st - yes_state); 5233 #endif 5234 st = yes_state; 5235 yes_state = st->u.yes.prev_yes_state; 5236 PL_regmatch_state = st; 5237 5238 if (no_final) { 5239 locinput= st->locinput; 5240 nextchr = UCHARAT(locinput); 5241 } 5242 state_num = st->resume_state + no_final; 5243 goto reenter_switch; 5244 } 5245 5246 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n", 5247 PL_colors[4], PL_colors[5])); 5248 5249 if (PL_reg_eval_set) { 5250 /* each successfully executed (?{...}) block does the equivalent of 5251 * local $^R = do {...} 5252 * When popping the save stack, all these locals would be undone; 5253 * bypass this by setting the outermost saved $^R to the latest 5254 * value */ 5255 if (oreplsv != GvSV(PL_replgv)) 5256 sv_setsv(oreplsv, GvSV(PL_replgv)); 5257 } 5258 result = 1; 5259 goto final_exit; 5260 5261 no: 5262 DEBUG_EXECUTE_r( 5263 PerlIO_printf(Perl_debug_log, 5264 "%*s %sfailed...%s\n", 5265 REPORT_CODE_OFF+depth*2, "", 5266 PL_colors[4], PL_colors[5]) 5267 ); 5268 5269 no_silent: 5270 if (no_final) { 5271 if (yes_state) { 5272 goto yes; 5273 } else { 5274 goto final_exit; 5275 } 5276 } 5277 if (depth) { 5278 /* there's a previous state to backtrack to */ 5279 st--; 5280 if (st < SLAB_FIRST(PL_regmatch_slab)) { 5281 PL_regmatch_slab = PL_regmatch_slab->prev; 5282 st = SLAB_LAST(PL_regmatch_slab); 5283 } 5284 PL_regmatch_state = st; 5285 locinput= st->locinput; 5286 nextchr = UCHARAT(locinput); 5287 5288 DEBUG_STATE_pp("pop"); 5289 depth--; 5290 if (yes_state == st) 5291 yes_state = st->u.yes.prev_yes_state; 5292 5293 state_num = st->resume_state + 1; /* failure = success + 1 */ 5294 goto reenter_switch; 5295 } 5296 result = 0; 5297 5298 final_exit: 5299 if (rex->intflags & PREGf_VERBARG_SEEN) { 5300 SV *sv_err = get_sv("REGERROR", 1); 5301 SV *sv_mrk = get_sv("REGMARK", 1); 5302 if (result) { 5303 sv_commit = &PL_sv_no; 5304 if (!sv_yes_mark) 5305 sv_yes_mark = &PL_sv_yes; 5306 } else { 5307 if (!sv_commit) 5308 sv_commit = &PL_sv_yes; 5309 sv_yes_mark = &PL_sv_no; 5310 } 5311 sv_setsv(sv_err, sv_commit); 5312 sv_setsv(sv_mrk, sv_yes_mark); 5313 } 5314 5315 /* clean up; in particular, free all slabs above current one */ 5316 LEAVE_SCOPE(oldsave); 5317 5318 return result; 5319 } 5320 5321 /* 5322 - regrepeat - repeatedly match something simple, report how many 5323 */ 5324 /* 5325 * [This routine now assumes that it will only match on things of length 1. 5326 * That was true before, but now we assume scan - reginput is the count, 5327 * rather than incrementing count on every character. [Er, except utf8.]] 5328 */ 5329 STATIC I32 5330 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) 5331 { 5332 dVAR; 5333 register char *scan; 5334 register I32 c; 5335 register char *loceol = PL_regeol; 5336 register I32 hardcount = 0; 5337 register bool do_utf8 = PL_reg_match_utf8; 5338 #ifndef DEBUGGING 5339 PERL_UNUSED_ARG(depth); 5340 #endif 5341 5342 PERL_ARGS_ASSERT_REGREPEAT; 5343 5344 scan = PL_reginput; 5345 if (max == REG_INFTY) 5346 max = I32_MAX; 5347 else if (max < loceol - scan) 5348 loceol = scan + max; 5349 switch (OP(p)) { 5350 case REG_ANY: 5351 if (do_utf8) { 5352 loceol = PL_regeol; 5353 while (scan < loceol && hardcount < max && *scan != '\n') { 5354 scan += UTF8SKIP(scan); 5355 hardcount++; 5356 } 5357 } else { 5358 while (scan < loceol && *scan != '\n') 5359 scan++; 5360 } 5361 break; 5362 case SANY: 5363 if (do_utf8) { 5364 loceol = PL_regeol; 5365 while (scan < loceol && hardcount < max) { 5366 scan += UTF8SKIP(scan); 5367 hardcount++; 5368 } 5369 } 5370 else 5371 scan = loceol; 5372 break; 5373 case CANY: 5374 scan = loceol; 5375 break; 5376 case EXACT: /* length of string is 1 */ 5377 c = (U8)*STRING(p); 5378 while (scan < loceol && UCHARAT(scan) == c) 5379 scan++; 5380 break; 5381 case EXACTF: /* length of string is 1 */ 5382 c = (U8)*STRING(p); 5383 while (scan < loceol && 5384 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c])) 5385 scan++; 5386 break; 5387 case EXACTFL: /* length of string is 1 */ 5388 PL_reg_flags |= RF_tainted; 5389 c = (U8)*STRING(p); 5390 while (scan < loceol && 5391 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c])) 5392 scan++; 5393 break; 5394 case ANYOF: 5395 if (do_utf8) { 5396 loceol = PL_regeol; 5397 while (hardcount < max && scan < loceol && 5398 reginclass(prog, p, (U8*)scan, 0, do_utf8)) { 5399 scan += UTF8SKIP(scan); 5400 hardcount++; 5401 } 5402 } else { 5403 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan)) 5404 scan++; 5405 } 5406 break; 5407 case ALNUM: 5408 if (do_utf8) { 5409 loceol = PL_regeol; 5410 LOAD_UTF8_CHARCLASS_ALNUM(); 5411 while (hardcount < max && scan < loceol && 5412 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) { 5413 scan += UTF8SKIP(scan); 5414 hardcount++; 5415 } 5416 } else { 5417 while (scan < loceol && isALNUM(*scan)) 5418 scan++; 5419 } 5420 break; 5421 case ALNUML: 5422 PL_reg_flags |= RF_tainted; 5423 if (do_utf8) { 5424 loceol = PL_regeol; 5425 while (hardcount < max && scan < loceol && 5426 isALNUM_LC_utf8((U8*)scan)) { 5427 scan += UTF8SKIP(scan); 5428 hardcount++; 5429 } 5430 } else { 5431 while (scan < loceol && isALNUM_LC(*scan)) 5432 scan++; 5433 } 5434 break; 5435 case NALNUM: 5436 if (do_utf8) { 5437 loceol = PL_regeol; 5438 LOAD_UTF8_CHARCLASS_ALNUM(); 5439 while (hardcount < max && scan < loceol && 5440 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) { 5441 scan += UTF8SKIP(scan); 5442 hardcount++; 5443 } 5444 } else { 5445 while (scan < loceol && !isALNUM(*scan)) 5446 scan++; 5447 } 5448 break; 5449 case NALNUML: 5450 PL_reg_flags |= RF_tainted; 5451 if (do_utf8) { 5452 loceol = PL_regeol; 5453 while (hardcount < max && scan < loceol && 5454 !isALNUM_LC_utf8((U8*)scan)) { 5455 scan += UTF8SKIP(scan); 5456 hardcount++; 5457 } 5458 } else { 5459 while (scan < loceol && !isALNUM_LC(*scan)) 5460 scan++; 5461 } 5462 break; 5463 case SPACE: 5464 if (do_utf8) { 5465 loceol = PL_regeol; 5466 LOAD_UTF8_CHARCLASS_SPACE(); 5467 while (hardcount < max && scan < loceol && 5468 (*scan == ' ' || 5469 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) { 5470 scan += UTF8SKIP(scan); 5471 hardcount++; 5472 } 5473 } else { 5474 while (scan < loceol && isSPACE(*scan)) 5475 scan++; 5476 } 5477 break; 5478 case SPACEL: 5479 PL_reg_flags |= RF_tainted; 5480 if (do_utf8) { 5481 loceol = PL_regeol; 5482 while (hardcount < max && scan < loceol && 5483 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) { 5484 scan += UTF8SKIP(scan); 5485 hardcount++; 5486 } 5487 } else { 5488 while (scan < loceol && isSPACE_LC(*scan)) 5489 scan++; 5490 } 5491 break; 5492 case NSPACE: 5493 if (do_utf8) { 5494 loceol = PL_regeol; 5495 LOAD_UTF8_CHARCLASS_SPACE(); 5496 while (hardcount < max && scan < loceol && 5497 !(*scan == ' ' || 5498 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) { 5499 scan += UTF8SKIP(scan); 5500 hardcount++; 5501 } 5502 } else { 5503 while (scan < loceol && !isSPACE(*scan)) 5504 scan++; 5505 } 5506 break; 5507 case NSPACEL: 5508 PL_reg_flags |= RF_tainted; 5509 if (do_utf8) { 5510 loceol = PL_regeol; 5511 while (hardcount < max && scan < loceol && 5512 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) { 5513 scan += UTF8SKIP(scan); 5514 hardcount++; 5515 } 5516 } else { 5517 while (scan < loceol && !isSPACE_LC(*scan)) 5518 scan++; 5519 } 5520 break; 5521 case DIGIT: 5522 if (do_utf8) { 5523 loceol = PL_regeol; 5524 LOAD_UTF8_CHARCLASS_DIGIT(); 5525 while (hardcount < max && scan < loceol && 5526 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) { 5527 scan += UTF8SKIP(scan); 5528 hardcount++; 5529 } 5530 } else { 5531 while (scan < loceol && isDIGIT(*scan)) 5532 scan++; 5533 } 5534 break; 5535 case NDIGIT: 5536 if (do_utf8) { 5537 loceol = PL_regeol; 5538 LOAD_UTF8_CHARCLASS_DIGIT(); 5539 while (hardcount < max && scan < loceol && 5540 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) { 5541 scan += UTF8SKIP(scan); 5542 hardcount++; 5543 } 5544 } else { 5545 while (scan < loceol && !isDIGIT(*scan)) 5546 scan++; 5547 } 5548 case LNBREAK: 5549 if (do_utf8) { 5550 loceol = PL_regeol; 5551 while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) { 5552 scan += c; 5553 hardcount++; 5554 } 5555 } else { 5556 /* 5557 LNBREAK can match two latin chars, which is ok, 5558 because we have a null terminated string, but we 5559 have to use hardcount in this situation 5560 */ 5561 while (scan < loceol && (c=is_LNBREAK_latin1(scan))) { 5562 scan+=c; 5563 hardcount++; 5564 } 5565 } 5566 break; 5567 case HORIZWS: 5568 if (do_utf8) { 5569 loceol = PL_regeol; 5570 while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) { 5571 scan += c; 5572 hardcount++; 5573 } 5574 } else { 5575 while (scan < loceol && is_HORIZWS_latin1(scan)) 5576 scan++; 5577 } 5578 break; 5579 case NHORIZWS: 5580 if (do_utf8) { 5581 loceol = PL_regeol; 5582 while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) { 5583 scan += UTF8SKIP(scan); 5584 hardcount++; 5585 } 5586 } else { 5587 while (scan < loceol && !is_HORIZWS_latin1(scan)) 5588 scan++; 5589 5590 } 5591 break; 5592 case VERTWS: 5593 if (do_utf8) { 5594 loceol = PL_regeol; 5595 while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) { 5596 scan += c; 5597 hardcount++; 5598 } 5599 } else { 5600 while (scan < loceol && is_VERTWS_latin1(scan)) 5601 scan++; 5602 5603 } 5604 break; 5605 case NVERTWS: 5606 if (do_utf8) { 5607 loceol = PL_regeol; 5608 while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) { 5609 scan += UTF8SKIP(scan); 5610 hardcount++; 5611 } 5612 } else { 5613 while (scan < loceol && !is_VERTWS_latin1(scan)) 5614 scan++; 5615 5616 } 5617 break; 5618 5619 default: /* Called on something of 0 width. */ 5620 break; /* So match right here or not at all. */ 5621 } 5622 5623 if (hardcount) 5624 c = hardcount; 5625 else 5626 c = scan - PL_reginput; 5627 PL_reginput = scan; 5628 5629 DEBUG_r({ 5630 GET_RE_DEBUG_FLAGS_DECL; 5631 DEBUG_EXECUTE_r({ 5632 SV * const prop = sv_newmortal(); 5633 regprop(prog, prop, p); 5634 PerlIO_printf(Perl_debug_log, 5635 "%*s %s can match %"IVdf" times out of %"IVdf"...\n", 5636 REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max); 5637 }); 5638 }); 5639 5640 return(c); 5641 } 5642 5643 5644 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) 5645 /* 5646 - regclass_swash - prepare the utf8 swash 5647 */ 5648 5649 SV * 5650 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp) 5651 { 5652 dVAR; 5653 SV *sw = NULL; 5654 SV *si = NULL; 5655 SV *alt = NULL; 5656 RXi_GET_DECL(prog,progi); 5657 const struct reg_data * const data = prog ? progi->data : NULL; 5658 5659 PERL_ARGS_ASSERT_REGCLASS_SWASH; 5660 5661 if (data && data->count) { 5662 const U32 n = ARG(node); 5663 5664 if (data->what[n] == 's') { 5665 SV * const rv = MUTABLE_SV(data->data[n]); 5666 AV * const av = MUTABLE_AV(SvRV(rv)); 5667 SV **const ary = AvARRAY(av); 5668 SV **a, **b; 5669 5670 /* See the end of regcomp.c:S_regclass() for 5671 * documentation of these array elements. */ 5672 5673 si = *ary; 5674 a = SvROK(ary[1]) ? &ary[1] : NULL; 5675 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : NULL; 5676 5677 if (a) 5678 sw = *a; 5679 else if (si && doinit) { 5680 sw = swash_init("utf8", "", si, 1, 0); 5681 (void)av_store(av, 1, sw); 5682 } 5683 if (b) 5684 alt = *b; 5685 } 5686 } 5687 5688 if (listsvp) 5689 *listsvp = si; 5690 if (altsvp) 5691 *altsvp = alt; 5692 5693 return sw; 5694 } 5695 #endif 5696 5697 /* 5698 - reginclass - determine if a character falls into a character class 5699 5700 The n is the ANYOF regnode, the p is the target string, lenp 5701 is pointer to the maximum length of how far to go in the p 5702 (if the lenp is zero, UTF8SKIP(p) is used), 5703 do_utf8 tells whether the target string is in UTF-8. 5704 5705 */ 5706 5707 STATIC bool 5708 S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8) 5709 { 5710 dVAR; 5711 const char flags = ANYOF_FLAGS(n); 5712 bool match = FALSE; 5713 UV c = *p; 5714 STRLEN len = 0; 5715 STRLEN plen; 5716 5717 PERL_ARGS_ASSERT_REGINCLASS; 5718 5719 if (do_utf8 && !UTF8_IS_INVARIANT(c)) { 5720 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len, 5721 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_CHECK_ONLY); 5722 /* see [perl #37836] for UTF8_ALLOW_ANYUV */ 5723 if (len == (STRLEN)-1) 5724 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)"); 5725 } 5726 5727 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c)); 5728 if (do_utf8 || (flags & ANYOF_UNICODE)) { 5729 if (lenp) 5730 *lenp = 0; 5731 if (do_utf8 && !ANYOF_RUNTIME(n)) { 5732 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c)) 5733 match = TRUE; 5734 } 5735 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256) 5736 match = TRUE; 5737 if (!match) { 5738 AV *av; 5739 SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av); 5740 5741 if (sw) { 5742 U8 * utf8_p; 5743 if (do_utf8) { 5744 utf8_p = (U8 *) p; 5745 } else { 5746 STRLEN len = 1; 5747 utf8_p = bytes_to_utf8(p, &len); 5748 } 5749 if (swash_fetch(sw, utf8_p, 1)) 5750 match = TRUE; 5751 else if (flags & ANYOF_FOLD) { 5752 if (!match && lenp && av) { 5753 I32 i; 5754 for (i = 0; i <= av_len(av); i++) { 5755 SV* const sv = *av_fetch(av, i, FALSE); 5756 STRLEN len; 5757 const char * const s = SvPV_const(sv, len); 5758 if (len <= plen && memEQ(s, (char*)utf8_p, len)) { 5759 *lenp = len; 5760 match = TRUE; 5761 break; 5762 } 5763 } 5764 } 5765 if (!match) { 5766 U8 tmpbuf[UTF8_MAXBYTES_CASE+1]; 5767 5768 STRLEN tmplen; 5769 to_utf8_fold(utf8_p, tmpbuf, &tmplen); 5770 if (swash_fetch(sw, tmpbuf, 1)) 5771 match = TRUE; 5772 } 5773 } 5774 5775 /* If we allocated a string above, free it */ 5776 if (! do_utf8) Safefree(utf8_p); 5777 } 5778 } 5779 if (match && lenp && *lenp == 0) 5780 *lenp = UNISKIP(NATIVE_TO_UNI(c)); 5781 } 5782 if (!match && c < 256) { 5783 if (ANYOF_BITMAP_TEST(n, c)) 5784 match = TRUE; 5785 else if (flags & ANYOF_FOLD) { 5786 U8 f; 5787 5788 if (flags & ANYOF_LOCALE) { 5789 PL_reg_flags |= RF_tainted; 5790 f = PL_fold_locale[c]; 5791 } 5792 else 5793 f = PL_fold[c]; 5794 if (f != c && ANYOF_BITMAP_TEST(n, f)) 5795 match = TRUE; 5796 } 5797 5798 if (!match && (flags & ANYOF_CLASS)) { 5799 PL_reg_flags |= RF_tainted; 5800 if ( 5801 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) || 5802 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) || 5803 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) || 5804 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) || 5805 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) || 5806 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) || 5807 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) || 5808 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) || 5809 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) || 5810 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) || 5811 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) || 5812 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) || 5813 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) || 5814 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) || 5815 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) || 5816 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) || 5817 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) || 5818 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) || 5819 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) || 5820 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) || 5821 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) || 5822 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) || 5823 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) || 5824 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) || 5825 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) || 5826 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) || 5827 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) || 5828 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) || 5829 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) || 5830 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c)) 5831 ) /* How's that for a conditional? */ 5832 { 5833 match = TRUE; 5834 } 5835 } 5836 } 5837 5838 return (flags & ANYOF_INVERT) ? !match : match; 5839 } 5840 5841 STATIC U8 * 5842 S_reghop3(U8 *s, I32 off, const U8* lim) 5843 { 5844 dVAR; 5845 5846 PERL_ARGS_ASSERT_REGHOP3; 5847 5848 if (off >= 0) { 5849 while (off-- && s < lim) { 5850 /* XXX could check well-formedness here */ 5851 s += UTF8SKIP(s); 5852 } 5853 } 5854 else { 5855 while (off++ && s > lim) { 5856 s--; 5857 if (UTF8_IS_CONTINUED(*s)) { 5858 while (s > lim && UTF8_IS_CONTINUATION(*s)) 5859 s--; 5860 } 5861 /* XXX could check well-formedness here */ 5862 } 5863 } 5864 return s; 5865 } 5866 5867 #ifdef XXX_dmq 5868 /* there are a bunch of places where we use two reghop3's that should 5869 be replaced with this routine. but since thats not done yet 5870 we ifdef it out - dmq 5871 */ 5872 STATIC U8 * 5873 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim) 5874 { 5875 dVAR; 5876 5877 PERL_ARGS_ASSERT_REGHOP4; 5878 5879 if (off >= 0) { 5880 while (off-- && s < rlim) { 5881 /* XXX could check well-formedness here */ 5882 s += UTF8SKIP(s); 5883 } 5884 } 5885 else { 5886 while (off++ && s > llim) { 5887 s--; 5888 if (UTF8_IS_CONTINUED(*s)) { 5889 while (s > llim && UTF8_IS_CONTINUATION(*s)) 5890 s--; 5891 } 5892 /* XXX could check well-formedness here */ 5893 } 5894 } 5895 return s; 5896 } 5897 #endif 5898 5899 STATIC U8 * 5900 S_reghopmaybe3(U8* s, I32 off, const U8* lim) 5901 { 5902 dVAR; 5903 5904 PERL_ARGS_ASSERT_REGHOPMAYBE3; 5905 5906 if (off >= 0) { 5907 while (off-- && s < lim) { 5908 /* XXX could check well-formedness here */ 5909 s += UTF8SKIP(s); 5910 } 5911 if (off >= 0) 5912 return NULL; 5913 } 5914 else { 5915 while (off++ && s > lim) { 5916 s--; 5917 if (UTF8_IS_CONTINUED(*s)) { 5918 while (s > lim && UTF8_IS_CONTINUATION(*s)) 5919 s--; 5920 } 5921 /* XXX could check well-formedness here */ 5922 } 5923 if (off <= 0) 5924 return NULL; 5925 } 5926 return s; 5927 } 5928 5929 static void 5930 restore_pos(pTHX_ void *arg) 5931 { 5932 dVAR; 5933 regexp * const rex = (regexp *)arg; 5934 if (PL_reg_eval_set) { 5935 if (PL_reg_oldsaved) { 5936 rex->subbeg = PL_reg_oldsaved; 5937 rex->sublen = PL_reg_oldsavedlen; 5938 #ifdef PERL_OLD_COPY_ON_WRITE 5939 rex->saved_copy = PL_nrs; 5940 #endif 5941 RXp_MATCH_COPIED_on(rex); 5942 } 5943 PL_reg_magic->mg_len = PL_reg_oldpos; 5944 PL_reg_eval_set = 0; 5945 PL_curpm = PL_reg_oldcurpm; 5946 } 5947 } 5948 5949 STATIC void 5950 S_to_utf8_substr(pTHX_ register regexp *prog) 5951 { 5952 int i = 1; 5953 5954 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR; 5955 5956 do { 5957 if (prog->substrs->data[i].substr 5958 && !prog->substrs->data[i].utf8_substr) { 5959 SV* const sv = newSVsv(prog->substrs->data[i].substr); 5960 prog->substrs->data[i].utf8_substr = sv; 5961 sv_utf8_upgrade(sv); 5962 if (SvVALID(prog->substrs->data[i].substr)) { 5963 const U8 flags = BmFLAGS(prog->substrs->data[i].substr); 5964 if (flags & FBMcf_TAIL) { 5965 /* Trim the trailing \n that fbm_compile added last 5966 time. */ 5967 SvCUR_set(sv, SvCUR(sv) - 1); 5968 /* Whilst this makes the SV technically "invalid" (as its 5969 buffer is no longer followed by "\0") when fbm_compile() 5970 adds the "\n" back, a "\0" is restored. */ 5971 } 5972 fbm_compile(sv, flags); 5973 } 5974 if (prog->substrs->data[i].substr == prog->check_substr) 5975 prog->check_utf8 = sv; 5976 } 5977 } while (i--); 5978 } 5979 5980 STATIC void 5981 S_to_byte_substr(pTHX_ register regexp *prog) 5982 { 5983 dVAR; 5984 int i = 1; 5985 5986 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR; 5987 5988 do { 5989 if (prog->substrs->data[i].utf8_substr 5990 && !prog->substrs->data[i].substr) { 5991 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr); 5992 if (sv_utf8_downgrade(sv, TRUE)) { 5993 if (SvVALID(prog->substrs->data[i].utf8_substr)) { 5994 const U8 flags 5995 = BmFLAGS(prog->substrs->data[i].utf8_substr); 5996 if (flags & FBMcf_TAIL) { 5997 /* Trim the trailing \n that fbm_compile added last 5998 time. */ 5999 SvCUR_set(sv, SvCUR(sv) - 1); 6000 } 6001 fbm_compile(sv, flags); 6002 } 6003 } else { 6004 SvREFCNT_dec(sv); 6005 sv = &PL_sv_undef; 6006 } 6007 prog->substrs->data[i].substr = sv; 6008 if (prog->substrs->data[i].utf8_substr == prog->check_utf8) 6009 prog->check_substr = sv; 6010 } 6011 } while (i--); 6012 } 6013 6014 /* 6015 * Local variables: 6016 * c-indentation-style: bsd 6017 * c-basic-offset: 4 6018 * indent-tabs-mode: t 6019 * End: 6020 * 6021 * ex: set ts=8 sts=4 sw=4 noet: 6022 */ 6023