1 /* regexec.c 2 */ 3 4 /* 5 * "One Ring to rule them all, One Ring to find them..." 6 */ 7 8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not 9 * confused with the original package (see point 3 below). Thanks, Henry! 10 */ 11 12 /* Additional note: this code is very heavily munged from Henry's version 13 * in places. In some spots I've traded clarity for efficiency, so don't 14 * blame Henry for some of the lack of readability. 15 */ 16 17 /* The names of the functions have been changed from regcomp and 18 * regexec to pregcomp and pregexec in order to avoid conflicts 19 * with the POSIX routines of the same names. 20 */ 21 22 #ifdef PERL_EXT_RE_BUILD 23 /* need to replace pregcomp et al, so enable that */ 24 # ifndef PERL_IN_XSUB_RE 25 # define PERL_IN_XSUB_RE 26 # endif 27 /* need access to debugger hooks */ 28 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING) 29 # define DEBUGGING 30 # endif 31 #endif 32 33 #ifdef PERL_IN_XSUB_RE 34 /* We *really* need to overwrite these symbols: */ 35 # define Perl_regexec_flags my_regexec 36 # define Perl_regdump my_regdump 37 # define Perl_regprop my_regprop 38 # define Perl_re_intuit_start my_re_intuit_start 39 /* *These* symbols are masked to allow static link. */ 40 # define Perl_pregexec my_pregexec 41 # define Perl_reginitcolors my_reginitcolors 42 # define Perl_regclass_swash my_regclass_swash 43 44 # define PERL_NO_GET_CONTEXT 45 #endif 46 47 /*SUPPRESS 112*/ 48 /* 49 * pregcomp and pregexec -- regsub and regerror are not used in perl 50 * 51 * Copyright (c) 1986 by University of Toronto. 52 * Written by Henry Spencer. Not derived from licensed software. 53 * 54 * Permission is granted to anyone to use this software for any 55 * purpose on any computer system, and to redistribute it freely, 56 * subject to the following restrictions: 57 * 58 * 1. The author is not responsible for the consequences of use of 59 * this software, no matter how awful, even if they arise 60 * from defects in it. 61 * 62 * 2. The origin of this software must not be misrepresented, either 63 * by explicit claim or by omission. 64 * 65 * 3. Altered versions must be plainly marked as such, and must not 66 * be misrepresented as being the original software. 67 * 68 **** Alterations to Henry's code are... 69 **** 70 **** Copyright (c) 1991-2002, Larry Wall 71 **** 72 **** You may distribute under the terms of either the GNU General Public 73 **** License or the Artistic License, as specified in the README file. 74 * 75 * Beware that some of this code is subtly aware of the way operator 76 * precedence is structured in regular expressions. Serious changes in 77 * regular-expression syntax might require a total rethink. 78 */ 79 #include "EXTERN.h" 80 #define PERL_IN_REGEXEC_C 81 #include "perl.h" 82 83 #include "regcomp.h" 84 85 #define RF_tainted 1 /* tainted information used? */ 86 #define RF_warned 2 /* warned about big count? */ 87 #define RF_evaled 4 /* Did an EVAL with setting? */ 88 #define RF_utf8 8 /* String contains multibyte chars? */ 89 90 #define UTF ((PL_reg_flags & RF_utf8) != 0) 91 92 #define RS_init 1 /* eval environment created */ 93 #define RS_set 2 /* replsv value is set */ 94 95 #ifndef STATIC 96 #define STATIC static 97 #endif 98 99 #define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c))) 100 101 /* 102 * Forwards. 103 */ 104 105 #define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv)) 106 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b) 107 108 #define reghop_c(pos,off) ((char*)reghop((U8*)pos, off)) 109 #define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off)) 110 #define HOP(pos,off) (PL_reg_match_utf8 ? reghop((U8*)pos, off) : (U8*)(pos + off)) 111 #define HOPMAYBE(pos,off) (PL_reg_match_utf8 ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off)) 112 #define HOPc(pos,off) ((char*)HOP(pos,off)) 113 #define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off)) 114 115 #define HOPBACK(pos, off) ( \ 116 (PL_reg_match_utf8) \ 117 ? reghopmaybe((U8*)pos, -off) \ 118 : (pos - off >= PL_bostr) \ 119 ? (U8*)(pos - off) \ 120 : (U8*)NULL \ 121 ) 122 #define HOPBACKc(pos, off) (char*)HOPBACK(pos, off) 123 124 #define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim)) 125 #define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim)) 126 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off)) 127 #define HOPMAYBE3(pos,off,lim) (PL_reg_match_utf8 ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off)) 128 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim)) 129 #define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim)) 130 131 #define LOAD_UTF8_CHARCLASS(a,b) STMT_START { if (!CAT2(PL_utf8_,a)) { ENTER; save_re_context(); (void)CAT2(is_utf8_, a)((U8*)b); LEAVE; } } STMT_END 132 133 /* for use after a quantifier and before an EXACT-like node -- japhy */ 134 #define JUMPABLE(rn) ( \ 135 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \ 136 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \ 137 OP(rn) == PLUS || OP(rn) == MINMOD || \ 138 (PL_regkind[(U8)OP(rn)] == CURLY && ARG1(rn) > 0) \ 139 ) 140 141 #define HAS_TEXT(rn) ( \ 142 PL_regkind[(U8)OP(rn)] == EXACT || PL_regkind[(U8)OP(rn)] == REF \ 143 ) 144 145 /* 146 Search for mandatory following text node; for lookahead, the text must 147 follow but for lookbehind (rn->flags != 0) we skip to the next step. 148 */ 149 #define FIND_NEXT_IMPT(rn) STMT_START { \ 150 while (JUMPABLE(rn)) \ 151 if (OP(rn) == SUSPEND || PL_regkind[(U8)OP(rn)] == CURLY) \ 152 rn = NEXTOPER(NEXTOPER(rn)); \ 153 else if (OP(rn) == PLUS) \ 154 rn = NEXTOPER(rn); \ 155 else if (OP(rn) == IFMATCH) \ 156 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \ 157 else rn += NEXT_OFF(rn); \ 158 } STMT_END 159 160 static void restore_pos(pTHX_ void *arg); 161 162 STATIC CHECKPOINT 163 S_regcppush(pTHX_ I32 parenfloor) 164 { 165 int retval = PL_savestack_ix; 166 #define REGCP_PAREN_ELEMS 4 167 int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS; 168 int p; 169 170 if (paren_elems_to_push < 0) 171 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0"); 172 173 #define REGCP_OTHER_ELEMS 6 174 SSCHECK(paren_elems_to_push + REGCP_OTHER_ELEMS); 175 for (p = PL_regsize; p > parenfloor; p--) { 176 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */ 177 SSPUSHINT(PL_regendp[p]); 178 SSPUSHINT(PL_regstartp[p]); 179 SSPUSHPTR(PL_reg_start_tmp[p]); 180 SSPUSHINT(p); 181 } 182 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */ 183 SSPUSHINT(PL_regsize); 184 SSPUSHINT(*PL_reglastparen); 185 SSPUSHINT(*PL_reglastcloseparen); 186 SSPUSHPTR(PL_reginput); 187 #define REGCP_FRAME_ELEMS 2 188 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and 189 * are needed for the regexp context stack bookkeeping. */ 190 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS); 191 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */ 192 193 return retval; 194 } 195 196 /* These are needed since we do not localize EVAL nodes: */ 197 # define REGCP_SET(cp) DEBUG_r(PerlIO_printf(Perl_debug_log, \ 198 " Setting an EVAL scope, savestack=%"IVdf"\n", \ 199 (IV)PL_savestack_ix)); cp = PL_savestack_ix 200 201 # define REGCP_UNWIND(cp) DEBUG_r(cp != PL_savestack_ix ? \ 202 PerlIO_printf(Perl_debug_log, \ 203 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \ 204 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp) 205 206 STATIC char * 207 S_regcppop(pTHX) 208 { 209 I32 i; 210 U32 paren = 0; 211 char *input; 212 I32 tmps; 213 214 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */ 215 i = SSPOPINT; 216 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */ 217 i = SSPOPINT; /* Parentheses elements to pop. */ 218 input = (char *) SSPOPPTR; 219 *PL_reglastcloseparen = SSPOPINT; 220 *PL_reglastparen = SSPOPINT; 221 PL_regsize = SSPOPINT; 222 223 /* Now restore the parentheses context. */ 224 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS); 225 i > 0; i -= REGCP_PAREN_ELEMS) { 226 paren = (U32)SSPOPINT; 227 PL_reg_start_tmp[paren] = (char *) SSPOPPTR; 228 PL_regstartp[paren] = SSPOPINT; 229 tmps = SSPOPINT; 230 if (paren <= *PL_reglastparen) 231 PL_regendp[paren] = tmps; 232 DEBUG_r( 233 PerlIO_printf(Perl_debug_log, 234 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n", 235 (UV)paren, (IV)PL_regstartp[paren], 236 (IV)(PL_reg_start_tmp[paren] - PL_bostr), 237 (IV)PL_regendp[paren], 238 (paren > *PL_reglastparen ? "(no)" : "")); 239 ); 240 } 241 DEBUG_r( 242 if ((I32)(*PL_reglastparen + 1) <= PL_regnpar) { 243 PerlIO_printf(Perl_debug_log, 244 " restoring \\%"IVdf"..\\%"IVdf" to undef\n", 245 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar); 246 } 247 ); 248 #if 1 249 /* It would seem that the similar code in regtry() 250 * already takes care of this, and in fact it is in 251 * a better location to since this code can #if 0-ed out 252 * but the code in regtry() is needed or otherwise tests 253 * requiring null fields (pat.t#187 and split.t#{13,14} 254 * (as of patchlevel 7877) will fail. Then again, 255 * this code seems to be necessary or otherwise 256 * building DynaLoader will fail: 257 * "Error: '*' not in typemap in DynaLoader.xs, line 164" 258 * --jhi */ 259 for (paren = *PL_reglastparen + 1; (I32)paren <= PL_regnpar; paren++) { 260 if ((I32)paren > PL_regsize) 261 PL_regstartp[paren] = -1; 262 PL_regendp[paren] = -1; 263 } 264 #endif 265 return input; 266 } 267 268 STATIC char * 269 S_regcp_set_to(pTHX_ I32 ss) 270 { 271 I32 tmp = PL_savestack_ix; 272 273 PL_savestack_ix = ss; 274 regcppop(); 275 PL_savestack_ix = tmp; 276 return Nullch; 277 } 278 279 typedef struct re_cc_state 280 { 281 I32 ss; 282 regnode *node; 283 struct re_cc_state *prev; 284 CURCUR *cc; 285 regexp *re; 286 } re_cc_state; 287 288 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */ 289 290 #define TRYPAREN(paren, n, input) { \ 291 if (paren) { \ 292 if (n) { \ 293 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \ 294 PL_regendp[paren] = input - PL_bostr; \ 295 } \ 296 else \ 297 PL_regendp[paren] = -1; \ 298 } \ 299 if (regmatch(next)) \ 300 sayYES; \ 301 if (paren && n) \ 302 PL_regendp[paren] = -1; \ 303 } 304 305 306 /* 307 * pregexec and friends 308 */ 309 310 /* 311 - pregexec - match a regexp against a string 312 */ 313 I32 314 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend, 315 char *strbeg, I32 minend, SV *screamer, U32 nosave) 316 /* strend: pointer to null at end of string */ 317 /* strbeg: real beginning of string */ 318 /* minend: end of match must be >=minend after stringarg. */ 319 /* nosave: For optimizations. */ 320 { 321 return 322 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL, 323 nosave ? 0 : REXEC_COPY_STR); 324 } 325 326 STATIC void 327 S_cache_re(pTHX_ regexp *prog) 328 { 329 PL_regprecomp = prog->precomp; /* Needed for FAIL. */ 330 #ifdef DEBUGGING 331 PL_regprogram = prog->program; 332 #endif 333 PL_regnpar = prog->nparens; 334 PL_regdata = prog->data; 335 PL_reg_re = prog; 336 } 337 338 /* 339 * Need to implement the following flags for reg_anch: 340 * 341 * USE_INTUIT_NOML - Useful to call re_intuit_start() first 342 * USE_INTUIT_ML 343 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer 344 * INTUIT_AUTORITATIVE_ML 345 * INTUIT_ONCE_NOML - Intuit can match in one location only. 346 * INTUIT_ONCE_ML 347 * 348 * Another flag for this function: SECOND_TIME (so that float substrs 349 * with giant delta may be not rechecked). 350 */ 351 352 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */ 353 354 /* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend. 355 Otherwise, only SvCUR(sv) is used to get strbeg. */ 356 357 /* XXXX We assume that strpos is strbeg unless sv. */ 358 359 /* XXXX Some places assume that there is a fixed substring. 360 An update may be needed if optimizer marks as "INTUITable" 361 RExen without fixed substrings. Similarly, it is assumed that 362 lengths of all the strings are no more than minlen, thus they 363 cannot come from lookahead. 364 (Or minlen should take into account lookahead.) */ 365 366 /* A failure to find a constant substring means that there is no need to make 367 an expensive call to REx engine, thus we celebrate a failure. Similarly, 368 finding a substring too deep into the string means that less calls to 369 regtry() should be needed. 370 371 REx compiler's optimizer found 4 possible hints: 372 a) Anchored substring; 373 b) Fixed substring; 374 c) Whether we are anchored (beginning-of-line or \G); 375 d) First node (of those at offset 0) which may distingush positions; 376 We use a)b)d) and multiline-part of c), and try to find a position in the 377 string which does not contradict any of them. 378 */ 379 380 /* Most of decisions we do here should have been done at compile time. 381 The nodes of the REx which we used for the search should have been 382 deleted from the finite automaton. */ 383 384 char * 385 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, 386 char *strend, U32 flags, re_scream_pos_data *data) 387 { 388 register I32 start_shift = 0; 389 /* Should be nonnegative! */ 390 register I32 end_shift = 0; 391 register char *s; 392 register SV *check; 393 char *strbeg; 394 char *t; 395 int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */ 396 I32 ml_anch; 397 register char *other_last = Nullch; /* other substr checked before this */ 398 char *check_at = Nullch; /* check substr found at this pos */ 399 #ifdef DEBUGGING 400 char *i_strpos = strpos; 401 SV *dsv = PERL_DEBUG_PAD_ZERO(0); 402 #endif 403 404 if (prog->reganch & ROPT_UTF8) { 405 DEBUG_r(PerlIO_printf(Perl_debug_log, 406 "UTF-8 regex...\n")); 407 PL_reg_flags |= RF_utf8; 408 } 409 410 DEBUG_r({ 411 char *s = PL_reg_match_utf8 ? 412 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) : 413 strpos; 414 int len = PL_reg_match_utf8 ? 415 strlen(s) : strend - strpos; 416 if (!PL_colorset) 417 reginitcolors(); 418 if (PL_reg_match_utf8) 419 DEBUG_r(PerlIO_printf(Perl_debug_log, 420 "UTF-8 target...\n")); 421 PerlIO_printf(Perl_debug_log, 422 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n", 423 PL_colors[4],PL_colors[5],PL_colors[0], 424 prog->precomp, 425 PL_colors[1], 426 (strlen(prog->precomp) > 60 ? "..." : ""), 427 PL_colors[0], 428 (int)(len > 60 ? 60 : len), 429 s, PL_colors[1], 430 (len > 60 ? "..." : "") 431 ); 432 }); 433 434 /* CHR_DIST() would be more correct here but it makes things slow. */ 435 if (prog->minlen > strend - strpos) { 436 DEBUG_r(PerlIO_printf(Perl_debug_log, 437 "String too short... [re_intuit_start]\n")); 438 goto fail; 439 } 440 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos; 441 PL_regeol = strend; 442 if (do_utf8) { 443 if (!prog->check_utf8 && prog->check_substr) 444 to_utf8_substr(prog); 445 check = prog->check_utf8; 446 } else { 447 if (!prog->check_substr && prog->check_utf8) 448 to_byte_substr(prog); 449 check = prog->check_substr; 450 } 451 if (check == &PL_sv_undef) { 452 DEBUG_r(PerlIO_printf(Perl_debug_log, 453 "Non-utf string cannot match utf check string\n")); 454 goto fail; 455 } 456 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */ 457 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE) 458 || ( (prog->reganch & ROPT_ANCH_BOL) 459 && !PL_multiline ) ); /* Check after \n? */ 460 461 if (!ml_anch) { 462 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */ 463 | ROPT_IMPLICIT)) /* not a real BOL */ 464 /* SvCUR is not set on references: SvRV and SvPVX overlap */ 465 && sv && !SvROK(sv) 466 && (strpos != strbeg)) { 467 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n")); 468 goto fail; 469 } 470 if (prog->check_offset_min == prog->check_offset_max && 471 !(prog->reganch & ROPT_CANY_SEEN)) { 472 /* Substring at constant offset from beg-of-str... */ 473 I32 slen; 474 475 s = HOP3c(strpos, prog->check_offset_min, strend); 476 if (SvTAIL(check)) { 477 slen = SvCUR(check); /* >= 1 */ 478 479 if ( strend - s > slen || strend - s < slen - 1 480 || (strend - s == slen && strend[-1] != '\n')) { 481 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n")); 482 goto fail_finish; 483 } 484 /* Now should match s[0..slen-2] */ 485 slen--; 486 if (slen && (*SvPVX(check) != *s 487 || (slen > 1 488 && memNE(SvPVX(check), s, slen)))) { 489 report_neq: 490 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n")); 491 goto fail_finish; 492 } 493 } 494 else if (*SvPVX(check) != *s 495 || ((slen = SvCUR(check)) > 1 496 && memNE(SvPVX(check), s, slen))) 497 goto report_neq; 498 goto success_at_start; 499 } 500 } 501 /* Match is anchored, but substr is not anchored wrt beg-of-str. */ 502 s = strpos; 503 start_shift = prog->check_offset_min; /* okay to underestimate on CC */ 504 end_shift = prog->minlen - start_shift - 505 CHR_SVLEN(check) + (SvTAIL(check) != 0); 506 if (!ml_anch) { 507 I32 end = prog->check_offset_max + CHR_SVLEN(check) 508 - (SvTAIL(check) != 0); 509 I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end; 510 511 if (end_shift < eshift) 512 end_shift = eshift; 513 } 514 } 515 else { /* Can match at random position */ 516 ml_anch = 0; 517 s = strpos; 518 start_shift = prog->check_offset_min; /* okay to underestimate on CC */ 519 /* Should be nonnegative! */ 520 end_shift = prog->minlen - start_shift - 521 CHR_SVLEN(check) + (SvTAIL(check) != 0); 522 } 523 524 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */ 525 if (end_shift < 0) 526 Perl_croak(aTHX_ "panic: end_shift"); 527 #endif 528 529 restart: 530 /* Find a possible match in the region s..strend by looking for 531 the "check" substring in the region corrected by start/end_shift. */ 532 if (flags & REXEC_SCREAM) { 533 I32 p = -1; /* Internal iterator of scream. */ 534 I32 *pp = data ? data->scream_pos : &p; 535 536 if (PL_screamfirst[BmRARE(check)] >= 0 537 || ( BmRARE(check) == '\n' 538 && (BmPREVIOUS(check) == SvCUR(check) - 1) 539 && SvTAIL(check) )) 540 s = screaminstr(sv, check, 541 start_shift + (s - strbeg), end_shift, pp, 0); 542 else 543 goto fail_finish; 544 if (data) 545 *data->scream_olds = s; 546 } 547 else if (prog->reganch & ROPT_CANY_SEEN) 548 s = fbm_instr((U8*)(s + start_shift), 549 (U8*)(strend - end_shift), 550 check, PL_multiline ? FBMrf_MULTILINE : 0); 551 else 552 s = fbm_instr(HOP3(s, start_shift, strend), 553 HOP3(strend, -end_shift, strbeg), 554 check, PL_multiline ? FBMrf_MULTILINE : 0); 555 556 /* Update the count-of-usability, remove useless subpatterns, 557 unshift s. */ 558 559 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s", 560 (s ? "Found" : "Did not find"), 561 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"), 562 PL_colors[0], 563 (int)(SvCUR(check) - (SvTAIL(check)!=0)), 564 SvPVX(check), 565 PL_colors[1], (SvTAIL(check) ? "$" : ""), 566 (s ? " at offset " : "...\n") ) ); 567 568 if (!s) 569 goto fail_finish; 570 571 check_at = s; 572 573 /* Finish the diagnostic message */ 574 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) ); 575 576 /* Got a candidate. Check MBOL anchoring, and the *other* substr. 577 Start with the other substr. 578 XXXX no SCREAM optimization yet - and a very coarse implementation 579 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will 580 *always* match. Probably should be marked during compile... 581 Probably it is right to do no SCREAM here... 582 */ 583 584 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) { 585 /* Take into account the "other" substring. */ 586 /* XXXX May be hopelessly wrong for UTF... */ 587 if (!other_last) 588 other_last = strpos; 589 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) { 590 do_other_anchored: 591 { 592 char *last = HOP3c(s, -start_shift, strbeg), *last1, *last2; 593 char *s1 = s; 594 SV* must; 595 596 t = s - prog->check_offset_max; 597 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */ 598 && (!(prog->reganch & ROPT_UTF8) 599 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos)) 600 && t > strpos))) 601 /* EMPTY */; 602 else 603 t = strpos; 604 t = HOP3c(t, prog->anchored_offset, strend); 605 if (t < other_last) /* These positions already checked */ 606 t = other_last; 607 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg); 608 if (last < last1) 609 last1 = last; 610 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */ 611 /* On end-of-str: see comment below. */ 612 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr; 613 if (must == &PL_sv_undef) { 614 s = (char*)NULL; 615 DEBUG_r(must = prog->anchored_utf8); /* for debug */ 616 } 617 else 618 s = fbm_instr( 619 (unsigned char*)t, 620 HOP3(HOP3(last1, prog->anchored_offset, strend) 621 + SvCUR(must), -(SvTAIL(must)!=0), strbeg), 622 must, 623 PL_multiline ? FBMrf_MULTILINE : 0 624 ); 625 DEBUG_r(PerlIO_printf(Perl_debug_log, 626 "%s anchored substr `%s%.*s%s'%s", 627 (s ? "Found" : "Contradicts"), 628 PL_colors[0], 629 (int)(SvCUR(must) 630 - (SvTAIL(must)!=0)), 631 SvPVX(must), 632 PL_colors[1], (SvTAIL(must) ? "$" : ""))); 633 if (!s) { 634 if (last1 >= last2) { 635 DEBUG_r(PerlIO_printf(Perl_debug_log, 636 ", giving up...\n")); 637 goto fail_finish; 638 } 639 DEBUG_r(PerlIO_printf(Perl_debug_log, 640 ", trying floating at offset %ld...\n", 641 (long)(HOP3c(s1, 1, strend) - i_strpos))); 642 other_last = HOP3c(last1, prog->anchored_offset+1, strend); 643 s = HOP3c(last, 1, strend); 644 goto restart; 645 } 646 else { 647 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n", 648 (long)(s - i_strpos))); 649 t = HOP3c(s, -prog->anchored_offset, strbeg); 650 other_last = HOP3c(s, 1, strend); 651 s = s1; 652 if (t == strpos) 653 goto try_at_start; 654 goto try_at_offset; 655 } 656 } 657 } 658 else { /* Take into account the floating substring. */ 659 char *last, *last1; 660 char *s1 = s; 661 SV* must; 662 663 t = HOP3c(s, -start_shift, strbeg); 664 last1 = last = 665 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg); 666 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset) 667 last = HOP3c(t, prog->float_max_offset, strend); 668 s = HOP3c(t, prog->float_min_offset, strend); 669 if (s < other_last) 670 s = other_last; 671 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */ 672 must = do_utf8 ? prog->float_utf8 : prog->float_substr; 673 /* fbm_instr() takes into account exact value of end-of-str 674 if the check is SvTAIL(ed). Since false positives are OK, 675 and end-of-str is not later than strend we are OK. */ 676 if (must == &PL_sv_undef) { 677 s = (char*)NULL; 678 DEBUG_r(must = prog->float_utf8); /* for debug message */ 679 } 680 else 681 s = fbm_instr((unsigned char*)s, 682 (unsigned char*)last + SvCUR(must) 683 - (SvTAIL(must)!=0), 684 must, PL_multiline ? FBMrf_MULTILINE : 0); 685 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s", 686 (s ? "Found" : "Contradicts"), 687 PL_colors[0], 688 (int)(SvCUR(must) - (SvTAIL(must)!=0)), 689 SvPVX(must), 690 PL_colors[1], (SvTAIL(must) ? "$" : ""))); 691 if (!s) { 692 if (last1 == last) { 693 DEBUG_r(PerlIO_printf(Perl_debug_log, 694 ", giving up...\n")); 695 goto fail_finish; 696 } 697 DEBUG_r(PerlIO_printf(Perl_debug_log, 698 ", trying anchored starting at offset %ld...\n", 699 (long)(s1 + 1 - i_strpos))); 700 other_last = last; 701 s = HOP3c(t, 1, strend); 702 goto restart; 703 } 704 else { 705 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n", 706 (long)(s - i_strpos))); 707 other_last = s; /* Fix this later. --Hugo */ 708 s = s1; 709 if (t == strpos) 710 goto try_at_start; 711 goto try_at_offset; 712 } 713 } 714 } 715 716 t = s - prog->check_offset_max; 717 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */ 718 && (!(prog->reganch & ROPT_UTF8) 719 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos)) 720 && t > strpos))) { 721 /* Fixed substring is found far enough so that the match 722 cannot start at strpos. */ 723 try_at_offset: 724 if (ml_anch && t[-1] != '\n') { 725 /* Eventually fbm_*() should handle this, but often 726 anchored_offset is not 0, so this check will not be wasted. */ 727 /* XXXX In the code below we prefer to look for "^" even in 728 presence of anchored substrings. And we search even 729 beyond the found float position. These pessimizations 730 are historical artefacts only. */ 731 find_anchor: 732 while (t < strend - prog->minlen) { 733 if (*t == '\n') { 734 if (t < check_at - prog->check_offset_min) { 735 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) { 736 /* Since we moved from the found position, 737 we definitely contradict the found anchored 738 substr. Due to the above check we do not 739 contradict "check" substr. 740 Thus we can arrive here only if check substr 741 is float. Redo checking for "other"=="fixed". 742 */ 743 strpos = t + 1; 744 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n", 745 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset))); 746 goto do_other_anchored; 747 } 748 /* We don't contradict the found floating substring. */ 749 /* XXXX Why not check for STCLASS? */ 750 s = t + 1; 751 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n", 752 PL_colors[0],PL_colors[1], (long)(s - i_strpos))); 753 goto set_useful; 754 } 755 /* Position contradicts check-string */ 756 /* XXXX probably better to look for check-string 757 than for "\n", so one should lower the limit for t? */ 758 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n", 759 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos))); 760 other_last = strpos = s = t + 1; 761 goto restart; 762 } 763 t++; 764 } 765 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n", 766 PL_colors[0],PL_colors[1])); 767 goto fail_finish; 768 } 769 else { 770 DEBUG_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n", 771 PL_colors[0],PL_colors[1])); 772 } 773 s = t; 774 set_useful: 775 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */ 776 } 777 else { 778 /* The found string does not prohibit matching at strpos, 779 - no optimization of calling REx engine can be performed, 780 unless it was an MBOL and we are not after MBOL, 781 or a future STCLASS check will fail this. */ 782 try_at_start: 783 /* Even in this situation we may use MBOL flag if strpos is offset 784 wrt the start of the string. */ 785 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */ 786 && (strpos != strbeg) && strpos[-1] != '\n' 787 /* May be due to an implicit anchor of m{.*foo} */ 788 && !(prog->reganch & ROPT_IMPLICIT)) 789 { 790 t = strpos; 791 goto find_anchor; 792 } 793 DEBUG_r( if (ml_anch) 794 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n", 795 (long)(strpos - i_strpos), PL_colors[0],PL_colors[1]); 796 ); 797 success_at_start: 798 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */ 799 && (do_utf8 ? ( 800 prog->check_utf8 /* Could be deleted already */ 801 && --BmUSEFUL(prog->check_utf8) < 0 802 && (prog->check_utf8 == prog->float_utf8) 803 ) : ( 804 prog->check_substr /* Could be deleted already */ 805 && --BmUSEFUL(prog->check_substr) < 0 806 && (prog->check_substr == prog->float_substr) 807 ))) 808 { 809 /* If flags & SOMETHING - do not do it many times on the same match */ 810 DEBUG_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n")); 811 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr); 812 if (do_utf8 ? prog->check_substr : prog->check_utf8) 813 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8); 814 prog->check_substr = prog->check_utf8 = Nullsv; /* disable */ 815 prog->float_substr = prog->float_utf8 = Nullsv; /* clear */ 816 check = Nullsv; /* abort */ 817 s = strpos; 818 /* XXXX This is a remnant of the old implementation. It 819 looks wasteful, since now INTUIT can use many 820 other heuristics. */ 821 prog->reganch &= ~RE_USE_INTUIT; 822 } 823 else 824 s = strpos; 825 } 826 827 /* Last resort... */ 828 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */ 829 if (prog->regstclass) { 830 /* minlen == 0 is possible if regstclass is \b or \B, 831 and the fixed substr is ''$. 832 Since minlen is already taken into account, s+1 is before strend; 833 accidentally, minlen >= 1 guaranties no false positives at s + 1 834 even for \b or \B. But (minlen? 1 : 0) below assumes that 835 regstclass does not come from lookahead... */ 836 /* If regstclass takes bytelength more than 1: If charlength==1, OK. 837 This leaves EXACTF only, which is dealt with in find_byclass(). */ 838 U8* str = (U8*)STRING(prog->regstclass); 839 int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT 840 ? CHR_DIST(str+STR_LEN(prog->regstclass), str) 841 : 1); 842 char *endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch) 843 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend) 844 : (prog->float_substr || prog->float_utf8 845 ? HOP3c(HOP3c(check_at, -start_shift, strbeg), 846 cl_l, strend) 847 : strend); 848 char *startpos = strbeg; 849 850 t = s; 851 if (prog->reganch & ROPT_UTF8) { 852 PL_regdata = prog->data; 853 PL_bostr = startpos; 854 } 855 s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1); 856 if (!s) { 857 #ifdef DEBUGGING 858 char *what = 0; 859 #endif 860 if (endpos == strend) { 861 DEBUG_r( PerlIO_printf(Perl_debug_log, 862 "Could not match STCLASS...\n") ); 863 goto fail; 864 } 865 DEBUG_r( PerlIO_printf(Perl_debug_log, 866 "This position contradicts STCLASS...\n") ); 867 if ((prog->reganch & ROPT_ANCH) && !ml_anch) 868 goto fail; 869 /* Contradict one of substrings */ 870 if (prog->anchored_substr || prog->anchored_utf8) { 871 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) { 872 DEBUG_r( what = "anchored" ); 873 hop_and_restart: 874 s = HOP3c(t, 1, strend); 875 if (s + start_shift + end_shift > strend) { 876 /* XXXX Should be taken into account earlier? */ 877 DEBUG_r( PerlIO_printf(Perl_debug_log, 878 "Could not match STCLASS...\n") ); 879 goto fail; 880 } 881 if (!check) 882 goto giveup; 883 DEBUG_r( PerlIO_printf(Perl_debug_log, 884 "Looking for %s substr starting at offset %ld...\n", 885 what, (long)(s + start_shift - i_strpos)) ); 886 goto restart; 887 } 888 /* Have both, check_string is floating */ 889 if (t + start_shift >= check_at) /* Contradicts floating=check */ 890 goto retry_floating_check; 891 /* Recheck anchored substring, but not floating... */ 892 s = check_at; 893 if (!check) 894 goto giveup; 895 DEBUG_r( PerlIO_printf(Perl_debug_log, 896 "Looking for anchored substr starting at offset %ld...\n", 897 (long)(other_last - i_strpos)) ); 898 goto do_other_anchored; 899 } 900 /* Another way we could have checked stclass at the 901 current position only: */ 902 if (ml_anch) { 903 s = t = t + 1; 904 if (!check) 905 goto giveup; 906 DEBUG_r( PerlIO_printf(Perl_debug_log, 907 "Looking for /%s^%s/m starting at offset %ld...\n", 908 PL_colors[0],PL_colors[1], (long)(t - i_strpos)) ); 909 goto try_at_offset; 910 } 911 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */ 912 goto fail; 913 /* Check is floating subtring. */ 914 retry_floating_check: 915 t = check_at - start_shift; 916 DEBUG_r( what = "floating" ); 917 goto hop_and_restart; 918 } 919 if (t != s) { 920 DEBUG_r(PerlIO_printf(Perl_debug_log, 921 "By STCLASS: moving %ld --> %ld\n", 922 (long)(t - i_strpos), (long)(s - i_strpos)) 923 ); 924 } 925 else { 926 DEBUG_r(PerlIO_printf(Perl_debug_log, 927 "Does not contradict STCLASS...\n"); 928 ); 929 } 930 } 931 giveup: 932 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n", 933 PL_colors[4], (check ? "Guessed" : "Giving up"), 934 PL_colors[5], (long)(s - i_strpos)) ); 935 return s; 936 937 fail_finish: /* Substring not found */ 938 if (prog->check_substr || prog->check_utf8) /* could be removed already */ 939 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */ 940 fail: 941 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n", 942 PL_colors[4],PL_colors[5])); 943 return Nullch; 944 } 945 946 /* We know what class REx starts with. Try to find this position... */ 947 STATIC char * 948 S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun) 949 { 950 I32 doevery = (prog->reganch & ROPT_SKIP) == 0; 951 char *m; 952 STRLEN ln; 953 unsigned int c1; 954 unsigned int c2; 955 char *e; 956 register I32 tmp = 1; /* Scratch variable? */ 957 register bool do_utf8 = PL_reg_match_utf8; 958 959 /* We know what class it must start with. */ 960 switch (OP(c)) { 961 case ANYOF: 962 while (s < strend) { 963 STRLEN skip = do_utf8 ? UTF8SKIP(s) : 1; 964 965 if (do_utf8 ? 966 reginclass(c, (U8*)s, 0, do_utf8) : 967 REGINCLASS(c, (U8*)s) || 968 (ANYOF_FOLD_SHARP_S(c, s, strend) && 969 /* The assignment of 2 is intentional: 970 * for the sharp s, the skip is 2. */ 971 (skip = SHARP_S_SKIP) 972 )) { 973 if (tmp && (norun || regtry(prog, s))) 974 goto got_it; 975 else 976 tmp = doevery; 977 } 978 else 979 tmp = 1; 980 s += skip; 981 } 982 break; 983 case CANY: 984 while (s < strend) { 985 if (tmp && (norun || regtry(prog, s))) 986 goto got_it; 987 else 988 tmp = doevery; 989 s++; 990 } 991 break; 992 case EXACTF: 993 m = STRING(c); 994 ln = STR_LEN(c); 995 if (UTF) { 996 STRLEN ulen1, ulen2; 997 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1]; 998 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1]; 999 1000 to_utf8_lower((U8*)m, tmpbuf1, &ulen1); 1001 to_utf8_upper((U8*)m, tmpbuf2, &ulen2); 1002 1003 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN_UCLC, 1004 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); 1005 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN_UCLC, 1006 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); 1007 } 1008 else { 1009 c1 = *(U8*)m; 1010 c2 = PL_fold[c1]; 1011 } 1012 goto do_exactf; 1013 case EXACTFL: 1014 m = STRING(c); 1015 ln = STR_LEN(c); 1016 c1 = *(U8*)m; 1017 c2 = PL_fold_locale[c1]; 1018 do_exactf: 1019 e = HOP3c(strend, -(I32)ln, s); 1020 1021 if (norun && e < s) 1022 e = s; /* Due to minlen logic of intuit() */ 1023 1024 /* The idea in the EXACTF* cases is to first find the 1025 * first character of the EXACTF* node and then, if 1026 * necessary, case-insensitively compare the full 1027 * text of the node. The c1 and c2 are the first 1028 * characters (though in Unicode it gets a bit 1029 * more complicated because there are more cases 1030 * than just upper and lower: one needs to use 1031 * the so-called folding case for case-insensitive 1032 * matching (called "loose matching" in Unicode). 1033 * ibcmp_utf8() will do just that. */ 1034 1035 if (do_utf8) { 1036 UV c, f; 1037 U8 tmpbuf [UTF8_MAXLEN+1]; 1038 U8 foldbuf[UTF8_MAXLEN_FOLD+1]; 1039 STRLEN len, foldlen; 1040 1041 if (c1 == c2) { 1042 while (s <= e) { 1043 c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len, 1044 ckWARN(WARN_UTF8) ? 1045 0 : UTF8_ALLOW_ANY); 1046 if ( c == c1 1047 && (ln == len || 1048 ibcmp_utf8(s, (char **)0, 0, do_utf8, 1049 m, (char **)0, ln, (bool)UTF)) 1050 && (norun || regtry(prog, s)) ) 1051 goto got_it; 1052 else { 1053 uvchr_to_utf8(tmpbuf, c); 1054 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); 1055 if ( f != c 1056 && (f == c1 || f == c2) 1057 && (ln == foldlen || 1058 !ibcmp_utf8((char *) foldbuf, 1059 (char **)0, foldlen, do_utf8, 1060 m, 1061 (char **)0, ln, (bool)UTF)) 1062 && (norun || regtry(prog, s)) ) 1063 goto got_it; 1064 } 1065 s += len; 1066 } 1067 } 1068 else { 1069 while (s <= e) { 1070 c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len, 1071 ckWARN(WARN_UTF8) ? 1072 0 : UTF8_ALLOW_ANY); 1073 1074 /* Handle some of the three Greek sigmas cases. 1075 * Note that not all the possible combinations 1076 * are handled here: some of them are handled 1077 * by the standard folding rules, and some of 1078 * them (the character class or ANYOF cases) 1079 * are handled during compiletime in 1080 * regexec.c:S_regclass(). */ 1081 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA || 1082 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) 1083 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA; 1084 1085 if ( (c == c1 || c == c2) 1086 && (ln == len || 1087 ibcmp_utf8(s, (char **)0, 0, do_utf8, 1088 m, (char **)0, ln, (bool)UTF)) 1089 && (norun || regtry(prog, s)) ) 1090 goto got_it; 1091 else { 1092 uvchr_to_utf8(tmpbuf, c); 1093 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); 1094 if ( f != c 1095 && (f == c1 || f == c2) 1096 && (ln == foldlen || 1097 !ibcmp_utf8((char *) foldbuf, 1098 (char **)0, foldlen, do_utf8, 1099 m, 1100 (char **)0, ln, (bool)UTF)) 1101 && (norun || regtry(prog, s)) ) 1102 goto got_it; 1103 } 1104 s += len; 1105 } 1106 } 1107 } 1108 else { 1109 if (c1 == c2) 1110 while (s <= e) { 1111 if ( *(U8*)s == c1 1112 && (ln == 1 || !(OP(c) == EXACTF 1113 ? ibcmp(s, m, ln) 1114 : ibcmp_locale(s, m, ln))) 1115 && (norun || regtry(prog, s)) ) 1116 goto got_it; 1117 s++; 1118 } 1119 else 1120 while (s <= e) { 1121 if ( (*(U8*)s == c1 || *(U8*)s == c2) 1122 && (ln == 1 || !(OP(c) == EXACTF 1123 ? ibcmp(s, m, ln) 1124 : ibcmp_locale(s, m, ln))) 1125 && (norun || regtry(prog, s)) ) 1126 goto got_it; 1127 s++; 1128 } 1129 } 1130 break; 1131 case BOUNDL: 1132 PL_reg_flags |= RF_tainted; 1133 /* FALL THROUGH */ 1134 case BOUND: 1135 if (do_utf8) { 1136 if (s == PL_bostr) 1137 tmp = '\n'; 1138 else { 1139 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr); 1140 1141 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0); 1142 } 1143 tmp = ((OP(c) == BOUND ? 1144 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0); 1145 LOAD_UTF8_CHARCLASS(alnum,"a"); 1146 while (s < strend) { 1147 if (tmp == !(OP(c) == BOUND ? 1148 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) : 1149 isALNUM_LC_utf8((U8*)s))) 1150 { 1151 tmp = !tmp; 1152 if ((norun || regtry(prog, s))) 1153 goto got_it; 1154 } 1155 s += UTF8SKIP(s); 1156 } 1157 } 1158 else { 1159 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; 1160 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0); 1161 while (s < strend) { 1162 if (tmp == 1163 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) { 1164 tmp = !tmp; 1165 if ((norun || regtry(prog, s))) 1166 goto got_it; 1167 } 1168 s++; 1169 } 1170 } 1171 if ((!prog->minlen && tmp) && (norun || regtry(prog, s))) 1172 goto got_it; 1173 break; 1174 case NBOUNDL: 1175 PL_reg_flags |= RF_tainted; 1176 /* FALL THROUGH */ 1177 case NBOUND: 1178 if (do_utf8) { 1179 if (s == PL_bostr) 1180 tmp = '\n'; 1181 else { 1182 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr); 1183 1184 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0); 1185 } 1186 tmp = ((OP(c) == NBOUND ? 1187 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0); 1188 LOAD_UTF8_CHARCLASS(alnum,"a"); 1189 while (s < strend) { 1190 if (tmp == !(OP(c) == NBOUND ? 1191 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) : 1192 isALNUM_LC_utf8((U8*)s))) 1193 tmp = !tmp; 1194 else if ((norun || regtry(prog, s))) 1195 goto got_it; 1196 s += UTF8SKIP(s); 1197 } 1198 } 1199 else { 1200 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; 1201 tmp = ((OP(c) == NBOUND ? 1202 isALNUM(tmp) : isALNUM_LC(tmp)) != 0); 1203 while (s < strend) { 1204 if (tmp == 1205 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s))) 1206 tmp = !tmp; 1207 else if ((norun || regtry(prog, s))) 1208 goto got_it; 1209 s++; 1210 } 1211 } 1212 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s))) 1213 goto got_it; 1214 break; 1215 case ALNUM: 1216 if (do_utf8) { 1217 LOAD_UTF8_CHARCLASS(alnum,"a"); 1218 while (s < strend) { 1219 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) { 1220 if (tmp && (norun || regtry(prog, s))) 1221 goto got_it; 1222 else 1223 tmp = doevery; 1224 } 1225 else 1226 tmp = 1; 1227 s += UTF8SKIP(s); 1228 } 1229 } 1230 else { 1231 while (s < strend) { 1232 if (isALNUM(*s)) { 1233 if (tmp && (norun || regtry(prog, s))) 1234 goto got_it; 1235 else 1236 tmp = doevery; 1237 } 1238 else 1239 tmp = 1; 1240 s++; 1241 } 1242 } 1243 break; 1244 case ALNUML: 1245 PL_reg_flags |= RF_tainted; 1246 if (do_utf8) { 1247 while (s < strend) { 1248 if (isALNUM_LC_utf8((U8*)s)) { 1249 if (tmp && (norun || regtry(prog, s))) 1250 goto got_it; 1251 else 1252 tmp = doevery; 1253 } 1254 else 1255 tmp = 1; 1256 s += UTF8SKIP(s); 1257 } 1258 } 1259 else { 1260 while (s < strend) { 1261 if (isALNUM_LC(*s)) { 1262 if (tmp && (norun || regtry(prog, s))) 1263 goto got_it; 1264 else 1265 tmp = doevery; 1266 } 1267 else 1268 tmp = 1; 1269 s++; 1270 } 1271 } 1272 break; 1273 case NALNUM: 1274 if (do_utf8) { 1275 LOAD_UTF8_CHARCLASS(alnum,"a"); 1276 while (s < strend) { 1277 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) { 1278 if (tmp && (norun || regtry(prog, s))) 1279 goto got_it; 1280 else 1281 tmp = doevery; 1282 } 1283 else 1284 tmp = 1; 1285 s += UTF8SKIP(s); 1286 } 1287 } 1288 else { 1289 while (s < strend) { 1290 if (!isALNUM(*s)) { 1291 if (tmp && (norun || regtry(prog, s))) 1292 goto got_it; 1293 else 1294 tmp = doevery; 1295 } 1296 else 1297 tmp = 1; 1298 s++; 1299 } 1300 } 1301 break; 1302 case NALNUML: 1303 PL_reg_flags |= RF_tainted; 1304 if (do_utf8) { 1305 while (s < strend) { 1306 if (!isALNUM_LC_utf8((U8*)s)) { 1307 if (tmp && (norun || regtry(prog, s))) 1308 goto got_it; 1309 else 1310 tmp = doevery; 1311 } 1312 else 1313 tmp = 1; 1314 s += UTF8SKIP(s); 1315 } 1316 } 1317 else { 1318 while (s < strend) { 1319 if (!isALNUM_LC(*s)) { 1320 if (tmp && (norun || regtry(prog, s))) 1321 goto got_it; 1322 else 1323 tmp = doevery; 1324 } 1325 else 1326 tmp = 1; 1327 s++; 1328 } 1329 } 1330 break; 1331 case SPACE: 1332 if (do_utf8) { 1333 LOAD_UTF8_CHARCLASS(space," "); 1334 while (s < strend) { 1335 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) { 1336 if (tmp && (norun || regtry(prog, s))) 1337 goto got_it; 1338 else 1339 tmp = doevery; 1340 } 1341 else 1342 tmp = 1; 1343 s += UTF8SKIP(s); 1344 } 1345 } 1346 else { 1347 while (s < strend) { 1348 if (isSPACE(*s)) { 1349 if (tmp && (norun || regtry(prog, s))) 1350 goto got_it; 1351 else 1352 tmp = doevery; 1353 } 1354 else 1355 tmp = 1; 1356 s++; 1357 } 1358 } 1359 break; 1360 case SPACEL: 1361 PL_reg_flags |= RF_tainted; 1362 if (do_utf8) { 1363 while (s < strend) { 1364 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) { 1365 if (tmp && (norun || regtry(prog, s))) 1366 goto got_it; 1367 else 1368 tmp = doevery; 1369 } 1370 else 1371 tmp = 1; 1372 s += UTF8SKIP(s); 1373 } 1374 } 1375 else { 1376 while (s < strend) { 1377 if (isSPACE_LC(*s)) { 1378 if (tmp && (norun || regtry(prog, s))) 1379 goto got_it; 1380 else 1381 tmp = doevery; 1382 } 1383 else 1384 tmp = 1; 1385 s++; 1386 } 1387 } 1388 break; 1389 case NSPACE: 1390 if (do_utf8) { 1391 LOAD_UTF8_CHARCLASS(space," "); 1392 while (s < strend) { 1393 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) { 1394 if (tmp && (norun || regtry(prog, s))) 1395 goto got_it; 1396 else 1397 tmp = doevery; 1398 } 1399 else 1400 tmp = 1; 1401 s += UTF8SKIP(s); 1402 } 1403 } 1404 else { 1405 while (s < strend) { 1406 if (!isSPACE(*s)) { 1407 if (tmp && (norun || regtry(prog, s))) 1408 goto got_it; 1409 else 1410 tmp = doevery; 1411 } 1412 else 1413 tmp = 1; 1414 s++; 1415 } 1416 } 1417 break; 1418 case NSPACEL: 1419 PL_reg_flags |= RF_tainted; 1420 if (do_utf8) { 1421 while (s < strend) { 1422 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) { 1423 if (tmp && (norun || regtry(prog, s))) 1424 goto got_it; 1425 else 1426 tmp = doevery; 1427 } 1428 else 1429 tmp = 1; 1430 s += UTF8SKIP(s); 1431 } 1432 } 1433 else { 1434 while (s < strend) { 1435 if (!isSPACE_LC(*s)) { 1436 if (tmp && (norun || regtry(prog, s))) 1437 goto got_it; 1438 else 1439 tmp = doevery; 1440 } 1441 else 1442 tmp = 1; 1443 s++; 1444 } 1445 } 1446 break; 1447 case DIGIT: 1448 if (do_utf8) { 1449 LOAD_UTF8_CHARCLASS(digit,"0"); 1450 while (s < strend) { 1451 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) { 1452 if (tmp && (norun || regtry(prog, s))) 1453 goto got_it; 1454 else 1455 tmp = doevery; 1456 } 1457 else 1458 tmp = 1; 1459 s += UTF8SKIP(s); 1460 } 1461 } 1462 else { 1463 while (s < strend) { 1464 if (isDIGIT(*s)) { 1465 if (tmp && (norun || regtry(prog, s))) 1466 goto got_it; 1467 else 1468 tmp = doevery; 1469 } 1470 else 1471 tmp = 1; 1472 s++; 1473 } 1474 } 1475 break; 1476 case DIGITL: 1477 PL_reg_flags |= RF_tainted; 1478 if (do_utf8) { 1479 while (s < strend) { 1480 if (isDIGIT_LC_utf8((U8*)s)) { 1481 if (tmp && (norun || regtry(prog, s))) 1482 goto got_it; 1483 else 1484 tmp = doevery; 1485 } 1486 else 1487 tmp = 1; 1488 s += UTF8SKIP(s); 1489 } 1490 } 1491 else { 1492 while (s < strend) { 1493 if (isDIGIT_LC(*s)) { 1494 if (tmp && (norun || regtry(prog, s))) 1495 goto got_it; 1496 else 1497 tmp = doevery; 1498 } 1499 else 1500 tmp = 1; 1501 s++; 1502 } 1503 } 1504 break; 1505 case NDIGIT: 1506 if (do_utf8) { 1507 LOAD_UTF8_CHARCLASS(digit,"0"); 1508 while (s < strend) { 1509 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) { 1510 if (tmp && (norun || regtry(prog, s))) 1511 goto got_it; 1512 else 1513 tmp = doevery; 1514 } 1515 else 1516 tmp = 1; 1517 s += UTF8SKIP(s); 1518 } 1519 } 1520 else { 1521 while (s < strend) { 1522 if (!isDIGIT(*s)) { 1523 if (tmp && (norun || regtry(prog, s))) 1524 goto got_it; 1525 else 1526 tmp = doevery; 1527 } 1528 else 1529 tmp = 1; 1530 s++; 1531 } 1532 } 1533 break; 1534 case NDIGITL: 1535 PL_reg_flags |= RF_tainted; 1536 if (do_utf8) { 1537 while (s < strend) { 1538 if (!isDIGIT_LC_utf8((U8*)s)) { 1539 if (tmp && (norun || regtry(prog, s))) 1540 goto got_it; 1541 else 1542 tmp = doevery; 1543 } 1544 else 1545 tmp = 1; 1546 s += UTF8SKIP(s); 1547 } 1548 } 1549 else { 1550 while (s < strend) { 1551 if (!isDIGIT_LC(*s)) { 1552 if (tmp && (norun || regtry(prog, s))) 1553 goto got_it; 1554 else 1555 tmp = doevery; 1556 } 1557 else 1558 tmp = 1; 1559 s++; 1560 } 1561 } 1562 break; 1563 default: 1564 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c)); 1565 break; 1566 } 1567 return 0; 1568 got_it: 1569 return s; 1570 } 1571 1572 /* 1573 - regexec_flags - match a regexp against a string 1574 */ 1575 I32 1576 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend, 1577 char *strbeg, I32 minend, SV *sv, void *data, U32 flags) 1578 /* strend: pointer to null at end of string */ 1579 /* strbeg: real beginning of string */ 1580 /* minend: end of match must be >=minend after stringarg. */ 1581 /* data: May be used for some additional optimizations. */ 1582 /* nosave: For optimizations. */ 1583 { 1584 register char *s; 1585 register regnode *c; 1586 register char *startpos = stringarg; 1587 I32 minlen; /* must match at least this many chars */ 1588 I32 dontbother = 0; /* how many characters not to try at end */ 1589 /* I32 start_shift = 0; */ /* Offset of the start to find 1590 constant substr. */ /* CC */ 1591 I32 end_shift = 0; /* Same for the end. */ /* CC */ 1592 I32 scream_pos = -1; /* Internal iterator of scream. */ 1593 char *scream_olds; 1594 SV* oreplsv = GvSV(PL_replgv); 1595 bool do_utf8 = DO_UTF8(sv); 1596 #ifdef DEBUGGING 1597 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0); 1598 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1); 1599 #endif 1600 1601 PL_regcc = 0; 1602 1603 cache_re(prog); 1604 #ifdef DEBUGGING 1605 PL_regnarrate = DEBUG_r_TEST; 1606 #endif 1607 1608 /* Be paranoid... */ 1609 if (prog == NULL || startpos == NULL) { 1610 Perl_croak(aTHX_ "NULL regexp parameter"); 1611 return 0; 1612 } 1613 1614 minlen = prog->minlen; 1615 if (strend - startpos < minlen) { 1616 DEBUG_r(PerlIO_printf(Perl_debug_log, 1617 "String too short [regexec_flags]...\n")); 1618 goto phooey; 1619 } 1620 1621 /* Check validity of program. */ 1622 if (UCHARAT(prog->program) != REG_MAGIC) { 1623 Perl_croak(aTHX_ "corrupted regexp program"); 1624 } 1625 1626 PL_reg_flags = 0; 1627 PL_reg_eval_set = 0; 1628 PL_reg_maxiter = 0; 1629 1630 if (prog->reganch & ROPT_UTF8) 1631 PL_reg_flags |= RF_utf8; 1632 1633 /* Mark beginning of line for ^ and lookbehind. */ 1634 PL_regbol = startpos; 1635 PL_bostr = strbeg; 1636 PL_reg_sv = sv; 1637 1638 /* Mark end of line for $ (and such) */ 1639 PL_regeol = strend; 1640 1641 /* see how far we have to get to not match where we matched before */ 1642 PL_regtill = startpos+minend; 1643 1644 /* We start without call_cc context. */ 1645 PL_reg_call_cc = 0; 1646 1647 /* If there is a "must appear" string, look for it. */ 1648 s = startpos; 1649 1650 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */ 1651 MAGIC *mg; 1652 1653 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */ 1654 PL_reg_ganch = startpos; 1655 else if (sv && SvTYPE(sv) >= SVt_PVMG 1656 && SvMAGIC(sv) 1657 && (mg = mg_find(sv, PERL_MAGIC_regex_global)) 1658 && mg->mg_len >= 0) { 1659 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */ 1660 if (prog->reganch & ROPT_ANCH_GPOS) { 1661 if (s > PL_reg_ganch) 1662 goto phooey; 1663 s = PL_reg_ganch; 1664 } 1665 } 1666 else /* pos() not defined */ 1667 PL_reg_ganch = strbeg; 1668 } 1669 1670 if (!(flags & REXEC_CHECKED) && (prog->check_substr != Nullsv || prog->check_utf8 != Nullsv)) { 1671 re_scream_pos_data d; 1672 1673 d.scream_olds = &scream_olds; 1674 d.scream_pos = &scream_pos; 1675 s = re_intuit_start(prog, sv, s, strend, flags, &d); 1676 if (!s) { 1677 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not present...\n")); 1678 goto phooey; /* not present */ 1679 } 1680 } 1681 1682 DEBUG_r({ 1683 char *s0 = UTF ? 1684 pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60, 1685 UNI_DISPLAY_REGEX) : 1686 prog->precomp; 1687 int len0 = UTF ? SvCUR(dsv0) : prog->prelen; 1688 char *s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60, 1689 UNI_DISPLAY_REGEX) : startpos; 1690 int len1 = do_utf8 ? SvCUR(dsv1) : strend - startpos; 1691 if (!PL_colorset) 1692 reginitcolors(); 1693 PerlIO_printf(Perl_debug_log, 1694 "%sMatching REx%s `%s%*.*s%s%s' against `%s%.*s%s%s'\n", 1695 PL_colors[4],PL_colors[5],PL_colors[0], 1696 len0, len0, s0, 1697 PL_colors[1], 1698 len0 > 60 ? "..." : "", 1699 PL_colors[0], 1700 (int)(len1 > 60 ? 60 : len1), 1701 s1, PL_colors[1], 1702 (len1 > 60 ? "..." : "") 1703 ); 1704 }); 1705 1706 /* Simplest case: anchored match need be tried only once. */ 1707 /* [unless only anchor is BOL and multiline is set] */ 1708 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) { 1709 if (s == startpos && regtry(prog, startpos)) 1710 goto got_it; 1711 else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT) 1712 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */ 1713 { 1714 char *end; 1715 1716 if (minlen) 1717 dontbother = minlen - 1; 1718 end = HOP3c(strend, -dontbother, strbeg) - 1; 1719 /* for multiline we only have to try after newlines */ 1720 if (prog->check_substr || prog->check_utf8) { 1721 if (s == startpos) 1722 goto after_try; 1723 while (1) { 1724 if (regtry(prog, s)) 1725 goto got_it; 1726 after_try: 1727 if (s >= end) 1728 goto phooey; 1729 if (prog->reganch & RE_USE_INTUIT) { 1730 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL); 1731 if (!s) 1732 goto phooey; 1733 } 1734 else 1735 s++; 1736 } 1737 } else { 1738 if (s > startpos) 1739 s--; 1740 while (s < end) { 1741 if (*s++ == '\n') { /* don't need PL_utf8skip here */ 1742 if (regtry(prog, s)) 1743 goto got_it; 1744 } 1745 } 1746 } 1747 } 1748 goto phooey; 1749 } else if (prog->reganch & ROPT_ANCH_GPOS) { 1750 if (regtry(prog, PL_reg_ganch)) 1751 goto got_it; 1752 goto phooey; 1753 } 1754 1755 /* Messy cases: unanchored match. */ 1756 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) { 1757 /* we have /x+whatever/ */ 1758 /* it must be a one character string (XXXX Except UTF?) */ 1759 char ch; 1760 #ifdef DEBUGGING 1761 int did_match = 0; 1762 #endif 1763 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)) 1764 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog); 1765 ch = SvPVX(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0]; 1766 1767 if (do_utf8) { 1768 while (s < strend) { 1769 if (*s == ch) { 1770 DEBUG_r( did_match = 1 ); 1771 if (regtry(prog, s)) goto got_it; 1772 s += UTF8SKIP(s); 1773 while (s < strend && *s == ch) 1774 s += UTF8SKIP(s); 1775 } 1776 s += UTF8SKIP(s); 1777 } 1778 } 1779 else { 1780 while (s < strend) { 1781 if (*s == ch) { 1782 DEBUG_r( did_match = 1 ); 1783 if (regtry(prog, s)) goto got_it; 1784 s++; 1785 while (s < strend && *s == ch) 1786 s++; 1787 } 1788 s++; 1789 } 1790 } 1791 DEBUG_r(if (!did_match) 1792 PerlIO_printf(Perl_debug_log, 1793 "Did not find anchored character...\n") 1794 ); 1795 } 1796 /*SUPPRESS 560*/ 1797 else if (prog->anchored_substr != Nullsv 1798 || prog->anchored_utf8 != Nullsv 1799 || ((prog->float_substr != Nullsv || prog->float_utf8 != Nullsv) 1800 && prog->float_max_offset < strend - s)) { 1801 SV *must; 1802 I32 back_max; 1803 I32 back_min; 1804 char *last; 1805 char *last1; /* Last position checked before */ 1806 #ifdef DEBUGGING 1807 int did_match = 0; 1808 #endif 1809 if (prog->anchored_substr || prog->anchored_utf8) { 1810 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)) 1811 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog); 1812 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr; 1813 back_max = back_min = prog->anchored_offset; 1814 } else { 1815 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) 1816 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog); 1817 must = do_utf8 ? prog->float_utf8 : prog->float_substr; 1818 back_max = prog->float_max_offset; 1819 back_min = prog->float_min_offset; 1820 } 1821 if (must == &PL_sv_undef) 1822 /* could not downgrade utf8 check substring, so must fail */ 1823 goto phooey; 1824 1825 last = HOP3c(strend, /* Cannot start after this */ 1826 -(I32)(CHR_SVLEN(must) 1827 - (SvTAIL(must) != 0) + back_min), strbeg); 1828 1829 if (s > PL_bostr) 1830 last1 = HOPc(s, -1); 1831 else 1832 last1 = s - 1; /* bogus */ 1833 1834 /* XXXX check_substr already used to find `s', can optimize if 1835 check_substr==must. */ 1836 scream_pos = -1; 1837 dontbother = end_shift; 1838 strend = HOPc(strend, -dontbother); 1839 while ( (s <= last) && 1840 ((flags & REXEC_SCREAM) 1841 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg, 1842 end_shift, &scream_pos, 0)) 1843 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend), 1844 (unsigned char*)strend, must, 1845 PL_multiline ? FBMrf_MULTILINE : 0))) ) { 1846 DEBUG_r( did_match = 1 ); 1847 if (HOPc(s, -back_max) > last1) { 1848 last1 = HOPc(s, -back_min); 1849 s = HOPc(s, -back_max); 1850 } 1851 else { 1852 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1; 1853 1854 last1 = HOPc(s, -back_min); 1855 s = t; 1856 } 1857 if (do_utf8) { 1858 while (s <= last1) { 1859 if (regtry(prog, s)) 1860 goto got_it; 1861 s += UTF8SKIP(s); 1862 } 1863 } 1864 else { 1865 while (s <= last1) { 1866 if (regtry(prog, s)) 1867 goto got_it; 1868 s++; 1869 } 1870 } 1871 } 1872 DEBUG_r(if (!did_match) 1873 PerlIO_printf(Perl_debug_log, 1874 "Did not find %s substr `%s%.*s%s'%s...\n", 1875 ((must == prog->anchored_substr || must == prog->anchored_utf8) 1876 ? "anchored" : "floating"), 1877 PL_colors[0], 1878 (int)(SvCUR(must) - (SvTAIL(must)!=0)), 1879 SvPVX(must), 1880 PL_colors[1], (SvTAIL(must) ? "$" : "")) 1881 ); 1882 goto phooey; 1883 } 1884 else if ((c = prog->regstclass)) { 1885 if (minlen && PL_regkind[(U8)OP(prog->regstclass)] != EXACT) 1886 /* don't bother with what can't match */ 1887 strend = HOPc(strend, -(minlen - 1)); 1888 DEBUG_r({ 1889 SV *prop = sv_newmortal(); 1890 char *s0; 1891 char *s1; 1892 int len0; 1893 int len1; 1894 1895 regprop(prop, c); 1896 s0 = UTF ? 1897 pv_uni_display(dsv0, (U8*)SvPVX(prop), SvCUR(prop), 60, 1898 UNI_DISPLAY_REGEX) : 1899 SvPVX(prop); 1900 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop); 1901 s1 = UTF ? 1902 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s; 1903 len1 = UTF ? SvCUR(dsv1) : strend - s; 1904 PerlIO_printf(Perl_debug_log, 1905 "Matching stclass `%*.*s' against `%*.*s'\n", 1906 len0, len0, s0, 1907 len1, len1, s1); 1908 }); 1909 if (find_byclass(prog, c, s, strend, startpos, 0)) 1910 goto got_it; 1911 DEBUG_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n")); 1912 } 1913 else { 1914 dontbother = 0; 1915 if (prog->float_substr != Nullsv || prog->float_utf8 != Nullsv) { 1916 /* Trim the end. */ 1917 char *last; 1918 SV* float_real; 1919 1920 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) 1921 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog); 1922 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr; 1923 1924 if (flags & REXEC_SCREAM) { 1925 last = screaminstr(sv, float_real, s - strbeg, 1926 end_shift, &scream_pos, 1); /* last one */ 1927 if (!last) 1928 last = scream_olds; /* Only one occurrence. */ 1929 } 1930 else { 1931 STRLEN len; 1932 char *little = SvPV(float_real, len); 1933 1934 if (SvTAIL(float_real)) { 1935 if (memEQ(strend - len + 1, little, len - 1)) 1936 last = strend - len + 1; 1937 else if (!PL_multiline) 1938 last = memEQ(strend - len, little, len) 1939 ? strend - len : Nullch; 1940 else 1941 goto find_last; 1942 } else { 1943 find_last: 1944 if (len) 1945 last = rninstr(s, strend, little, little + len); 1946 else 1947 last = strend; /* matching `$' */ 1948 } 1949 } 1950 if (last == NULL) { 1951 DEBUG_r(PerlIO_printf(Perl_debug_log, 1952 "%sCan't trim the tail, match fails (should not happen)%s\n", 1953 PL_colors[4],PL_colors[5])); 1954 goto phooey; /* Should not happen! */ 1955 } 1956 dontbother = strend - last + prog->float_min_offset; 1957 } 1958 if (minlen && (dontbother < minlen)) 1959 dontbother = minlen - 1; 1960 strend -= dontbother; /* this one's always in bytes! */ 1961 /* We don't know much -- general case. */ 1962 if (do_utf8) { 1963 for (;;) { 1964 if (regtry(prog, s)) 1965 goto got_it; 1966 if (s >= strend) 1967 break; 1968 s += UTF8SKIP(s); 1969 }; 1970 } 1971 else { 1972 do { 1973 if (regtry(prog, s)) 1974 goto got_it; 1975 } while (s++ < strend); 1976 } 1977 } 1978 1979 /* Failure. */ 1980 goto phooey; 1981 1982 got_it: 1983 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted); 1984 1985 if (PL_reg_eval_set) { 1986 /* Preserve the current value of $^R */ 1987 if (oreplsv != GvSV(PL_replgv)) 1988 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is 1989 restored, the value remains 1990 the same. */ 1991 restore_pos(aTHX_ 0); 1992 } 1993 1994 /* make sure $`, $&, $', and $digit will work later */ 1995 if ( !(flags & REXEC_NOT_FIRST) ) { 1996 if (RX_MATCH_COPIED(prog)) { 1997 Safefree(prog->subbeg); 1998 RX_MATCH_COPIED_off(prog); 1999 } 2000 if (flags & REXEC_COPY_STR) { 2001 I32 i = PL_regeol - startpos + (stringarg - strbeg); 2002 2003 s = savepvn(strbeg, i); 2004 prog->subbeg = s; 2005 prog->sublen = i; 2006 RX_MATCH_COPIED_on(prog); 2007 } 2008 else { 2009 prog->subbeg = strbeg; 2010 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */ 2011 } 2012 } 2013 2014 return 1; 2015 2016 phooey: 2017 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n", 2018 PL_colors[4],PL_colors[5])); 2019 if (PL_reg_eval_set) 2020 restore_pos(aTHX_ 0); 2021 return 0; 2022 } 2023 2024 /* 2025 - regtry - try match at specific point 2026 */ 2027 STATIC I32 /* 0 failure, 1 success */ 2028 S_regtry(pTHX_ regexp *prog, char *startpos) 2029 { 2030 register I32 i; 2031 register I32 *sp; 2032 register I32 *ep; 2033 CHECKPOINT lastcp; 2034 2035 #ifdef DEBUGGING 2036 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */ 2037 #endif 2038 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) { 2039 MAGIC *mg; 2040 2041 PL_reg_eval_set = RS_init; 2042 DEBUG_r(DEBUG_s( 2043 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n", 2044 (IV)(PL_stack_sp - PL_stack_base)); 2045 )); 2046 SAVEI32(cxstack[cxstack_ix].blk_oldsp); 2047 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base; 2048 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */ 2049 SAVETMPS; 2050 /* Apparently this is not needed, judging by wantarray. */ 2051 /* SAVEI8(cxstack[cxstack_ix].blk_gimme); 2052 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */ 2053 2054 if (PL_reg_sv) { 2055 /* Make $_ available to executed code. */ 2056 if (PL_reg_sv != DEFSV) { 2057 /* SAVE_DEFSV does *not* suffice here for USE_5005THREADS */ 2058 SAVESPTR(DEFSV); 2059 DEFSV = PL_reg_sv; 2060 } 2061 2062 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv) 2063 && (mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global)))) { 2064 /* prepare for quick setting of pos */ 2065 sv_magic(PL_reg_sv, (SV*)0, 2066 PERL_MAGIC_regex_global, Nullch, 0); 2067 mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global); 2068 mg->mg_len = -1; 2069 } 2070 PL_reg_magic = mg; 2071 PL_reg_oldpos = mg->mg_len; 2072 SAVEDESTRUCTOR_X(restore_pos, 0); 2073 } 2074 if (!PL_reg_curpm) { 2075 Newz(22,PL_reg_curpm, 1, PMOP); 2076 #ifdef USE_ITHREADS 2077 { 2078 SV* repointer = newSViv(0); 2079 /* so we know which PL_regex_padav element is PL_reg_curpm */ 2080 SvFLAGS(repointer) |= SVf_BREAK; 2081 av_push(PL_regex_padav,repointer); 2082 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav); 2083 PL_regex_pad = AvARRAY(PL_regex_padav); 2084 } 2085 #endif 2086 } 2087 PM_SETRE(PL_reg_curpm, prog); 2088 PL_reg_oldcurpm = PL_curpm; 2089 PL_curpm = PL_reg_curpm; 2090 if (RX_MATCH_COPIED(prog)) { 2091 /* Here is a serious problem: we cannot rewrite subbeg, 2092 since it may be needed if this match fails. Thus 2093 $` inside (?{}) could fail... */ 2094 PL_reg_oldsaved = prog->subbeg; 2095 PL_reg_oldsavedlen = prog->sublen; 2096 RX_MATCH_COPIED_off(prog); 2097 } 2098 else 2099 PL_reg_oldsaved = Nullch; 2100 prog->subbeg = PL_bostr; 2101 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */ 2102 } 2103 prog->startp[0] = startpos - PL_bostr; 2104 PL_reginput = startpos; 2105 PL_regstartp = prog->startp; 2106 PL_regendp = prog->endp; 2107 PL_reglastparen = &prog->lastparen; 2108 PL_reglastcloseparen = &prog->lastcloseparen; 2109 prog->lastparen = 0; 2110 PL_regsize = 0; 2111 DEBUG_r(PL_reg_starttry = startpos); 2112 if (PL_reg_start_tmpl <= prog->nparens) { 2113 PL_reg_start_tmpl = prog->nparens*3/2 + 3; 2114 if(PL_reg_start_tmp) 2115 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*); 2116 else 2117 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*); 2118 } 2119 2120 /* XXXX What this code is doing here?!!! There should be no need 2121 to do this again and again, PL_reglastparen should take care of 2122 this! --ilya*/ 2123 2124 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code. 2125 * Actually, the code in regcppop() (which Ilya may be meaning by 2126 * PL_reglastparen), is not needed at all by the test suite 2127 * (op/regexp, op/pat, op/split), but that code is needed, oddly 2128 * enough, for building DynaLoader, or otherwise this 2129 * "Error: '*' not in typemap in DynaLoader.xs, line 164" 2130 * will happen. Meanwhile, this code *is* needed for the 2131 * above-mentioned test suite tests to succeed. The common theme 2132 * on those tests seems to be returning null fields from matches. 2133 * --jhi */ 2134 #if 1 2135 sp = prog->startp; 2136 ep = prog->endp; 2137 if (prog->nparens) { 2138 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) { 2139 *++sp = -1; 2140 *++ep = -1; 2141 } 2142 } 2143 #endif 2144 REGCP_SET(lastcp); 2145 if (regmatch(prog->program + 1)) { 2146 prog->endp[0] = PL_reginput - PL_bostr; 2147 return 1; 2148 } 2149 REGCP_UNWIND(lastcp); 2150 return 0; 2151 } 2152 2153 #define RE_UNWIND_BRANCH 1 2154 #define RE_UNWIND_BRANCHJ 2 2155 2156 union re_unwind_t; 2157 2158 typedef struct { /* XX: makes sense to enlarge it... */ 2159 I32 type; 2160 I32 prev; 2161 CHECKPOINT lastcp; 2162 } re_unwind_generic_t; 2163 2164 typedef struct { 2165 I32 type; 2166 I32 prev; 2167 CHECKPOINT lastcp; 2168 I32 lastparen; 2169 regnode *next; 2170 char *locinput; 2171 I32 nextchr; 2172 #ifdef DEBUGGING 2173 int regindent; 2174 #endif 2175 } re_unwind_branch_t; 2176 2177 typedef union re_unwind_t { 2178 I32 type; 2179 re_unwind_generic_t generic; 2180 re_unwind_branch_t branch; 2181 } re_unwind_t; 2182 2183 #define sayYES goto yes 2184 #define sayNO goto no 2185 #define sayNO_ANYOF goto no_anyof 2186 #define sayYES_FINAL goto yes_final 2187 #define sayYES_LOUD goto yes_loud 2188 #define sayNO_FINAL goto no_final 2189 #define sayNO_SILENT goto do_no 2190 #define saySAME(x) if (x) goto yes; else goto no 2191 2192 #define REPORT_CODE_OFF 24 2193 2194 /* 2195 - regmatch - main matching routine 2196 * 2197 * Conceptually the strategy is simple: check to see whether the current 2198 * node matches, call self recursively to see whether the rest matches, 2199 * and then act accordingly. In practice we make some effort to avoid 2200 * recursion, in particular by going through "ordinary" nodes (that don't 2201 * need to know whether the rest of the match failed) by a loop instead of 2202 * by recursion. 2203 */ 2204 /* [lwall] I've hoisted the register declarations to the outer block in order to 2205 * maybe save a little bit of pushing and popping on the stack. It also takes 2206 * advantage of machines that use a register save mask on subroutine entry. 2207 */ 2208 STATIC I32 /* 0 failure, 1 success */ 2209 S_regmatch(pTHX_ regnode *prog) 2210 { 2211 register regnode *scan; /* Current node. */ 2212 regnode *next; /* Next node. */ 2213 regnode *inner; /* Next node in internal branch. */ 2214 register I32 nextchr; /* renamed nextchr - nextchar colides with 2215 function of same name */ 2216 register I32 n; /* no or next */ 2217 register I32 ln = 0; /* len or last */ 2218 register char *s = Nullch; /* operand or save */ 2219 register char *locinput = PL_reginput; 2220 register I32 c1 = 0, c2 = 0, paren; /* case fold search, parenth */ 2221 int minmod = 0, sw = 0, logical = 0; 2222 I32 unwind = 0; 2223 #if 0 2224 I32 firstcp = PL_savestack_ix; 2225 #endif 2226 register bool do_utf8 = PL_reg_match_utf8; 2227 #ifdef DEBUGGING 2228 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0); 2229 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1); 2230 SV *dsv2 = PERL_DEBUG_PAD_ZERO(2); 2231 #endif 2232 2233 #ifdef DEBUGGING 2234 PL_regindent++; 2235 #endif 2236 2237 /* Note that nextchr is a byte even in UTF */ 2238 nextchr = UCHARAT(locinput); 2239 scan = prog; 2240 while (scan != NULL) { 2241 2242 DEBUG_r( { 2243 SV *prop = sv_newmortal(); 2244 int docolor = *PL_colors[0]; 2245 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */ 2246 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput); 2247 /* The part of the string before starttry has one color 2248 (pref0_len chars), between starttry and current 2249 position another one (pref_len - pref0_len chars), 2250 after the current position the third one. 2251 We assume that pref0_len <= pref_len, otherwise we 2252 decrease pref0_len. */ 2253 int pref_len = (locinput - PL_bostr) > (5 + taill) - l 2254 ? (5 + taill) - l : locinput - PL_bostr; 2255 int pref0_len; 2256 2257 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len))) 2258 pref_len++; 2259 pref0_len = pref_len - (locinput - PL_reg_starttry); 2260 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput) 2261 l = ( PL_regeol - locinput > (5 + taill) - pref_len 2262 ? (5 + taill) - pref_len : PL_regeol - locinput); 2263 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l))) 2264 l--; 2265 if (pref0_len < 0) 2266 pref0_len = 0; 2267 if (pref0_len > pref_len) 2268 pref0_len = pref_len; 2269 regprop(prop, scan); 2270 { 2271 char *s0 = 2272 do_utf8 ? 2273 pv_uni_display(dsv0, (U8*)(locinput - pref_len), 2274 pref0_len, 60, UNI_DISPLAY_REGEX) : 2275 locinput - pref_len; 2276 int len0 = do_utf8 ? strlen(s0) : pref0_len; 2277 char *s1 = do_utf8 ? 2278 pv_uni_display(dsv1, (U8*)(locinput - pref_len + pref0_len), 2279 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) : 2280 locinput - pref_len + pref0_len; 2281 int len1 = do_utf8 ? strlen(s1) : pref_len - pref0_len; 2282 char *s2 = do_utf8 ? 2283 pv_uni_display(dsv2, (U8*)locinput, 2284 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) : 2285 locinput; 2286 int len2 = do_utf8 ? strlen(s2) : l; 2287 PerlIO_printf(Perl_debug_log, 2288 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n", 2289 (IV)(locinput - PL_bostr), 2290 PL_colors[4], 2291 len0, s0, 2292 PL_colors[5], 2293 PL_colors[2], 2294 len1, s1, 2295 PL_colors[3], 2296 (docolor ? "" : "> <"), 2297 PL_colors[0], 2298 len2, s2, 2299 PL_colors[1], 2300 15 - l - pref_len + 1, 2301 "", 2302 (IV)(scan - PL_regprogram), PL_regindent*2, "", 2303 SvPVX(prop)); 2304 } 2305 }); 2306 2307 next = scan + NEXT_OFF(scan); 2308 if (next == scan) 2309 next = NULL; 2310 2311 switch (OP(scan)) { 2312 case BOL: 2313 if (locinput == PL_bostr || (PL_multiline && 2314 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') ) 2315 { 2316 /* regtill = regbol; */ 2317 break; 2318 } 2319 sayNO; 2320 case MBOL: 2321 if (locinput == PL_bostr || 2322 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n')) 2323 { 2324 break; 2325 } 2326 sayNO; 2327 case SBOL: 2328 if (locinput == PL_bostr) 2329 break; 2330 sayNO; 2331 case GPOS: 2332 if (locinput == PL_reg_ganch) 2333 break; 2334 sayNO; 2335 case EOL: 2336 if (PL_multiline) 2337 goto meol; 2338 else 2339 goto seol; 2340 case MEOL: 2341 meol: 2342 if ((nextchr || locinput < PL_regeol) && nextchr != '\n') 2343 sayNO; 2344 break; 2345 case SEOL: 2346 seol: 2347 if ((nextchr || locinput < PL_regeol) && nextchr != '\n') 2348 sayNO; 2349 if (PL_regeol - locinput > 1) 2350 sayNO; 2351 break; 2352 case EOS: 2353 if (PL_regeol != locinput) 2354 sayNO; 2355 break; 2356 case SANY: 2357 if (!nextchr && locinput >= PL_regeol) 2358 sayNO; 2359 if (do_utf8) { 2360 locinput += PL_utf8skip[nextchr]; 2361 if (locinput > PL_regeol) 2362 sayNO; 2363 nextchr = UCHARAT(locinput); 2364 } 2365 else 2366 nextchr = UCHARAT(++locinput); 2367 break; 2368 case CANY: 2369 if (!nextchr && locinput >= PL_regeol) 2370 sayNO; 2371 nextchr = UCHARAT(++locinput); 2372 break; 2373 case REG_ANY: 2374 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n') 2375 sayNO; 2376 if (do_utf8) { 2377 locinput += PL_utf8skip[nextchr]; 2378 if (locinput > PL_regeol) 2379 sayNO; 2380 nextchr = UCHARAT(locinput); 2381 } 2382 else 2383 nextchr = UCHARAT(++locinput); 2384 break; 2385 case EXACT: 2386 s = STRING(scan); 2387 ln = STR_LEN(scan); 2388 if (do_utf8 != UTF) { 2389 /* The target and the pattern have differing utf8ness. */ 2390 char *l = locinput; 2391 char *e = s + ln; 2392 STRLEN ulen; 2393 2394 if (do_utf8) { 2395 /* The target is utf8, the pattern is not utf8. */ 2396 while (s < e) { 2397 if (l >= PL_regeol) 2398 sayNO; 2399 if (NATIVE_TO_UNI(*(U8*)s) != 2400 utf8n_to_uvuni((U8*)l, UTF8_MAXLEN, &ulen, 2401 ckWARN(WARN_UTF8) ? 2402 0 : UTF8_ALLOW_ANY)) 2403 sayNO; 2404 l += ulen; 2405 s ++; 2406 } 2407 } 2408 else { 2409 /* The target is not utf8, the pattern is utf8. */ 2410 while (s < e) { 2411 if (l >= PL_regeol) 2412 sayNO; 2413 if (NATIVE_TO_UNI(*((U8*)l)) != 2414 utf8n_to_uvuni((U8*)s, UTF8_MAXLEN, &ulen, 2415 ckWARN(WARN_UTF8) ? 2416 0 : UTF8_ALLOW_ANY)) 2417 sayNO; 2418 s += ulen; 2419 l ++; 2420 } 2421 } 2422 locinput = l; 2423 nextchr = UCHARAT(locinput); 2424 break; 2425 } 2426 /* The target and the pattern have the same utf8ness. */ 2427 /* Inline the first character, for speed. */ 2428 if (UCHARAT(s) != nextchr) 2429 sayNO; 2430 if (PL_regeol - locinput < ln) 2431 sayNO; 2432 if (ln > 1 && memNE(s, locinput, ln)) 2433 sayNO; 2434 locinput += ln; 2435 nextchr = UCHARAT(locinput); 2436 break; 2437 case EXACTFL: 2438 PL_reg_flags |= RF_tainted; 2439 /* FALL THROUGH */ 2440 case EXACTF: 2441 s = STRING(scan); 2442 ln = STR_LEN(scan); 2443 2444 if (do_utf8 || UTF) { 2445 /* Either target or the pattern are utf8. */ 2446 char *l = locinput; 2447 char *e = PL_regeol; 2448 2449 if (ibcmp_utf8(s, 0, ln, (bool)UTF, 2450 l, &e, 0, do_utf8)) { 2451 /* One more case for the sharp s: 2452 * pack("U0U*", 0xDF) =~ /ss/i, 2453 * the 0xC3 0x9F are the UTF-8 2454 * byte sequence for the U+00DF. */ 2455 if (!(do_utf8 && 2456 toLOWER(s[0]) == 's' && 2457 ln >= 2 && 2458 toLOWER(s[1]) == 's' && 2459 (U8)l[0] == 0xC3 && 2460 e - l >= 2 && 2461 (U8)l[1] == 0x9F)) 2462 sayNO; 2463 } 2464 locinput = e; 2465 nextchr = UCHARAT(locinput); 2466 break; 2467 } 2468 2469 /* Neither the target and the pattern are utf8. */ 2470 2471 /* Inline the first character, for speed. */ 2472 if (UCHARAT(s) != nextchr && 2473 UCHARAT(s) != ((OP(scan) == EXACTF) 2474 ? PL_fold : PL_fold_locale)[nextchr]) 2475 sayNO; 2476 if (PL_regeol - locinput < ln) 2477 sayNO; 2478 if (ln > 1 && (OP(scan) == EXACTF 2479 ? ibcmp(s, locinput, ln) 2480 : ibcmp_locale(s, locinput, ln))) 2481 sayNO; 2482 locinput += ln; 2483 nextchr = UCHARAT(locinput); 2484 break; 2485 case ANYOF: 2486 if (do_utf8) { 2487 STRLEN inclasslen = PL_regeol - locinput; 2488 2489 if (!reginclass(scan, (U8*)locinput, &inclasslen, do_utf8)) 2490 sayNO_ANYOF; 2491 if (locinput >= PL_regeol) 2492 sayNO; 2493 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput); 2494 nextchr = UCHARAT(locinput); 2495 break; 2496 } 2497 else { 2498 if (nextchr < 0) 2499 nextchr = UCHARAT(locinput); 2500 if (!REGINCLASS(scan, (U8*)locinput)) 2501 sayNO_ANYOF; 2502 if (!nextchr && locinput >= PL_regeol) 2503 sayNO; 2504 nextchr = UCHARAT(++locinput); 2505 break; 2506 } 2507 no_anyof: 2508 /* If we might have the case of the German sharp s 2509 * in a casefolding Unicode character class. */ 2510 2511 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) { 2512 locinput += SHARP_S_SKIP; 2513 nextchr = UCHARAT(locinput); 2514 } 2515 else 2516 sayNO; 2517 break; 2518 case ALNUML: 2519 PL_reg_flags |= RF_tainted; 2520 /* FALL THROUGH */ 2521 case ALNUM: 2522 if (!nextchr) 2523 sayNO; 2524 if (do_utf8) { 2525 LOAD_UTF8_CHARCLASS(alnum,"a"); 2526 if (!(OP(scan) == ALNUM 2527 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8) 2528 : isALNUM_LC_utf8((U8*)locinput))) 2529 { 2530 sayNO; 2531 } 2532 locinput += PL_utf8skip[nextchr]; 2533 nextchr = UCHARAT(locinput); 2534 break; 2535 } 2536 if (!(OP(scan) == ALNUM 2537 ? isALNUM(nextchr) : isALNUM_LC(nextchr))) 2538 sayNO; 2539 nextchr = UCHARAT(++locinput); 2540 break; 2541 case NALNUML: 2542 PL_reg_flags |= RF_tainted; 2543 /* FALL THROUGH */ 2544 case NALNUM: 2545 if (!nextchr && locinput >= PL_regeol) 2546 sayNO; 2547 if (do_utf8) { 2548 LOAD_UTF8_CHARCLASS(alnum,"a"); 2549 if (OP(scan) == NALNUM 2550 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8) 2551 : isALNUM_LC_utf8((U8*)locinput)) 2552 { 2553 sayNO; 2554 } 2555 locinput += PL_utf8skip[nextchr]; 2556 nextchr = UCHARAT(locinput); 2557 break; 2558 } 2559 if (OP(scan) == NALNUM 2560 ? isALNUM(nextchr) : isALNUM_LC(nextchr)) 2561 sayNO; 2562 nextchr = UCHARAT(++locinput); 2563 break; 2564 case BOUNDL: 2565 case NBOUNDL: 2566 PL_reg_flags |= RF_tainted; 2567 /* FALL THROUGH */ 2568 case BOUND: 2569 case NBOUND: 2570 /* was last char in word? */ 2571 if (do_utf8) { 2572 if (locinput == PL_bostr) 2573 ln = '\n'; 2574 else { 2575 U8 *r = reghop3((U8*)locinput, -1, (U8*)PL_bostr); 2576 2577 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0); 2578 } 2579 if (OP(scan) == BOUND || OP(scan) == NBOUND) { 2580 ln = isALNUM_uni(ln); 2581 LOAD_UTF8_CHARCLASS(alnum,"a"); 2582 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8); 2583 } 2584 else { 2585 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln)); 2586 n = isALNUM_LC_utf8((U8*)locinput); 2587 } 2588 } 2589 else { 2590 ln = (locinput != PL_bostr) ? 2591 UCHARAT(locinput - 1) : '\n'; 2592 if (OP(scan) == BOUND || OP(scan) == NBOUND) { 2593 ln = isALNUM(ln); 2594 n = isALNUM(nextchr); 2595 } 2596 else { 2597 ln = isALNUM_LC(ln); 2598 n = isALNUM_LC(nextchr); 2599 } 2600 } 2601 if (((!ln) == (!n)) == (OP(scan) == BOUND || 2602 OP(scan) == BOUNDL)) 2603 sayNO; 2604 break; 2605 case SPACEL: 2606 PL_reg_flags |= RF_tainted; 2607 /* FALL THROUGH */ 2608 case SPACE: 2609 if (!nextchr) 2610 sayNO; 2611 if (do_utf8) { 2612 if (UTF8_IS_CONTINUED(nextchr)) { 2613 LOAD_UTF8_CHARCLASS(space," "); 2614 if (!(OP(scan) == SPACE 2615 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8) 2616 : isSPACE_LC_utf8((U8*)locinput))) 2617 { 2618 sayNO; 2619 } 2620 locinput += PL_utf8skip[nextchr]; 2621 nextchr = UCHARAT(locinput); 2622 break; 2623 } 2624 if (!(OP(scan) == SPACE 2625 ? isSPACE(nextchr) : isSPACE_LC(nextchr))) 2626 sayNO; 2627 nextchr = UCHARAT(++locinput); 2628 } 2629 else { 2630 if (!(OP(scan) == SPACE 2631 ? isSPACE(nextchr) : isSPACE_LC(nextchr))) 2632 sayNO; 2633 nextchr = UCHARAT(++locinput); 2634 } 2635 break; 2636 case NSPACEL: 2637 PL_reg_flags |= RF_tainted; 2638 /* FALL THROUGH */ 2639 case NSPACE: 2640 if (!nextchr && locinput >= PL_regeol) 2641 sayNO; 2642 if (do_utf8) { 2643 LOAD_UTF8_CHARCLASS(space," "); 2644 if (OP(scan) == NSPACE 2645 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8) 2646 : isSPACE_LC_utf8((U8*)locinput)) 2647 { 2648 sayNO; 2649 } 2650 locinput += PL_utf8skip[nextchr]; 2651 nextchr = UCHARAT(locinput); 2652 break; 2653 } 2654 if (OP(scan) == NSPACE 2655 ? isSPACE(nextchr) : isSPACE_LC(nextchr)) 2656 sayNO; 2657 nextchr = UCHARAT(++locinput); 2658 break; 2659 case DIGITL: 2660 PL_reg_flags |= RF_tainted; 2661 /* FALL THROUGH */ 2662 case DIGIT: 2663 if (!nextchr) 2664 sayNO; 2665 if (do_utf8) { 2666 LOAD_UTF8_CHARCLASS(digit,"0"); 2667 if (!(OP(scan) == DIGIT 2668 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8) 2669 : isDIGIT_LC_utf8((U8*)locinput))) 2670 { 2671 sayNO; 2672 } 2673 locinput += PL_utf8skip[nextchr]; 2674 nextchr = UCHARAT(locinput); 2675 break; 2676 } 2677 if (!(OP(scan) == DIGIT 2678 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))) 2679 sayNO; 2680 nextchr = UCHARAT(++locinput); 2681 break; 2682 case NDIGITL: 2683 PL_reg_flags |= RF_tainted; 2684 /* FALL THROUGH */ 2685 case NDIGIT: 2686 if (!nextchr && locinput >= PL_regeol) 2687 sayNO; 2688 if (do_utf8) { 2689 LOAD_UTF8_CHARCLASS(digit,"0"); 2690 if (OP(scan) == NDIGIT 2691 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8) 2692 : isDIGIT_LC_utf8((U8*)locinput)) 2693 { 2694 sayNO; 2695 } 2696 locinput += PL_utf8skip[nextchr]; 2697 nextchr = UCHARAT(locinput); 2698 break; 2699 } 2700 if (OP(scan) == NDIGIT 2701 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)) 2702 sayNO; 2703 nextchr = UCHARAT(++locinput); 2704 break; 2705 case CLUMP: 2706 if (locinput >= PL_regeol) 2707 sayNO; 2708 if (do_utf8) { 2709 LOAD_UTF8_CHARCLASS(mark,"~"); 2710 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8)) 2711 sayNO; 2712 locinput += PL_utf8skip[nextchr]; 2713 while (locinput < PL_regeol && 2714 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8)) 2715 locinput += UTF8SKIP(locinput); 2716 if (locinput > PL_regeol) 2717 sayNO; 2718 } 2719 else 2720 locinput++; 2721 nextchr = UCHARAT(locinput); 2722 break; 2723 case REFFL: 2724 PL_reg_flags |= RF_tainted; 2725 /* FALL THROUGH */ 2726 case REF: 2727 case REFF: 2728 n = ARG(scan); /* which paren pair */ 2729 ln = PL_regstartp[n]; 2730 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */ 2731 if ((I32)*PL_reglastparen < n || ln == -1) 2732 sayNO; /* Do not match unless seen CLOSEn. */ 2733 if (ln == PL_regendp[n]) 2734 break; 2735 2736 s = PL_bostr + ln; 2737 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */ 2738 char *l = locinput; 2739 char *e = PL_bostr + PL_regendp[n]; 2740 /* 2741 * Note that we can't do the "other character" lookup trick as 2742 * in the 8-bit case (no pun intended) because in Unicode we 2743 * have to map both upper and title case to lower case. 2744 */ 2745 if (OP(scan) == REFF) { 2746 STRLEN ulen1, ulen2; 2747 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1]; 2748 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1]; 2749 while (s < e) { 2750 if (l >= PL_regeol) 2751 sayNO; 2752 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1); 2753 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2); 2754 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1)) 2755 sayNO; 2756 s += ulen1; 2757 l += ulen2; 2758 } 2759 } 2760 locinput = l; 2761 nextchr = UCHARAT(locinput); 2762 break; 2763 } 2764 2765 /* Inline the first character, for speed. */ 2766 if (UCHARAT(s) != nextchr && 2767 (OP(scan) == REF || 2768 (UCHARAT(s) != ((OP(scan) == REFF 2769 ? PL_fold : PL_fold_locale)[nextchr])))) 2770 sayNO; 2771 ln = PL_regendp[n] - ln; 2772 if (locinput + ln > PL_regeol) 2773 sayNO; 2774 if (ln > 1 && (OP(scan) == REF 2775 ? memNE(s, locinput, ln) 2776 : (OP(scan) == REFF 2777 ? ibcmp(s, locinput, ln) 2778 : ibcmp_locale(s, locinput, ln)))) 2779 sayNO; 2780 locinput += ln; 2781 nextchr = UCHARAT(locinput); 2782 break; 2783 2784 case NOTHING: 2785 case TAIL: 2786 break; 2787 case BACK: 2788 break; 2789 case EVAL: 2790 { 2791 dSP; 2792 OP_4tree *oop = PL_op; 2793 COP *ocurcop = PL_curcop; 2794 SV **ocurpad = PL_curpad; 2795 SV *ret; 2796 2797 n = ARG(scan); 2798 PL_op = (OP_4tree*)PL_regdata->data[n]; 2799 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) ); 2800 PL_curpad = AvARRAY((AV*)PL_regdata->data[n + 2]); 2801 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr; 2802 2803 { 2804 SV **before = SP; 2805 CALLRUNOPS(aTHX); /* Scalar context. */ 2806 SPAGAIN; 2807 if (SP == before) 2808 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */ 2809 else { 2810 ret = POPs; 2811 PUTBACK; 2812 } 2813 } 2814 2815 PL_op = oop; 2816 PL_curpad = ocurpad; 2817 PL_curcop = ocurcop; 2818 if (logical) { 2819 if (logical == 2) { /* Postponed subexpression. */ 2820 regexp *re; 2821 MAGIC *mg = Null(MAGIC*); 2822 re_cc_state state; 2823 CHECKPOINT cp, lastcp; 2824 2825 if(SvROK(ret) || SvRMAGICAL(ret)) { 2826 SV *sv = SvROK(ret) ? SvRV(ret) : ret; 2827 2828 if(SvMAGICAL(sv)) 2829 mg = mg_find(sv, PERL_MAGIC_qr); 2830 } 2831 if (mg) { 2832 re = (regexp *)mg->mg_obj; 2833 (void)ReREFCNT_inc(re); 2834 } 2835 else { 2836 STRLEN len; 2837 char *t = SvPV(ret, len); 2838 PMOP pm; 2839 char *oprecomp = PL_regprecomp; 2840 I32 osize = PL_regsize; 2841 I32 onpar = PL_regnpar; 2842 2843 Zero(&pm, 1, PMOP); 2844 re = CALLREGCOMP(aTHX_ t, t + len, &pm); 2845 if (!(SvFLAGS(ret) 2846 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY))) 2847 sv_magic(ret,(SV*)ReREFCNT_inc(re), 2848 PERL_MAGIC_qr,0,0); 2849 PL_regprecomp = oprecomp; 2850 PL_regsize = osize; 2851 PL_regnpar = onpar; 2852 } 2853 DEBUG_r( 2854 PerlIO_printf(Perl_debug_log, 2855 "Entering embedded `%s%.60s%s%s'\n", 2856 PL_colors[0], 2857 re->precomp, 2858 PL_colors[1], 2859 (strlen(re->precomp) > 60 ? "..." : "")) 2860 ); 2861 state.node = next; 2862 state.prev = PL_reg_call_cc; 2863 state.cc = PL_regcc; 2864 state.re = PL_reg_re; 2865 2866 PL_regcc = 0; 2867 2868 cp = regcppush(0); /* Save *all* the positions. */ 2869 REGCP_SET(lastcp); 2870 cache_re(re); 2871 state.ss = PL_savestack_ix; 2872 *PL_reglastparen = 0; 2873 *PL_reglastcloseparen = 0; 2874 PL_reg_call_cc = &state; 2875 PL_reginput = locinput; 2876 2877 /* XXXX This is too dramatic a measure... */ 2878 PL_reg_maxiter = 0; 2879 2880 if (regmatch(re->program + 1)) { 2881 /* Even though we succeeded, we need to restore 2882 global variables, since we may be wrapped inside 2883 SUSPEND, thus the match may be not finished yet. */ 2884 2885 /* XXXX Do this only if SUSPENDed? */ 2886 PL_reg_call_cc = state.prev; 2887 PL_regcc = state.cc; 2888 PL_reg_re = state.re; 2889 cache_re(PL_reg_re); 2890 2891 /* XXXX This is too dramatic a measure... */ 2892 PL_reg_maxiter = 0; 2893 2894 /* These are needed even if not SUSPEND. */ 2895 ReREFCNT_dec(re); 2896 regcpblow(cp); 2897 sayYES; 2898 } 2899 ReREFCNT_dec(re); 2900 REGCP_UNWIND(lastcp); 2901 regcppop(); 2902 PL_reg_call_cc = state.prev; 2903 PL_regcc = state.cc; 2904 PL_reg_re = state.re; 2905 cache_re(PL_reg_re); 2906 2907 /* XXXX This is too dramatic a measure... */ 2908 PL_reg_maxiter = 0; 2909 2910 logical = 0; 2911 sayNO; 2912 } 2913 sw = SvTRUE(ret); 2914 logical = 0; 2915 } 2916 else 2917 sv_setsv(save_scalar(PL_replgv), ret); 2918 break; 2919 } 2920 case OPEN: 2921 n = ARG(scan); /* which paren pair */ 2922 PL_reg_start_tmp[n] = locinput; 2923 if (n > PL_regsize) 2924 PL_regsize = n; 2925 break; 2926 case CLOSE: 2927 n = ARG(scan); /* which paren pair */ 2928 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr; 2929 PL_regendp[n] = locinput - PL_bostr; 2930 if (n > (I32)*PL_reglastparen) 2931 *PL_reglastparen = n; 2932 *PL_reglastcloseparen = n; 2933 break; 2934 case GROUPP: 2935 n = ARG(scan); /* which paren pair */ 2936 sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1); 2937 break; 2938 case IFTHEN: 2939 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */ 2940 if (sw) 2941 next = NEXTOPER(NEXTOPER(scan)); 2942 else { 2943 next = scan + ARG(scan); 2944 if (OP(next) == IFTHEN) /* Fake one. */ 2945 next = NEXTOPER(NEXTOPER(next)); 2946 } 2947 break; 2948 case LOGICAL: 2949 logical = scan->flags; 2950 break; 2951 /******************************************************************* 2952 PL_regcc contains infoblock about the innermost (...)* loop, and 2953 a pointer to the next outer infoblock. 2954 2955 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM): 2956 2957 1) After matching X, regnode for CURLYX is processed; 2958 2959 2) This regnode creates infoblock on the stack, and calls 2960 regmatch() recursively with the starting point at WHILEM node; 2961 2962 3) Each hit of WHILEM node tries to match A and Z (in the order 2963 depending on the current iteration, min/max of {min,max} and 2964 greediness). The information about where are nodes for "A" 2965 and "Z" is read from the infoblock, as is info on how many times "A" 2966 was already matched, and greediness. 2967 2968 4) After A matches, the same WHILEM node is hit again. 2969 2970 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX 2971 of the same pair. Thus when WHILEM tries to match Z, it temporarily 2972 resets PL_regcc, since this Y(A)*Z can be a part of some other loop: 2973 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node 2974 of the external loop. 2975 2976 Currently present infoblocks form a tree with a stem formed by PL_curcc 2977 and whatever it mentions via ->next, and additional attached trees 2978 corresponding to temporarily unset infoblocks as in "5" above. 2979 2980 In the following picture infoblocks for outer loop of 2981 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block 2982 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed 2983 infoblocks are drawn below the "reset" infoblock. 2984 2985 In fact in the picture below we do not show failed matches for Z and T 2986 by WHILEM blocks. [We illustrate minimal matches, since for them it is 2987 more obvious *why* one needs to *temporary* unset infoblocks.] 2988 2989 Matched REx position InfoBlocks Comment 2990 (Y(A)*?Z)*?T x 2991 Y(A)*?Z)*?T x <- O 2992 Y (A)*?Z)*?T x <- O 2993 Y A)*?Z)*?T x <- O <- I 2994 YA )*?Z)*?T x <- O <- I 2995 YA A)*?Z)*?T x <- O <- I 2996 YAA )*?Z)*?T x <- O <- I 2997 YAA Z)*?T x <- O # Temporary unset I 2998 I 2999 3000 YAAZ Y(A)*?Z)*?T x <- O 3001 I 3002 3003 YAAZY (A)*?Z)*?T x <- O 3004 I 3005 3006 YAAZY A)*?Z)*?T x <- O <- I 3007 I 3008 3009 YAAZYA )*?Z)*?T x <- O <- I 3010 I 3011 3012 YAAZYA Z)*?T x <- O # Temporary unset I 3013 I,I 3014 3015 YAAZYAZ )*?T x <- O 3016 I,I 3017 3018 YAAZYAZ T x # Temporary unset O 3019 O 3020 I,I 3021 3022 YAAZYAZT x 3023 O 3024 I,I 3025 *******************************************************************/ 3026 case CURLYX: { 3027 CURCUR cc; 3028 CHECKPOINT cp = PL_savestack_ix; 3029 /* No need to save/restore up to this paren */ 3030 I32 parenfloor = scan->flags; 3031 3032 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */ 3033 next += ARG(next); 3034 cc.oldcc = PL_regcc; 3035 PL_regcc = &cc; 3036 /* XXXX Probably it is better to teach regpush to support 3037 parenfloor > PL_regsize... */ 3038 if (parenfloor > (I32)*PL_reglastparen) 3039 parenfloor = *PL_reglastparen; /* Pessimization... */ 3040 cc.parenfloor = parenfloor; 3041 cc.cur = -1; 3042 cc.min = ARG1(scan); 3043 cc.max = ARG2(scan); 3044 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS; 3045 cc.next = next; 3046 cc.minmod = minmod; 3047 cc.lastloc = 0; 3048 PL_reginput = locinput; 3049 n = regmatch(PREVOPER(next)); /* start on the WHILEM */ 3050 regcpblow(cp); 3051 PL_regcc = cc.oldcc; 3052 saySAME(n); 3053 } 3054 /* NOT REACHED */ 3055 case WHILEM: { 3056 /* 3057 * This is really hard to understand, because after we match 3058 * what we're trying to match, we must make sure the rest of 3059 * the REx is going to match for sure, and to do that we have 3060 * to go back UP the parse tree by recursing ever deeper. And 3061 * if it fails, we have to reset our parent's current state 3062 * that we can try again after backing off. 3063 */ 3064 3065 CHECKPOINT cp, lastcp; 3066 CURCUR* cc = PL_regcc; 3067 char *lastloc = cc->lastloc; /* Detection of 0-len. */ 3068 3069 n = cc->cur + 1; /* how many we know we matched */ 3070 PL_reginput = locinput; 3071 3072 DEBUG_r( 3073 PerlIO_printf(Perl_debug_log, 3074 "%*s %ld out of %ld..%ld cc=%"UVxf"\n", 3075 REPORT_CODE_OFF+PL_regindent*2, "", 3076 (long)n, (long)cc->min, 3077 (long)cc->max, PTR2UV(cc)) 3078 ); 3079 3080 /* If degenerate scan matches "", assume scan done. */ 3081 3082 if (locinput == cc->lastloc && n >= cc->min) { 3083 PL_regcc = cc->oldcc; 3084 if (PL_regcc) 3085 ln = PL_regcc->cur; 3086 DEBUG_r( 3087 PerlIO_printf(Perl_debug_log, 3088 "%*s empty match detected, try continuation...\n", 3089 REPORT_CODE_OFF+PL_regindent*2, "") 3090 ); 3091 if (regmatch(cc->next)) 3092 sayYES; 3093 if (PL_regcc) 3094 PL_regcc->cur = ln; 3095 PL_regcc = cc; 3096 sayNO; 3097 } 3098 3099 /* First just match a string of min scans. */ 3100 3101 if (n < cc->min) { 3102 cc->cur = n; 3103 cc->lastloc = locinput; 3104 if (regmatch(cc->scan)) 3105 sayYES; 3106 cc->cur = n - 1; 3107 cc->lastloc = lastloc; 3108 sayNO; 3109 } 3110 3111 if (scan->flags) { 3112 /* Check whether we already were at this position. 3113 Postpone detection until we know the match is not 3114 *that* much linear. */ 3115 if (!PL_reg_maxiter) { 3116 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4); 3117 PL_reg_leftiter = PL_reg_maxiter; 3118 } 3119 if (PL_reg_leftiter-- == 0) { 3120 I32 size = (PL_reg_maxiter + 7)/8; 3121 if (PL_reg_poscache) { 3122 if ((I32)PL_reg_poscache_size < size) { 3123 Renew(PL_reg_poscache, size, char); 3124 PL_reg_poscache_size = size; 3125 } 3126 Zero(PL_reg_poscache, size, char); 3127 } 3128 else { 3129 PL_reg_poscache_size = size; 3130 Newz(29, PL_reg_poscache, size, char); 3131 } 3132 DEBUG_r( 3133 PerlIO_printf(Perl_debug_log, 3134 "%sDetected a super-linear match, switching on caching%s...\n", 3135 PL_colors[4], PL_colors[5]) 3136 ); 3137 } 3138 if (PL_reg_leftiter < 0) { 3139 I32 o = locinput - PL_bostr, b; 3140 3141 o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4); 3142 b = o % 8; 3143 o /= 8; 3144 if (PL_reg_poscache[o] & (1<<b)) { 3145 DEBUG_r( 3146 PerlIO_printf(Perl_debug_log, 3147 "%*s already tried at this position...\n", 3148 REPORT_CODE_OFF+PL_regindent*2, "") 3149 ); 3150 sayNO_SILENT; 3151 } 3152 PL_reg_poscache[o] |= (1<<b); 3153 } 3154 } 3155 3156 /* Prefer next over scan for minimal matching. */ 3157 3158 if (cc->minmod) { 3159 PL_regcc = cc->oldcc; 3160 if (PL_regcc) 3161 ln = PL_regcc->cur; 3162 cp = regcppush(cc->parenfloor); 3163 REGCP_SET(lastcp); 3164 if (regmatch(cc->next)) { 3165 regcpblow(cp); 3166 sayYES; /* All done. */ 3167 } 3168 REGCP_UNWIND(lastcp); 3169 regcppop(); 3170 if (PL_regcc) 3171 PL_regcc->cur = ln; 3172 PL_regcc = cc; 3173 3174 if (n >= cc->max) { /* Maximum greed exceeded? */ 3175 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY 3176 && !(PL_reg_flags & RF_warned)) { 3177 PL_reg_flags |= RF_warned; 3178 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded", 3179 "Complex regular subexpression recursion", 3180 REG_INFTY - 1); 3181 } 3182 sayNO; 3183 } 3184 3185 DEBUG_r( 3186 PerlIO_printf(Perl_debug_log, 3187 "%*s trying longer...\n", 3188 REPORT_CODE_OFF+PL_regindent*2, "") 3189 ); 3190 /* Try scanning more and see if it helps. */ 3191 PL_reginput = locinput; 3192 cc->cur = n; 3193 cc->lastloc = locinput; 3194 cp = regcppush(cc->parenfloor); 3195 REGCP_SET(lastcp); 3196 if (regmatch(cc->scan)) { 3197 regcpblow(cp); 3198 sayYES; 3199 } 3200 REGCP_UNWIND(lastcp); 3201 regcppop(); 3202 cc->cur = n - 1; 3203 cc->lastloc = lastloc; 3204 sayNO; 3205 } 3206 3207 /* Prefer scan over next for maximal matching. */ 3208 3209 if (n < cc->max) { /* More greed allowed? */ 3210 cp = regcppush(cc->parenfloor); 3211 cc->cur = n; 3212 cc->lastloc = locinput; 3213 REGCP_SET(lastcp); 3214 if (regmatch(cc->scan)) { 3215 regcpblow(cp); 3216 sayYES; 3217 } 3218 REGCP_UNWIND(lastcp); 3219 regcppop(); /* Restore some previous $<digit>s? */ 3220 PL_reginput = locinput; 3221 DEBUG_r( 3222 PerlIO_printf(Perl_debug_log, 3223 "%*s failed, try continuation...\n", 3224 REPORT_CODE_OFF+PL_regindent*2, "") 3225 ); 3226 } 3227 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY 3228 && !(PL_reg_flags & RF_warned)) { 3229 PL_reg_flags |= RF_warned; 3230 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded", 3231 "Complex regular subexpression recursion", 3232 REG_INFTY - 1); 3233 } 3234 3235 /* Failed deeper matches of scan, so see if this one works. */ 3236 PL_regcc = cc->oldcc; 3237 if (PL_regcc) 3238 ln = PL_regcc->cur; 3239 if (regmatch(cc->next)) 3240 sayYES; 3241 if (PL_regcc) 3242 PL_regcc->cur = ln; 3243 PL_regcc = cc; 3244 cc->cur = n - 1; 3245 cc->lastloc = lastloc; 3246 sayNO; 3247 } 3248 /* NOT REACHED */ 3249 case BRANCHJ: 3250 next = scan + ARG(scan); 3251 if (next == scan) 3252 next = NULL; 3253 inner = NEXTOPER(NEXTOPER(scan)); 3254 goto do_branch; 3255 case BRANCH: 3256 inner = NEXTOPER(scan); 3257 do_branch: 3258 { 3259 c1 = OP(scan); 3260 if (OP(next) != c1) /* No choice. */ 3261 next = inner; /* Avoid recursion. */ 3262 else { 3263 I32 lastparen = *PL_reglastparen; 3264 I32 unwind1; 3265 re_unwind_branch_t *uw; 3266 3267 /* Put unwinding data on stack */ 3268 unwind1 = SSNEWt(1,re_unwind_branch_t); 3269 uw = SSPTRt(unwind1,re_unwind_branch_t); 3270 uw->prev = unwind; 3271 unwind = unwind1; 3272 uw->type = ((c1 == BRANCH) 3273 ? RE_UNWIND_BRANCH 3274 : RE_UNWIND_BRANCHJ); 3275 uw->lastparen = lastparen; 3276 uw->next = next; 3277 uw->locinput = locinput; 3278 uw->nextchr = nextchr; 3279 #ifdef DEBUGGING 3280 uw->regindent = ++PL_regindent; 3281 #endif 3282 3283 REGCP_SET(uw->lastcp); 3284 3285 /* Now go into the first branch */ 3286 next = inner; 3287 } 3288 } 3289 break; 3290 case MINMOD: 3291 minmod = 1; 3292 break; 3293 case CURLYM: 3294 { 3295 I32 l = 0; 3296 CHECKPOINT lastcp; 3297 3298 /* We suppose that the next guy does not need 3299 backtracking: in particular, it is of constant length, 3300 and has no parenths to influence future backrefs. */ 3301 ln = ARG1(scan); /* min to match */ 3302 n = ARG2(scan); /* max to match */ 3303 paren = scan->flags; 3304 if (paren) { 3305 if (paren > PL_regsize) 3306 PL_regsize = paren; 3307 if (paren > (I32)*PL_reglastparen) 3308 *PL_reglastparen = paren; 3309 } 3310 scan = NEXTOPER(scan) + NODE_STEP_REGNODE; 3311 if (paren) 3312 scan += NEXT_OFF(scan); /* Skip former OPEN. */ 3313 PL_reginput = locinput; 3314 if (minmod) { 3315 minmod = 0; 3316 if (ln && regrepeat_hard(scan, ln, &l) < ln) 3317 sayNO; 3318 /* if we matched something zero-length we don't need to 3319 backtrack - capturing parens are already defined, so 3320 the caveat in the maximal case doesn't apply 3321 3322 XXXX if ln == 0, we can redo this check first time 3323 through the following loop 3324 */ 3325 if (ln && l == 0) 3326 n = ln; /* don't backtrack */ 3327 locinput = PL_reginput; 3328 if (HAS_TEXT(next) || JUMPABLE(next)) { 3329 regnode *text_node = next; 3330 3331 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node); 3332 3333 if (! HAS_TEXT(text_node)) c1 = c2 = -1000; 3334 else { 3335 if (PL_regkind[(U8)OP(text_node)] == REF) { 3336 I32 n, ln; 3337 n = ARG(text_node); /* which paren pair */ 3338 ln = PL_regstartp[n]; 3339 /* assume yes if we haven't seen CLOSEn */ 3340 if ( 3341 (I32)*PL_reglastparen < n || 3342 ln == -1 || 3343 ln == PL_regendp[n] 3344 ) { 3345 c1 = c2 = -1000; 3346 goto assume_ok_MM; 3347 } 3348 c1 = *(PL_bostr + ln); 3349 } 3350 else { c1 = (U8)*STRING(text_node); } 3351 if (OP(text_node) == EXACTF || OP(text_node) == REFF) 3352 c2 = PL_fold[c1]; 3353 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL) 3354 c2 = PL_fold_locale[c1]; 3355 else 3356 c2 = c1; 3357 } 3358 } 3359 else 3360 c1 = c2 = -1000; 3361 assume_ok_MM: 3362 REGCP_SET(lastcp); 3363 /* This may be improved if l == 0. */ 3364 while (n >= ln || (n == REG_INFTY && ln > 0 && l)) { /* ln overflow ? */ 3365 /* If it could work, try it. */ 3366 if (c1 == -1000 || 3367 UCHARAT(PL_reginput) == c1 || 3368 UCHARAT(PL_reginput) == c2) 3369 { 3370 if (paren) { 3371 if (ln) { 3372 PL_regstartp[paren] = 3373 HOPc(PL_reginput, -l) - PL_bostr; 3374 PL_regendp[paren] = PL_reginput - PL_bostr; 3375 } 3376 else 3377 PL_regendp[paren] = -1; 3378 } 3379 if (regmatch(next)) 3380 sayYES; 3381 REGCP_UNWIND(lastcp); 3382 } 3383 /* Couldn't or didn't -- move forward. */ 3384 PL_reginput = locinput; 3385 if (regrepeat_hard(scan, 1, &l)) { 3386 ln++; 3387 locinput = PL_reginput; 3388 } 3389 else 3390 sayNO; 3391 } 3392 } 3393 else { 3394 n = regrepeat_hard(scan, n, &l); 3395 /* if we matched something zero-length we don't need to 3396 backtrack, unless the minimum count is zero and we 3397 are capturing the result - in that case the capture 3398 being defined or not may affect later execution 3399 */ 3400 if (n != 0 && l == 0 && !(paren && ln == 0)) 3401 ln = n; /* don't backtrack */ 3402 locinput = PL_reginput; 3403 DEBUG_r( 3404 PerlIO_printf(Perl_debug_log, 3405 "%*s matched %"IVdf" times, len=%"IVdf"...\n", 3406 (int)(REPORT_CODE_OFF+PL_regindent*2), "", 3407 (IV) n, (IV)l) 3408 ); 3409 if (n >= ln) { 3410 if (HAS_TEXT(next) || JUMPABLE(next)) { 3411 regnode *text_node = next; 3412 3413 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node); 3414 3415 if (! HAS_TEXT(text_node)) c1 = c2 = -1000; 3416 else { 3417 if (PL_regkind[(U8)OP(text_node)] == REF) { 3418 I32 n, ln; 3419 n = ARG(text_node); /* which paren pair */ 3420 ln = PL_regstartp[n]; 3421 /* assume yes if we haven't seen CLOSEn */ 3422 if ( 3423 (I32)*PL_reglastparen < n || 3424 ln == -1 || 3425 ln == PL_regendp[n] 3426 ) { 3427 c1 = c2 = -1000; 3428 goto assume_ok_REG; 3429 } 3430 c1 = *(PL_bostr + ln); 3431 } 3432 else { c1 = (U8)*STRING(text_node); } 3433 3434 if (OP(text_node) == EXACTF || OP(text_node) == REFF) 3435 c2 = PL_fold[c1]; 3436 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL) 3437 c2 = PL_fold_locale[c1]; 3438 else 3439 c2 = c1; 3440 } 3441 } 3442 else 3443 c1 = c2 = -1000; 3444 } 3445 assume_ok_REG: 3446 REGCP_SET(lastcp); 3447 while (n >= ln) { 3448 /* If it could work, try it. */ 3449 if (c1 == -1000 || 3450 UCHARAT(PL_reginput) == c1 || 3451 UCHARAT(PL_reginput) == c2) 3452 { 3453 DEBUG_r( 3454 PerlIO_printf(Perl_debug_log, 3455 "%*s trying tail with n=%"IVdf"...\n", 3456 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n) 3457 ); 3458 if (paren) { 3459 if (n) { 3460 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr; 3461 PL_regendp[paren] = PL_reginput - PL_bostr; 3462 } 3463 else 3464 PL_regendp[paren] = -1; 3465 } 3466 if (regmatch(next)) 3467 sayYES; 3468 REGCP_UNWIND(lastcp); 3469 } 3470 /* Couldn't or didn't -- back up. */ 3471 n--; 3472 locinput = HOPc(locinput, -l); 3473 PL_reginput = locinput; 3474 } 3475 } 3476 sayNO; 3477 break; 3478 } 3479 case CURLYN: 3480 paren = scan->flags; /* Which paren to set */ 3481 if (paren > PL_regsize) 3482 PL_regsize = paren; 3483 if (paren > (I32)*PL_reglastparen) 3484 *PL_reglastparen = paren; 3485 ln = ARG1(scan); /* min to match */ 3486 n = ARG2(scan); /* max to match */ 3487 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE); 3488 goto repeat; 3489 case CURLY: 3490 paren = 0; 3491 ln = ARG1(scan); /* min to match */ 3492 n = ARG2(scan); /* max to match */ 3493 scan = NEXTOPER(scan) + NODE_STEP_REGNODE; 3494 goto repeat; 3495 case STAR: 3496 ln = 0; 3497 n = REG_INFTY; 3498 scan = NEXTOPER(scan); 3499 paren = 0; 3500 goto repeat; 3501 case PLUS: 3502 ln = 1; 3503 n = REG_INFTY; 3504 scan = NEXTOPER(scan); 3505 paren = 0; 3506 repeat: 3507 /* 3508 * Lookahead to avoid useless match attempts 3509 * when we know what character comes next. 3510 */ 3511 3512 /* 3513 * Used to only do .*x and .*?x, but now it allows 3514 * for )'s, ('s and (?{ ... })'s to be in the way 3515 * of the quantifier and the EXACT-like node. -- japhy 3516 */ 3517 3518 if (HAS_TEXT(next) || JUMPABLE(next)) { 3519 U8 *s; 3520 regnode *text_node = next; 3521 3522 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node); 3523 3524 if (! HAS_TEXT(text_node)) c1 = c2 = -1000; 3525 else { 3526 if (PL_regkind[(U8)OP(text_node)] == REF) { 3527 I32 n, ln; 3528 n = ARG(text_node); /* which paren pair */ 3529 ln = PL_regstartp[n]; 3530 /* assume yes if we haven't seen CLOSEn */ 3531 if ( 3532 (I32)*PL_reglastparen < n || 3533 ln == -1 || 3534 ln == PL_regendp[n] 3535 ) { 3536 c1 = c2 = -1000; 3537 goto assume_ok_easy; 3538 } 3539 s = (U8*)PL_bostr + ln; 3540 } 3541 else { s = (U8*)STRING(text_node); } 3542 3543 if (!UTF) { 3544 c2 = c1 = *s; 3545 if (OP(text_node) == EXACTF || OP(text_node) == REFF) 3546 c2 = PL_fold[c1]; 3547 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL) 3548 c2 = PL_fold_locale[c1]; 3549 } 3550 else { /* UTF */ 3551 if (OP(text_node) == EXACTF || OP(text_node) == REFF) { 3552 STRLEN ulen1, ulen2; 3553 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1]; 3554 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1]; 3555 3556 to_utf8_lower((U8*)s, tmpbuf1, &ulen1); 3557 to_utf8_upper((U8*)s, tmpbuf2, &ulen2); 3558 3559 c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXLEN, 0, 3560 ckWARN(WARN_UTF8) ? 3561 0 : UTF8_ALLOW_ANY); 3562 c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXLEN, 0, 3563 ckWARN(WARN_UTF8) ? 3564 0 : UTF8_ALLOW_ANY); 3565 } 3566 else { 3567 c2 = c1 = utf8n_to_uvchr(s, UTF8_MAXLEN, 0, 3568 ckWARN(WARN_UTF8) ? 3569 0 : UTF8_ALLOW_ANY); 3570 } 3571 } 3572 } 3573 } 3574 else 3575 c1 = c2 = -1000; 3576 assume_ok_easy: 3577 PL_reginput = locinput; 3578 if (minmod) { 3579 CHECKPOINT lastcp; 3580 minmod = 0; 3581 if (ln && regrepeat(scan, ln) < ln) 3582 sayNO; 3583 locinput = PL_reginput; 3584 REGCP_SET(lastcp); 3585 if (c1 != -1000) { 3586 char *e; /* Should not check after this */ 3587 char *old = locinput; 3588 int count = 0; 3589 3590 if (n == REG_INFTY) { 3591 e = PL_regeol - 1; 3592 if (do_utf8) 3593 while (UTF8_IS_CONTINUATION(*(U8*)e)) 3594 e--; 3595 } 3596 else if (do_utf8) { 3597 int m = n - ln; 3598 for (e = locinput; 3599 m >0 && e + UTF8SKIP(e) <= PL_regeol; m--) 3600 e += UTF8SKIP(e); 3601 } 3602 else { 3603 e = locinput + n - ln; 3604 if (e >= PL_regeol) 3605 e = PL_regeol - 1; 3606 } 3607 while (1) { 3608 /* Find place 'next' could work */ 3609 if (!do_utf8) { 3610 if (c1 == c2) { 3611 while (locinput <= e && 3612 UCHARAT(locinput) != c1) 3613 locinput++; 3614 } else { 3615 while (locinput <= e 3616 && UCHARAT(locinput) != c1 3617 && UCHARAT(locinput) != c2) 3618 locinput++; 3619 } 3620 count = locinput - old; 3621 } 3622 else { 3623 STRLEN len; 3624 if (c1 == c2) { 3625 /* count initialised to 3626 * utf8_distance(old, locinput) */ 3627 while (locinput <= e && 3628 utf8n_to_uvchr((U8*)locinput, 3629 UTF8_MAXLEN, &len, 3630 ckWARN(WARN_UTF8) ? 3631 0 : UTF8_ALLOW_ANY) != (UV)c1) { 3632 locinput += len; 3633 count++; 3634 } 3635 } else { 3636 /* count initialised to 3637 * utf8_distance(old, locinput) */ 3638 while (locinput <= e) { 3639 UV c = utf8n_to_uvchr((U8*)locinput, 3640 UTF8_MAXLEN, &len, 3641 ckWARN(WARN_UTF8) ? 3642 0 : UTF8_ALLOW_ANY); 3643 if (c == (UV)c1 || c == (UV)c2) 3644 break; 3645 locinput += len; 3646 count++; 3647 } 3648 } 3649 } 3650 if (locinput > e) 3651 sayNO; 3652 /* PL_reginput == old now */ 3653 if (locinput != old) { 3654 ln = 1; /* Did some */ 3655 if (regrepeat(scan, count) < count) 3656 sayNO; 3657 } 3658 /* PL_reginput == locinput now */ 3659 TRYPAREN(paren, ln, locinput); 3660 PL_reginput = locinput; /* Could be reset... */ 3661 REGCP_UNWIND(lastcp); 3662 /* Couldn't or didn't -- move forward. */ 3663 old = locinput; 3664 if (do_utf8) 3665 locinput += UTF8SKIP(locinput); 3666 else 3667 locinput++; 3668 count = 1; 3669 } 3670 } 3671 else 3672 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */ 3673 UV c; 3674 if (c1 != -1000) { 3675 if (do_utf8) 3676 c = utf8n_to_uvchr((U8*)PL_reginput, 3677 UTF8_MAXLEN, 0, 3678 ckWARN(WARN_UTF8) ? 3679 0 : UTF8_ALLOW_ANY); 3680 else 3681 c = UCHARAT(PL_reginput); 3682 /* If it could work, try it. */ 3683 if (c == (UV)c1 || c == (UV)c2) 3684 { 3685 TRYPAREN(paren, n, PL_reginput); 3686 REGCP_UNWIND(lastcp); 3687 } 3688 } 3689 /* If it could work, try it. */ 3690 else if (c1 == -1000) 3691 { 3692 TRYPAREN(paren, n, PL_reginput); 3693 REGCP_UNWIND(lastcp); 3694 } 3695 /* Couldn't or didn't -- move forward. */ 3696 PL_reginput = locinput; 3697 if (regrepeat(scan, 1)) { 3698 ln++; 3699 locinput = PL_reginput; 3700 } 3701 else 3702 sayNO; 3703 } 3704 } 3705 else { 3706 CHECKPOINT lastcp; 3707 n = regrepeat(scan, n); 3708 locinput = PL_reginput; 3709 if (ln < n && PL_regkind[(U8)OP(next)] == EOL && 3710 ((!PL_multiline && OP(next) != MEOL) || 3711 OP(next) == SEOL || OP(next) == EOS)) 3712 { 3713 ln = n; /* why back off? */ 3714 /* ...because $ and \Z can match before *and* after 3715 newline at the end. Consider "\n\n" =~ /\n+\Z\n/. 3716 We should back off by one in this case. */ 3717 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS) 3718 ln--; 3719 } 3720 REGCP_SET(lastcp); 3721 if (paren) { 3722 UV c = 0; 3723 while (n >= ln) { 3724 if (c1 != -1000) { 3725 if (do_utf8) 3726 c = utf8n_to_uvchr((U8*)PL_reginput, 3727 UTF8_MAXLEN, 0, 3728 ckWARN(WARN_UTF8) ? 3729 0 : UTF8_ALLOW_ANY); 3730 else 3731 c = UCHARAT(PL_reginput); 3732 } 3733 /* If it could work, try it. */ 3734 if (c1 == -1000 || c == (UV)c1 || c == (UV)c2) 3735 { 3736 TRYPAREN(paren, n, PL_reginput); 3737 REGCP_UNWIND(lastcp); 3738 } 3739 /* Couldn't or didn't -- back up. */ 3740 n--; 3741 PL_reginput = locinput = HOPc(locinput, -1); 3742 } 3743 } 3744 else { 3745 UV c = 0; 3746 while (n >= ln) { 3747 if (c1 != -1000) { 3748 if (do_utf8) 3749 c = utf8n_to_uvchr((U8*)PL_reginput, 3750 UTF8_MAXLEN, 0, 3751 ckWARN(WARN_UTF8) ? 3752 0 : UTF8_ALLOW_ANY); 3753 else 3754 c = UCHARAT(PL_reginput); 3755 } 3756 /* If it could work, try it. */ 3757 if (c1 == -1000 || c == (UV)c1 || c == (UV)c2) 3758 { 3759 TRYPAREN(paren, n, PL_reginput); 3760 REGCP_UNWIND(lastcp); 3761 } 3762 /* Couldn't or didn't -- back up. */ 3763 n--; 3764 PL_reginput = locinput = HOPc(locinput, -1); 3765 } 3766 } 3767 } 3768 sayNO; 3769 break; 3770 case END: 3771 if (PL_reg_call_cc) { 3772 re_cc_state *cur_call_cc = PL_reg_call_cc; 3773 CURCUR *cctmp = PL_regcc; 3774 regexp *re = PL_reg_re; 3775 CHECKPOINT cp, lastcp; 3776 3777 cp = regcppush(0); /* Save *all* the positions. */ 3778 REGCP_SET(lastcp); 3779 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of 3780 the caller. */ 3781 PL_reginput = locinput; /* Make position available to 3782 the callcc. */ 3783 cache_re(PL_reg_call_cc->re); 3784 PL_regcc = PL_reg_call_cc->cc; 3785 PL_reg_call_cc = PL_reg_call_cc->prev; 3786 if (regmatch(cur_call_cc->node)) { 3787 PL_reg_call_cc = cur_call_cc; 3788 regcpblow(cp); 3789 sayYES; 3790 } 3791 REGCP_UNWIND(lastcp); 3792 regcppop(); 3793 PL_reg_call_cc = cur_call_cc; 3794 PL_regcc = cctmp; 3795 PL_reg_re = re; 3796 cache_re(re); 3797 3798 DEBUG_r( 3799 PerlIO_printf(Perl_debug_log, 3800 "%*s continuation failed...\n", 3801 REPORT_CODE_OFF+PL_regindent*2, "") 3802 ); 3803 sayNO_SILENT; 3804 } 3805 if (locinput < PL_regtill) { 3806 DEBUG_r(PerlIO_printf(Perl_debug_log, 3807 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n", 3808 PL_colors[4], 3809 (long)(locinput - PL_reg_starttry), 3810 (long)(PL_regtill - PL_reg_starttry), 3811 PL_colors[5])); 3812 sayNO_FINAL; /* Cannot match: too short. */ 3813 } 3814 PL_reginput = locinput; /* put where regtry can find it */ 3815 sayYES_FINAL; /* Success! */ 3816 case SUCCEED: 3817 PL_reginput = locinput; /* put where regtry can find it */ 3818 sayYES_LOUD; /* Success! */ 3819 case SUSPEND: 3820 n = 1; 3821 PL_reginput = locinput; 3822 goto do_ifmatch; 3823 case UNLESSM: 3824 n = 0; 3825 if (scan->flags) { 3826 s = HOPBACKc(locinput, scan->flags); 3827 if (!s) 3828 goto say_yes; 3829 PL_reginput = s; 3830 } 3831 else 3832 PL_reginput = locinput; 3833 goto do_ifmatch; 3834 case IFMATCH: 3835 n = 1; 3836 if (scan->flags) { 3837 s = HOPBACKc(locinput, scan->flags); 3838 if (!s) 3839 goto say_no; 3840 PL_reginput = s; 3841 } 3842 else 3843 PL_reginput = locinput; 3844 3845 do_ifmatch: 3846 inner = NEXTOPER(NEXTOPER(scan)); 3847 if (regmatch(inner) != n) { 3848 say_no: 3849 if (logical) { 3850 logical = 0; 3851 sw = 0; 3852 goto do_longjump; 3853 } 3854 else 3855 sayNO; 3856 } 3857 say_yes: 3858 if (logical) { 3859 logical = 0; 3860 sw = 1; 3861 } 3862 if (OP(scan) == SUSPEND) { 3863 locinput = PL_reginput; 3864 nextchr = UCHARAT(locinput); 3865 } 3866 /* FALL THROUGH. */ 3867 case LONGJMP: 3868 do_longjump: 3869 next = scan + ARG(scan); 3870 if (next == scan) 3871 next = NULL; 3872 break; 3873 default: 3874 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n", 3875 PTR2UV(scan), OP(scan)); 3876 Perl_croak(aTHX_ "regexp memory corruption"); 3877 } 3878 reenter: 3879 scan = next; 3880 } 3881 3882 /* 3883 * We get here only if there's trouble -- normally "case END" is 3884 * the terminating point. 3885 */ 3886 Perl_croak(aTHX_ "corrupted regexp pointers"); 3887 /*NOTREACHED*/ 3888 sayNO; 3889 3890 yes_loud: 3891 DEBUG_r( 3892 PerlIO_printf(Perl_debug_log, 3893 "%*s %scould match...%s\n", 3894 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],PL_colors[5]) 3895 ); 3896 goto yes; 3897 yes_final: 3898 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n", 3899 PL_colors[4],PL_colors[5])); 3900 yes: 3901 #ifdef DEBUGGING 3902 PL_regindent--; 3903 #endif 3904 3905 #if 0 /* Breaks $^R */ 3906 if (unwind) 3907 regcpblow(firstcp); 3908 #endif 3909 return 1; 3910 3911 no: 3912 DEBUG_r( 3913 PerlIO_printf(Perl_debug_log, 3914 "%*s %sfailed...%s\n", 3915 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],PL_colors[5]) 3916 ); 3917 goto do_no; 3918 no_final: 3919 do_no: 3920 if (unwind) { 3921 re_unwind_t *uw = SSPTRt(unwind,re_unwind_t); 3922 3923 switch (uw->type) { 3924 case RE_UNWIND_BRANCH: 3925 case RE_UNWIND_BRANCHJ: 3926 { 3927 re_unwind_branch_t *uwb = &(uw->branch); 3928 I32 lastparen = uwb->lastparen; 3929 3930 REGCP_UNWIND(uwb->lastcp); 3931 for (n = *PL_reglastparen; n > lastparen; n--) 3932 PL_regendp[n] = -1; 3933 *PL_reglastparen = n; 3934 scan = next = uwb->next; 3935 if ( !scan || 3936 OP(scan) != (uwb->type == RE_UNWIND_BRANCH 3937 ? BRANCH : BRANCHJ) ) { /* Failure */ 3938 unwind = uwb->prev; 3939 #ifdef DEBUGGING 3940 PL_regindent--; 3941 #endif 3942 goto do_no; 3943 } 3944 /* Have more choice yet. Reuse the same uwb. */ 3945 /*SUPPRESS 560*/ 3946 if ((n = (uwb->type == RE_UNWIND_BRANCH 3947 ? NEXT_OFF(next) : ARG(next)))) 3948 next += n; 3949 else 3950 next = NULL; /* XXXX Needn't unwinding in this case... */ 3951 uwb->next = next; 3952 next = NEXTOPER(scan); 3953 if (uwb->type == RE_UNWIND_BRANCHJ) 3954 next = NEXTOPER(next); 3955 locinput = uwb->locinput; 3956 nextchr = uwb->nextchr; 3957 #ifdef DEBUGGING 3958 PL_regindent = uwb->regindent; 3959 #endif 3960 3961 goto reenter; 3962 } 3963 /* NOT REACHED */ 3964 default: 3965 Perl_croak(aTHX_ "regexp unwind memory corruption"); 3966 } 3967 /* NOT REACHED */ 3968 } 3969 #ifdef DEBUGGING 3970 PL_regindent--; 3971 #endif 3972 return 0; 3973 } 3974 3975 /* 3976 - regrepeat - repeatedly match something simple, report how many 3977 */ 3978 /* 3979 * [This routine now assumes that it will only match on things of length 1. 3980 * That was true before, but now we assume scan - reginput is the count, 3981 * rather than incrementing count on every character. [Er, except utf8.]] 3982 */ 3983 STATIC I32 3984 S_regrepeat(pTHX_ regnode *p, I32 max) 3985 { 3986 register char *scan; 3987 register I32 c; 3988 register char *loceol = PL_regeol; 3989 register I32 hardcount = 0; 3990 register bool do_utf8 = PL_reg_match_utf8; 3991 3992 scan = PL_reginput; 3993 if (max == REG_INFTY) 3994 max = I32_MAX; 3995 else if (max < loceol - scan) 3996 loceol = scan + max; 3997 switch (OP(p)) { 3998 case REG_ANY: 3999 if (do_utf8) { 4000 loceol = PL_regeol; 4001 while (scan < loceol && hardcount < max && *scan != '\n') { 4002 scan += UTF8SKIP(scan); 4003 hardcount++; 4004 } 4005 } else { 4006 while (scan < loceol && *scan != '\n') 4007 scan++; 4008 } 4009 break; 4010 case SANY: 4011 if (do_utf8) { 4012 loceol = PL_regeol; 4013 while (scan < loceol && hardcount < max) { 4014 scan += UTF8SKIP(scan); 4015 hardcount++; 4016 } 4017 } 4018 else 4019 scan = loceol; 4020 break; 4021 case CANY: 4022 scan = loceol; 4023 break; 4024 case EXACT: /* length of string is 1 */ 4025 c = (U8)*STRING(p); 4026 while (scan < loceol && UCHARAT(scan) == c) 4027 scan++; 4028 break; 4029 case EXACTF: /* length of string is 1 */ 4030 c = (U8)*STRING(p); 4031 while (scan < loceol && 4032 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c])) 4033 scan++; 4034 break; 4035 case EXACTFL: /* length of string is 1 */ 4036 PL_reg_flags |= RF_tainted; 4037 c = (U8)*STRING(p); 4038 while (scan < loceol && 4039 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c])) 4040 scan++; 4041 break; 4042 case ANYOF: 4043 if (do_utf8) { 4044 loceol = PL_regeol; 4045 while (hardcount < max && scan < loceol && 4046 reginclass(p, (U8*)scan, 0, do_utf8)) { 4047 scan += UTF8SKIP(scan); 4048 hardcount++; 4049 } 4050 } else { 4051 while (scan < loceol && REGINCLASS(p, (U8*)scan)) 4052 scan++; 4053 } 4054 break; 4055 case ALNUM: 4056 if (do_utf8) { 4057 loceol = PL_regeol; 4058 LOAD_UTF8_CHARCLASS(alnum,"a"); 4059 while (hardcount < max && scan < loceol && 4060 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) { 4061 scan += UTF8SKIP(scan); 4062 hardcount++; 4063 } 4064 } else { 4065 while (scan < loceol && isALNUM(*scan)) 4066 scan++; 4067 } 4068 break; 4069 case ALNUML: 4070 PL_reg_flags |= RF_tainted; 4071 if (do_utf8) { 4072 loceol = PL_regeol; 4073 while (hardcount < max && scan < loceol && 4074 isALNUM_LC_utf8((U8*)scan)) { 4075 scan += UTF8SKIP(scan); 4076 hardcount++; 4077 } 4078 } else { 4079 while (scan < loceol && isALNUM_LC(*scan)) 4080 scan++; 4081 } 4082 break; 4083 case NALNUM: 4084 if (do_utf8) { 4085 loceol = PL_regeol; 4086 LOAD_UTF8_CHARCLASS(alnum,"a"); 4087 while (hardcount < max && scan < loceol && 4088 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) { 4089 scan += UTF8SKIP(scan); 4090 hardcount++; 4091 } 4092 } else { 4093 while (scan < loceol && !isALNUM(*scan)) 4094 scan++; 4095 } 4096 break; 4097 case NALNUML: 4098 PL_reg_flags |= RF_tainted; 4099 if (do_utf8) { 4100 loceol = PL_regeol; 4101 while (hardcount < max && scan < loceol && 4102 !isALNUM_LC_utf8((U8*)scan)) { 4103 scan += UTF8SKIP(scan); 4104 hardcount++; 4105 } 4106 } else { 4107 while (scan < loceol && !isALNUM_LC(*scan)) 4108 scan++; 4109 } 4110 break; 4111 case SPACE: 4112 if (do_utf8) { 4113 loceol = PL_regeol; 4114 LOAD_UTF8_CHARCLASS(space," "); 4115 while (hardcount < max && scan < loceol && 4116 (*scan == ' ' || 4117 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) { 4118 scan += UTF8SKIP(scan); 4119 hardcount++; 4120 } 4121 } else { 4122 while (scan < loceol && isSPACE(*scan)) 4123 scan++; 4124 } 4125 break; 4126 case SPACEL: 4127 PL_reg_flags |= RF_tainted; 4128 if (do_utf8) { 4129 loceol = PL_regeol; 4130 while (hardcount < max && scan < loceol && 4131 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) { 4132 scan += UTF8SKIP(scan); 4133 hardcount++; 4134 } 4135 } else { 4136 while (scan < loceol && isSPACE_LC(*scan)) 4137 scan++; 4138 } 4139 break; 4140 case NSPACE: 4141 if (do_utf8) { 4142 loceol = PL_regeol; 4143 LOAD_UTF8_CHARCLASS(space," "); 4144 while (hardcount < max && scan < loceol && 4145 !(*scan == ' ' || 4146 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) { 4147 scan += UTF8SKIP(scan); 4148 hardcount++; 4149 } 4150 } else { 4151 while (scan < loceol && !isSPACE(*scan)) 4152 scan++; 4153 break; 4154 } 4155 case NSPACEL: 4156 PL_reg_flags |= RF_tainted; 4157 if (do_utf8) { 4158 loceol = PL_regeol; 4159 while (hardcount < max && scan < loceol && 4160 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) { 4161 scan += UTF8SKIP(scan); 4162 hardcount++; 4163 } 4164 } else { 4165 while (scan < loceol && !isSPACE_LC(*scan)) 4166 scan++; 4167 } 4168 break; 4169 case DIGIT: 4170 if (do_utf8) { 4171 loceol = PL_regeol; 4172 LOAD_UTF8_CHARCLASS(digit,"0"); 4173 while (hardcount < max && scan < loceol && 4174 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) { 4175 scan += UTF8SKIP(scan); 4176 hardcount++; 4177 } 4178 } else { 4179 while (scan < loceol && isDIGIT(*scan)) 4180 scan++; 4181 } 4182 break; 4183 case NDIGIT: 4184 if (do_utf8) { 4185 loceol = PL_regeol; 4186 LOAD_UTF8_CHARCLASS(digit,"0"); 4187 while (hardcount < max && scan < loceol && 4188 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) { 4189 scan += UTF8SKIP(scan); 4190 hardcount++; 4191 } 4192 } else { 4193 while (scan < loceol && !isDIGIT(*scan)) 4194 scan++; 4195 } 4196 break; 4197 default: /* Called on something of 0 width. */ 4198 break; /* So match right here or not at all. */ 4199 } 4200 4201 if (hardcount) 4202 c = hardcount; 4203 else 4204 c = scan - PL_reginput; 4205 PL_reginput = scan; 4206 4207 DEBUG_r( 4208 { 4209 SV *prop = sv_newmortal(); 4210 4211 regprop(prop, p); 4212 PerlIO_printf(Perl_debug_log, 4213 "%*s %s can match %"IVdf" times out of %"IVdf"...\n", 4214 REPORT_CODE_OFF+1, "", SvPVX(prop),(IV)c,(IV)max); 4215 }); 4216 4217 return(c); 4218 } 4219 4220 /* 4221 - regrepeat_hard - repeatedly match something, report total lenth and length 4222 * 4223 * The repeater is supposed to have constant length. 4224 */ 4225 4226 STATIC I32 4227 S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp) 4228 { 4229 register char *scan = Nullch; 4230 register char *start; 4231 register char *loceol = PL_regeol; 4232 I32 l = 0; 4233 I32 count = 0, res = 1; 4234 4235 if (!max) 4236 return 0; 4237 4238 start = PL_reginput; 4239 if (PL_reg_match_utf8) { 4240 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) { 4241 if (!count++) { 4242 l = 0; 4243 while (start < PL_reginput) { 4244 l++; 4245 start += UTF8SKIP(start); 4246 } 4247 *lp = l; 4248 if (l == 0) 4249 return max; 4250 } 4251 if (count == max) 4252 return count; 4253 } 4254 } 4255 else { 4256 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) { 4257 if (!count++) { 4258 *lp = l = PL_reginput - start; 4259 if (max != REG_INFTY && l*max < loceol - scan) 4260 loceol = scan + l*max; 4261 if (l == 0) 4262 return max; 4263 } 4264 } 4265 } 4266 if (!res) 4267 PL_reginput = scan; 4268 4269 return count; 4270 } 4271 4272 /* 4273 - regclass_swash - prepare the utf8 swash 4274 */ 4275 4276 SV * 4277 Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** listsvp, SV **altsvp) 4278 { 4279 SV *sw = NULL; 4280 SV *si = NULL; 4281 SV *alt = NULL; 4282 4283 if (PL_regdata && PL_regdata->count) { 4284 U32 n = ARG(node); 4285 4286 if (PL_regdata->what[n] == 's') { 4287 SV *rv = (SV*)PL_regdata->data[n]; 4288 AV *av = (AV*)SvRV((SV*)rv); 4289 SV **ary = AvARRAY(av); 4290 SV **a, **b; 4291 4292 /* See the end of regcomp.c:S_reglass() for 4293 * documentation of these array elements. */ 4294 4295 si = *ary; 4296 a = SvTYPE(ary[1]) == SVt_RV ? &ary[1] : 0; 4297 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0; 4298 4299 if (a) 4300 sw = *a; 4301 else if (si && doinit) { 4302 sw = swash_init("utf8", "", si, 1, 0); 4303 (void)av_store(av, 1, sw); 4304 } 4305 if (b) 4306 alt = *b; 4307 } 4308 } 4309 4310 if (listsvp) 4311 *listsvp = si; 4312 if (altsvp) 4313 *altsvp = alt; 4314 4315 return sw; 4316 } 4317 4318 /* 4319 - reginclass - determine if a character falls into a character class 4320 4321 The n is the ANYOF regnode, the p is the target string, lenp 4322 is pointer to the maximum length of how far to go in the p 4323 (if the lenp is zero, UTF8SKIP(p) is used), 4324 do_utf8 tells whether the target string is in UTF-8. 4325 4326 */ 4327 4328 STATIC bool 4329 S_reginclass(pTHX_ register regnode *n, register U8* p, STRLEN* lenp, register bool do_utf8) 4330 { 4331 char flags = ANYOF_FLAGS(n); 4332 bool match = FALSE; 4333 UV c = *p; 4334 STRLEN len = 0; 4335 STRLEN plen; 4336 4337 if (do_utf8 && !UTF8_IS_INVARIANT(c)) 4338 c = utf8n_to_uvchr(p, UTF8_MAXLEN, &len, 4339 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); 4340 4341 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c)); 4342 if (do_utf8 || (flags & ANYOF_UNICODE)) { 4343 if (lenp) 4344 *lenp = 0; 4345 if (do_utf8 && !ANYOF_RUNTIME(n)) { 4346 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c)) 4347 match = TRUE; 4348 } 4349 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256) 4350 match = TRUE; 4351 if (!match) { 4352 AV *av; 4353 SV *sw = regclass_swash(n, TRUE, 0, (SV**)&av); 4354 4355 if (sw) { 4356 if (swash_fetch(sw, p, do_utf8)) 4357 match = TRUE; 4358 else if (flags & ANYOF_FOLD) { 4359 if (!match && lenp && av) { 4360 I32 i; 4361 4362 for (i = 0; i <= av_len(av); i++) { 4363 SV* sv = *av_fetch(av, i, FALSE); 4364 STRLEN len; 4365 char *s = SvPV(sv, len); 4366 4367 if (len <= plen && memEQ(s, (char*)p, len)) { 4368 *lenp = len; 4369 match = TRUE; 4370 break; 4371 } 4372 } 4373 } 4374 if (!match) { 4375 U8 tmpbuf[UTF8_MAXLEN_FOLD+1]; 4376 STRLEN tmplen; 4377 4378 to_utf8_fold(p, tmpbuf, &tmplen); 4379 if (swash_fetch(sw, tmpbuf, do_utf8)) 4380 match = TRUE; 4381 } 4382 } 4383 } 4384 } 4385 if (match && lenp && *lenp == 0) 4386 *lenp = UNISKIP(NATIVE_TO_UNI(c)); 4387 } 4388 if (!match && c < 256) { 4389 if (ANYOF_BITMAP_TEST(n, c)) 4390 match = TRUE; 4391 else if (flags & ANYOF_FOLD) { 4392 U8 f; 4393 4394 if (flags & ANYOF_LOCALE) { 4395 PL_reg_flags |= RF_tainted; 4396 f = PL_fold_locale[c]; 4397 } 4398 else 4399 f = PL_fold[c]; 4400 if (f != c && ANYOF_BITMAP_TEST(n, f)) 4401 match = TRUE; 4402 } 4403 4404 if (!match && (flags & ANYOF_CLASS)) { 4405 PL_reg_flags |= RF_tainted; 4406 if ( 4407 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) || 4408 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) || 4409 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) || 4410 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) || 4411 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) || 4412 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) || 4413 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) || 4414 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) || 4415 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) || 4416 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) || 4417 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) || 4418 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) || 4419 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) || 4420 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) || 4421 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) || 4422 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) || 4423 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) || 4424 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) || 4425 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) || 4426 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) || 4427 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) || 4428 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) || 4429 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) || 4430 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) || 4431 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) || 4432 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) || 4433 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) || 4434 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) || 4435 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) || 4436 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c)) 4437 ) /* How's that for a conditional? */ 4438 { 4439 match = TRUE; 4440 } 4441 } 4442 } 4443 4444 return (flags & ANYOF_INVERT) ? !match : match; 4445 } 4446 4447 STATIC U8 * 4448 S_reghop(pTHX_ U8 *s, I32 off) 4449 { 4450 return S_reghop3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)); 4451 } 4452 4453 STATIC U8 * 4454 S_reghop3(pTHX_ U8 *s, I32 off, U8* lim) 4455 { 4456 if (off >= 0) { 4457 while (off-- && s < lim) { 4458 /* XXX could check well-formedness here */ 4459 s += UTF8SKIP(s); 4460 } 4461 } 4462 else { 4463 while (off++) { 4464 if (s > lim) { 4465 s--; 4466 if (UTF8_IS_CONTINUED(*s)) { 4467 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s)) 4468 s--; 4469 } 4470 /* XXX could check well-formedness here */ 4471 } 4472 } 4473 } 4474 return s; 4475 } 4476 4477 STATIC U8 * 4478 S_reghopmaybe(pTHX_ U8 *s, I32 off) 4479 { 4480 return S_reghopmaybe3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)); 4481 } 4482 4483 STATIC U8 * 4484 S_reghopmaybe3(pTHX_ U8* s, I32 off, U8* lim) 4485 { 4486 if (off >= 0) { 4487 while (off-- && s < lim) { 4488 /* XXX could check well-formedness here */ 4489 s += UTF8SKIP(s); 4490 } 4491 if (off >= 0) 4492 return 0; 4493 } 4494 else { 4495 while (off++) { 4496 if (s > lim) { 4497 s--; 4498 if (UTF8_IS_CONTINUED(*s)) { 4499 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s)) 4500 s--; 4501 } 4502 /* XXX could check well-formedness here */ 4503 } 4504 else 4505 break; 4506 } 4507 if (off <= 0) 4508 return 0; 4509 } 4510 return s; 4511 } 4512 4513 static void 4514 restore_pos(pTHX_ void *arg) 4515 { 4516 if (PL_reg_eval_set) { 4517 if (PL_reg_oldsaved) { 4518 PL_reg_re->subbeg = PL_reg_oldsaved; 4519 PL_reg_re->sublen = PL_reg_oldsavedlen; 4520 RX_MATCH_COPIED_on(PL_reg_re); 4521 } 4522 PL_reg_magic->mg_len = PL_reg_oldpos; 4523 PL_reg_eval_set = 0; 4524 PL_curpm = PL_reg_oldcurpm; 4525 } 4526 } 4527 4528 STATIC void 4529 S_to_utf8_substr(pTHX_ register regexp *prog) 4530 { 4531 SV* sv; 4532 if (prog->float_substr && !prog->float_utf8) { 4533 prog->float_utf8 = sv = NEWSV(117, 0); 4534 SvSetSV(sv, prog->float_substr); 4535 sv_utf8_upgrade(sv); 4536 if (SvTAIL(prog->float_substr)) 4537 SvTAIL_on(sv); 4538 if (prog->float_substr == prog->check_substr) 4539 prog->check_utf8 = sv; 4540 } 4541 if (prog->anchored_substr && !prog->anchored_utf8) { 4542 prog->anchored_utf8 = sv = NEWSV(118, 0); 4543 SvSetSV(sv, prog->anchored_substr); 4544 sv_utf8_upgrade(sv); 4545 if (SvTAIL(prog->anchored_substr)) 4546 SvTAIL_on(sv); 4547 if (prog->anchored_substr == prog->check_substr) 4548 prog->check_utf8 = sv; 4549 } 4550 } 4551 4552 STATIC void 4553 S_to_byte_substr(pTHX_ register regexp *prog) 4554 { 4555 SV* sv; 4556 if (prog->float_utf8 && !prog->float_substr) { 4557 prog->float_substr = sv = NEWSV(117, 0); 4558 SvSetSV(sv, prog->float_utf8); 4559 if (sv_utf8_downgrade(sv, TRUE)) { 4560 if (SvTAIL(prog->float_utf8)) 4561 SvTAIL_on(sv); 4562 } else { 4563 SvREFCNT_dec(sv); 4564 prog->float_substr = sv = &PL_sv_undef; 4565 } 4566 if (prog->float_utf8 == prog->check_utf8) 4567 prog->check_substr = sv; 4568 } 4569 if (prog->anchored_utf8 && !prog->anchored_substr) { 4570 prog->anchored_substr = sv = NEWSV(118, 0); 4571 SvSetSV(sv, prog->anchored_utf8); 4572 if (sv_utf8_downgrade(sv, TRUE)) { 4573 if (SvTAIL(prog->anchored_utf8)) 4574 SvTAIL_on(sv); 4575 } else { 4576 SvREFCNT_dec(sv); 4577 prog->anchored_substr = sv = &PL_sv_undef; 4578 } 4579 if (prog->anchored_utf8 == prog->check_utf8) 4580 prog->check_substr = sv; 4581 } 4582 } 4583