1 /* regexec.c
2 */
3
4 /*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12 /* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
17 /* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20 */
21
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 # ifndef PERL_IN_XSUB_RE
25 # define PERL_IN_XSUB_RE
26 # endif
27 /* need access to debugger hooks */
28 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
29 # define DEBUGGING
30 # endif
31 #endif
32
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 # define Perl_regexec_flags my_regexec
36 # define Perl_regdump my_regdump
37 # define Perl_regprop my_regprop
38 # define Perl_re_intuit_start my_re_intuit_start
39 /* *These* symbols are masked to allow static link. */
40 # define Perl_pregexec my_pregexec
41 # define Perl_reginitcolors my_reginitcolors
42 # define Perl_regclass_swash my_regclass_swash
43
44 # define PERL_NO_GET_CONTEXT
45 #endif
46
47 /*SUPPRESS 112*/
48 /*
49 * pregcomp and pregexec -- regsub and regerror are not used in perl
50 *
51 * Copyright (c) 1986 by University of Toronto.
52 * Written by Henry Spencer. Not derived from licensed software.
53 *
54 * Permission is granted to anyone to use this software for any
55 * purpose on any computer system, and to redistribute it freely,
56 * subject to the following restrictions:
57 *
58 * 1. The author is not responsible for the consequences of use of
59 * this software, no matter how awful, even if they arise
60 * from defects in it.
61 *
62 * 2. The origin of this software must not be misrepresented, either
63 * by explicit claim or by omission.
64 *
65 * 3. Altered versions must be plainly marked as such, and must not
66 * be misrepresented as being the original software.
67 *
68 **** Alterations to Henry's code are...
69 ****
70 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
71 **** 2000, 2001, 2002, 2003, 2004, by Larry Wall and others
72 ****
73 **** You may distribute under the terms of either the GNU General Public
74 **** License or the Artistic License, as specified in the README file.
75 *
76 * Beware that some of this code is subtly aware of the way operator
77 * precedence is structured in regular expressions. Serious changes in
78 * regular-expression syntax might require a total rethink.
79 */
80 #include "EXTERN.h"
81 #define PERL_IN_REGEXEC_C
82 #include "perl.h"
83
84 #include "regcomp.h"
85
86 #define RF_tainted 1 /* tainted information used? */
87 #define RF_warned 2 /* warned about big count? */
88 #define RF_evaled 4 /* Did an EVAL with setting? */
89 #define RF_utf8 8 /* String contains multibyte chars? */
90 #define RF_false 16 /* odd number of nested negatives */
91
92 #define UTF ((PL_reg_flags & RF_utf8) != 0)
93
94 #define RS_init 1 /* eval environment created */
95 #define RS_set 2 /* replsv value is set */
96
97 #ifndef STATIC
98 #define STATIC static
99 #endif
100
101 #define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
102
103 /*
104 * Forwards.
105 */
106
107 #define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
108 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
109
110 #define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
111 #define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
112 #define HOP(pos,off) (PL_reg_match_utf8 ? reghop((U8*)pos, off) : (U8*)(pos + off))
113 #define HOPMAYBE(pos,off) (PL_reg_match_utf8 ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
114 #define HOPc(pos,off) ((char*)HOP(pos,off))
115 #define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
116
117 #define HOPBACK(pos, off) ( \
118 (PL_reg_match_utf8) \
119 ? reghopmaybe((U8*)pos, -off) \
120 : (pos - off >= PL_bostr) \
121 ? (U8*)(pos - off) \
122 : (U8*)NULL \
123 )
124 #define HOPBACKc(pos, off) (char*)HOPBACK(pos, off)
125
126 #define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim))
127 #define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
128 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
129 #define HOPMAYBE3(pos,off,lim) (PL_reg_match_utf8 ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
130 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
131 #define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
132
133 #define LOAD_UTF8_CHARCLASS(a,b) STMT_START { if (!CAT2(PL_utf8_,a)) { ENTER; save_re_context(); (void)CAT2(is_utf8_, a)((U8*)b); LEAVE; } } STMT_END
134
135 /* for use after a quantifier and before an EXACT-like node -- japhy */
136 #define JUMPABLE(rn) ( \
137 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
138 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
139 OP(rn) == PLUS || OP(rn) == MINMOD || \
140 (PL_regkind[(U8)OP(rn)] == CURLY && ARG1(rn) > 0) \
141 )
142
143 #define HAS_TEXT(rn) ( \
144 PL_regkind[(U8)OP(rn)] == EXACT || PL_regkind[(U8)OP(rn)] == REF \
145 )
146
147 /*
148 Search for mandatory following text node; for lookahead, the text must
149 follow but for lookbehind (rn->flags != 0) we skip to the next step.
150 */
151 #define FIND_NEXT_IMPT(rn) STMT_START { \
152 while (JUMPABLE(rn)) \
153 if (OP(rn) == SUSPEND || PL_regkind[(U8)OP(rn)] == CURLY) \
154 rn = NEXTOPER(NEXTOPER(rn)); \
155 else if (OP(rn) == PLUS) \
156 rn = NEXTOPER(rn); \
157 else if (OP(rn) == IFMATCH) \
158 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
159 else rn += NEXT_OFF(rn); \
160 } STMT_END
161
162 static void restore_pos(pTHX_ void *arg);
163
164 STATIC CHECKPOINT
S_regcppush(pTHX_ I32 parenfloor)165 S_regcppush(pTHX_ I32 parenfloor)
166 {
167 int retval = PL_savestack_ix;
168 #define REGCP_PAREN_ELEMS 4
169 int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
170 int p;
171
172 if (paren_elems_to_push < 0)
173 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
174
175 #define REGCP_OTHER_ELEMS 6
176 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
177 for (p = PL_regsize; p > parenfloor; p--) {
178 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
179 SSPUSHINT(PL_regendp[p]);
180 SSPUSHINT(PL_regstartp[p]);
181 SSPUSHPTR(PL_reg_start_tmp[p]);
182 SSPUSHINT(p);
183 }
184 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
185 SSPUSHINT(PL_regsize);
186 SSPUSHINT(*PL_reglastparen);
187 SSPUSHINT(*PL_reglastcloseparen);
188 SSPUSHPTR(PL_reginput);
189 #define REGCP_FRAME_ELEMS 2
190 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
191 * are needed for the regexp context stack bookkeeping. */
192 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
193 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
194
195 return retval;
196 }
197
198 /* These are needed since we do not localize EVAL nodes: */
199 # define REGCP_SET(cp) DEBUG_r(PerlIO_printf(Perl_debug_log, \
200 " Setting an EVAL scope, savestack=%"IVdf"\n", \
201 (IV)PL_savestack_ix)); cp = PL_savestack_ix
202
203 # define REGCP_UNWIND(cp) DEBUG_r(cp != PL_savestack_ix ? \
204 PerlIO_printf(Perl_debug_log, \
205 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
206 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
207
208 STATIC char *
S_regcppop(pTHX)209 S_regcppop(pTHX)
210 {
211 I32 i;
212 U32 paren = 0;
213 char *input;
214 I32 tmps;
215
216 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
217 i = SSPOPINT;
218 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
219 i = SSPOPINT; /* Parentheses elements to pop. */
220 input = (char *) SSPOPPTR;
221 *PL_reglastcloseparen = SSPOPINT;
222 *PL_reglastparen = SSPOPINT;
223 PL_regsize = SSPOPINT;
224
225 /* Now restore the parentheses context. */
226 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
227 i > 0; i -= REGCP_PAREN_ELEMS) {
228 paren = (U32)SSPOPINT;
229 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
230 PL_regstartp[paren] = SSPOPINT;
231 tmps = SSPOPINT;
232 if (paren <= *PL_reglastparen)
233 PL_regendp[paren] = tmps;
234 DEBUG_r(
235 PerlIO_printf(Perl_debug_log,
236 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
237 (UV)paren, (IV)PL_regstartp[paren],
238 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
239 (IV)PL_regendp[paren],
240 (paren > *PL_reglastparen ? "(no)" : ""));
241 );
242 }
243 DEBUG_r(
244 if ((I32)(*PL_reglastparen + 1) <= PL_regnpar) {
245 PerlIO_printf(Perl_debug_log,
246 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
247 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
248 }
249 );
250 #if 1
251 /* It would seem that the similar code in regtry()
252 * already takes care of this, and in fact it is in
253 * a better location to since this code can #if 0-ed out
254 * but the code in regtry() is needed or otherwise tests
255 * requiring null fields (pat.t#187 and split.t#{13,14}
256 * (as of patchlevel 7877) will fail. Then again,
257 * this code seems to be necessary or otherwise
258 * building DynaLoader will fail:
259 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
260 * --jhi */
261 for (paren = *PL_reglastparen + 1; (I32)paren <= PL_regnpar; paren++) {
262 if ((I32)paren > PL_regsize)
263 PL_regstartp[paren] = -1;
264 PL_regendp[paren] = -1;
265 }
266 #endif
267 return input;
268 }
269
270 STATIC char *
S_regcp_set_to(pTHX_ I32 ss)271 S_regcp_set_to(pTHX_ I32 ss)
272 {
273 I32 tmp = PL_savestack_ix;
274
275 PL_savestack_ix = ss;
276 regcppop();
277 PL_savestack_ix = tmp;
278 return Nullch;
279 }
280
281 typedef struct re_cc_state
282 {
283 I32 ss;
284 regnode *node;
285 struct re_cc_state *prev;
286 CURCUR *cc;
287 regexp *re;
288 } re_cc_state;
289
290 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
291
292 #define TRYPAREN(paren, n, input) { \
293 if (paren) { \
294 if (n) { \
295 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
296 PL_regendp[paren] = input - PL_bostr; \
297 } \
298 else \
299 PL_regendp[paren] = -1; \
300 } \
301 if (regmatch(next)) \
302 sayYES; \
303 if (paren && n) \
304 PL_regendp[paren] = -1; \
305 }
306
307
308 /*
309 * pregexec and friends
310 */
311
312 /*
313 - pregexec - match a regexp against a string
314 */
315 I32
Perl_pregexec(pTHX_ register regexp * prog,char * stringarg,register char * strend,char * strbeg,I32 minend,SV * screamer,U32 nosave)316 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
317 char *strbeg, I32 minend, SV *screamer, U32 nosave)
318 /* strend: pointer to null at end of string */
319 /* strbeg: real beginning of string */
320 /* minend: end of match must be >=minend after stringarg. */
321 /* nosave: For optimizations. */
322 {
323 return
324 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
325 nosave ? 0 : REXEC_COPY_STR);
326 }
327
328 STATIC void
S_cache_re(pTHX_ regexp * prog)329 S_cache_re(pTHX_ regexp *prog)
330 {
331 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
332 #ifdef DEBUGGING
333 PL_regprogram = prog->program;
334 #endif
335 PL_regnpar = prog->nparens;
336 PL_regdata = prog->data;
337 PL_reg_re = prog;
338 }
339
340 /*
341 * Need to implement the following flags for reg_anch:
342 *
343 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
344 * USE_INTUIT_ML
345 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
346 * INTUIT_AUTORITATIVE_ML
347 * INTUIT_ONCE_NOML - Intuit can match in one location only.
348 * INTUIT_ONCE_ML
349 *
350 * Another flag for this function: SECOND_TIME (so that float substrs
351 * with giant delta may be not rechecked).
352 */
353
354 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
355
356 /* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend.
357 Otherwise, only SvCUR(sv) is used to get strbeg. */
358
359 /* XXXX We assume that strpos is strbeg unless sv. */
360
361 /* XXXX Some places assume that there is a fixed substring.
362 An update may be needed if optimizer marks as "INTUITable"
363 RExen without fixed substrings. Similarly, it is assumed that
364 lengths of all the strings are no more than minlen, thus they
365 cannot come from lookahead.
366 (Or minlen should take into account lookahead.) */
367
368 /* A failure to find a constant substring means that there is no need to make
369 an expensive call to REx engine, thus we celebrate a failure. Similarly,
370 finding a substring too deep into the string means that less calls to
371 regtry() should be needed.
372
373 REx compiler's optimizer found 4 possible hints:
374 a) Anchored substring;
375 b) Fixed substring;
376 c) Whether we are anchored (beginning-of-line or \G);
377 d) First node (of those at offset 0) which may distingush positions;
378 We use a)b)d) and multiline-part of c), and try to find a position in the
379 string which does not contradict any of them.
380 */
381
382 /* Most of decisions we do here should have been done at compile time.
383 The nodes of the REx which we used for the search should have been
384 deleted from the finite automaton. */
385
386 char *
Perl_re_intuit_start(pTHX_ regexp * prog,SV * sv,char * strpos,char * strend,U32 flags,re_scream_pos_data * data)387 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
388 char *strend, U32 flags, re_scream_pos_data *data)
389 {
390 register I32 start_shift = 0;
391 /* Should be nonnegative! */
392 register I32 end_shift = 0;
393 register char *s;
394 register SV *check;
395 char *strbeg;
396 char *t;
397 int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
398 I32 ml_anch;
399 register char *other_last = Nullch; /* other substr checked before this */
400 char *check_at = Nullch; /* check substr found at this pos */
401 #ifdef DEBUGGING
402 char *i_strpos = strpos;
403 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
404 #endif
405 RX_MATCH_UTF8_set(prog,do_utf8);
406
407 if (prog->reganch & ROPT_UTF8) {
408 DEBUG_r(PerlIO_printf(Perl_debug_log,
409 "UTF-8 regex...\n"));
410 PL_reg_flags |= RF_utf8;
411 }
412
413 DEBUG_r({
414 char *s = PL_reg_match_utf8 ?
415 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
416 strpos;
417 int len = PL_reg_match_utf8 ?
418 strlen(s) : strend - strpos;
419 if (!PL_colorset)
420 reginitcolors();
421 if (PL_reg_match_utf8)
422 DEBUG_r(PerlIO_printf(Perl_debug_log,
423 "UTF-8 target...\n"));
424 PerlIO_printf(Perl_debug_log,
425 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
426 PL_colors[4],PL_colors[5],PL_colors[0],
427 prog->precomp,
428 PL_colors[1],
429 (strlen(prog->precomp) > 60 ? "..." : ""),
430 PL_colors[0],
431 (int)(len > 60 ? 60 : len),
432 s, PL_colors[1],
433 (len > 60 ? "..." : "")
434 );
435 });
436
437 /* CHR_DIST() would be more correct here but it makes things slow. */
438 if (prog->minlen > strend - strpos) {
439 DEBUG_r(PerlIO_printf(Perl_debug_log,
440 "String too short... [re_intuit_start]\n"));
441 goto fail;
442 }
443 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
444 PL_regeol = strend;
445 if (do_utf8) {
446 if (!prog->check_utf8 && prog->check_substr)
447 to_utf8_substr(prog);
448 check = prog->check_utf8;
449 } else {
450 if (!prog->check_substr && prog->check_utf8)
451 to_byte_substr(prog);
452 check = prog->check_substr;
453 }
454 if (check == &PL_sv_undef) {
455 DEBUG_r(PerlIO_printf(Perl_debug_log,
456 "Non-utf string cannot match utf check string\n"));
457 goto fail;
458 }
459 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
460 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
461 || ( (prog->reganch & ROPT_ANCH_BOL)
462 && !PL_multiline ) ); /* Check after \n? */
463
464 if (!ml_anch) {
465 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
466 | ROPT_IMPLICIT)) /* not a real BOL */
467 /* SvCUR is not set on references: SvRV and SvPVX overlap */
468 && sv && !SvROK(sv)
469 && (strpos != strbeg)) {
470 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
471 goto fail;
472 }
473 if (prog->check_offset_min == prog->check_offset_max &&
474 !(prog->reganch & ROPT_CANY_SEEN)) {
475 /* Substring at constant offset from beg-of-str... */
476 I32 slen;
477
478 s = HOP3c(strpos, prog->check_offset_min, strend);
479 if (SvTAIL(check)) {
480 slen = SvCUR(check); /* >= 1 */
481
482 if ( strend - s > slen || strend - s < slen - 1
483 || (strend - s == slen && strend[-1] != '\n')) {
484 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
485 goto fail_finish;
486 }
487 /* Now should match s[0..slen-2] */
488 slen--;
489 if (slen && (*SvPVX(check) != *s
490 || (slen > 1
491 && memNE(SvPVX(check), s, slen)))) {
492 report_neq:
493 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
494 goto fail_finish;
495 }
496 }
497 else if (*SvPVX(check) != *s
498 || ((slen = SvCUR(check)) > 1
499 && memNE(SvPVX(check), s, slen)))
500 goto report_neq;
501 goto success_at_start;
502 }
503 }
504 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
505 s = strpos;
506 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
507 end_shift = prog->minlen - start_shift -
508 CHR_SVLEN(check) + (SvTAIL(check) != 0);
509 if (!ml_anch) {
510 I32 end = prog->check_offset_max + CHR_SVLEN(check)
511 - (SvTAIL(check) != 0);
512 I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
513
514 if (end_shift < eshift)
515 end_shift = eshift;
516 }
517 }
518 else { /* Can match at random position */
519 ml_anch = 0;
520 s = strpos;
521 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
522 /* Should be nonnegative! */
523 end_shift = prog->minlen - start_shift -
524 CHR_SVLEN(check) + (SvTAIL(check) != 0);
525 }
526
527 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
528 if (end_shift < 0)
529 Perl_croak(aTHX_ "panic: end_shift");
530 #endif
531
532 restart:
533 /* Find a possible match in the region s..strend by looking for
534 the "check" substring in the region corrected by start/end_shift. */
535 if (flags & REXEC_SCREAM) {
536 I32 p = -1; /* Internal iterator of scream. */
537 I32 *pp = data ? data->scream_pos : &p;
538
539 if (PL_screamfirst[BmRARE(check)] >= 0
540 || ( BmRARE(check) == '\n'
541 && (BmPREVIOUS(check) == SvCUR(check) - 1)
542 && SvTAIL(check) ))
543 s = screaminstr(sv, check,
544 start_shift + (s - strbeg), end_shift, pp, 0);
545 else
546 goto fail_finish;
547 /* we may be pointing at the wrong string */
548 if (s && RX_MATCH_COPIED(prog))
549 s = strbeg + (s - SvPVX(sv));
550 if (data)
551 *data->scream_olds = s;
552 }
553 else if (prog->reganch & ROPT_CANY_SEEN)
554 s = fbm_instr((U8*)(s + start_shift),
555 (U8*)(strend - end_shift),
556 check, PL_multiline ? FBMrf_MULTILINE : 0);
557 else
558 s = fbm_instr(HOP3(s, start_shift, strend),
559 HOP3(strend, -end_shift, strbeg),
560 check, PL_multiline ? FBMrf_MULTILINE : 0);
561
562 /* Update the count-of-usability, remove useless subpatterns,
563 unshift s. */
564
565 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
566 (s ? "Found" : "Did not find"),
567 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
568 PL_colors[0],
569 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
570 SvPVX(check),
571 PL_colors[1], (SvTAIL(check) ? "$" : ""),
572 (s ? " at offset " : "...\n") ) );
573
574 if (!s)
575 goto fail_finish;
576
577 check_at = s;
578
579 /* Finish the diagnostic message */
580 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
581
582 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
583 Start with the other substr.
584 XXXX no SCREAM optimization yet - and a very coarse implementation
585 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will
586 *always* match. Probably should be marked during compile...
587 Probably it is right to do no SCREAM here...
588 */
589
590 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
591 /* Take into account the "other" substring. */
592 /* XXXX May be hopelessly wrong for UTF... */
593 if (!other_last)
594 other_last = strpos;
595 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
596 do_other_anchored:
597 {
598 char *last = HOP3c(s, -start_shift, strbeg), *last1, *last2;
599 char *s1 = s;
600 SV* must;
601
602 t = s - prog->check_offset_max;
603 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
604 && (!do_utf8
605 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
606 && t > strpos)))
607 /* EMPTY */;
608 else
609 t = strpos;
610 t = HOP3c(t, prog->anchored_offset, strend);
611 if (t < other_last) /* These positions already checked */
612 t = other_last;
613 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
614 if (last < last1)
615 last1 = last;
616 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
617 /* On end-of-str: see comment below. */
618 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
619 if (must == &PL_sv_undef) {
620 s = (char*)NULL;
621 DEBUG_r(must = prog->anchored_utf8); /* for debug */
622 }
623 else
624 s = fbm_instr(
625 (unsigned char*)t,
626 HOP3(HOP3(last1, prog->anchored_offset, strend)
627 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
628 must,
629 PL_multiline ? FBMrf_MULTILINE : 0
630 );
631 DEBUG_r(PerlIO_printf(Perl_debug_log,
632 "%s anchored substr `%s%.*s%s'%s",
633 (s ? "Found" : "Contradicts"),
634 PL_colors[0],
635 (int)(SvCUR(must)
636 - (SvTAIL(must)!=0)),
637 SvPVX(must),
638 PL_colors[1], (SvTAIL(must) ? "$" : "")));
639 if (!s) {
640 if (last1 >= last2) {
641 DEBUG_r(PerlIO_printf(Perl_debug_log,
642 ", giving up...\n"));
643 goto fail_finish;
644 }
645 DEBUG_r(PerlIO_printf(Perl_debug_log,
646 ", trying floating at offset %ld...\n",
647 (long)(HOP3c(s1, 1, strend) - i_strpos)));
648 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
649 s = HOP3c(last, 1, strend);
650 goto restart;
651 }
652 else {
653 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
654 (long)(s - i_strpos)));
655 t = HOP3c(s, -prog->anchored_offset, strbeg);
656 other_last = HOP3c(s, 1, strend);
657 s = s1;
658 if (t == strpos)
659 goto try_at_start;
660 goto try_at_offset;
661 }
662 }
663 }
664 else { /* Take into account the floating substring. */
665 char *last, *last1;
666 char *s1 = s;
667 SV* must;
668
669 t = HOP3c(s, -start_shift, strbeg);
670 last1 = last =
671 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
672 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
673 last = HOP3c(t, prog->float_max_offset, strend);
674 s = HOP3c(t, prog->float_min_offset, strend);
675 if (s < other_last)
676 s = other_last;
677 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
678 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
679 /* fbm_instr() takes into account exact value of end-of-str
680 if the check is SvTAIL(ed). Since false positives are OK,
681 and end-of-str is not later than strend we are OK. */
682 if (must == &PL_sv_undef) {
683 s = (char*)NULL;
684 DEBUG_r(must = prog->float_utf8); /* for debug message */
685 }
686 else
687 s = fbm_instr((unsigned char*)s,
688 (unsigned char*)last + SvCUR(must)
689 - (SvTAIL(must)!=0),
690 must, PL_multiline ? FBMrf_MULTILINE : 0);
691 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
692 (s ? "Found" : "Contradicts"),
693 PL_colors[0],
694 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
695 SvPVX(must),
696 PL_colors[1], (SvTAIL(must) ? "$" : "")));
697 if (!s) {
698 if (last1 == last) {
699 DEBUG_r(PerlIO_printf(Perl_debug_log,
700 ", giving up...\n"));
701 goto fail_finish;
702 }
703 DEBUG_r(PerlIO_printf(Perl_debug_log,
704 ", trying anchored starting at offset %ld...\n",
705 (long)(s1 + 1 - i_strpos)));
706 other_last = last;
707 s = HOP3c(t, 1, strend);
708 goto restart;
709 }
710 else {
711 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
712 (long)(s - i_strpos)));
713 other_last = s; /* Fix this later. --Hugo */
714 s = s1;
715 if (t == strpos)
716 goto try_at_start;
717 goto try_at_offset;
718 }
719 }
720 }
721
722 t = s - prog->check_offset_max;
723 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
724 && (!do_utf8
725 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
726 && t > strpos))) {
727 /* Fixed substring is found far enough so that the match
728 cannot start at strpos. */
729 try_at_offset:
730 if (ml_anch && t[-1] != '\n') {
731 /* Eventually fbm_*() should handle this, but often
732 anchored_offset is not 0, so this check will not be wasted. */
733 /* XXXX In the code below we prefer to look for "^" even in
734 presence of anchored substrings. And we search even
735 beyond the found float position. These pessimizations
736 are historical artefacts only. */
737 find_anchor:
738 while (t < strend - prog->minlen) {
739 if (*t == '\n') {
740 if (t < check_at - prog->check_offset_min) {
741 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
742 /* Since we moved from the found position,
743 we definitely contradict the found anchored
744 substr. Due to the above check we do not
745 contradict "check" substr.
746 Thus we can arrive here only if check substr
747 is float. Redo checking for "other"=="fixed".
748 */
749 strpos = t + 1;
750 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
751 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
752 goto do_other_anchored;
753 }
754 /* We don't contradict the found floating substring. */
755 /* XXXX Why not check for STCLASS? */
756 s = t + 1;
757 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
758 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
759 goto set_useful;
760 }
761 /* Position contradicts check-string */
762 /* XXXX probably better to look for check-string
763 than for "\n", so one should lower the limit for t? */
764 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
765 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
766 other_last = strpos = s = t + 1;
767 goto restart;
768 }
769 t++;
770 }
771 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
772 PL_colors[0],PL_colors[1]));
773 goto fail_finish;
774 }
775 else {
776 DEBUG_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
777 PL_colors[0],PL_colors[1]));
778 }
779 s = t;
780 set_useful:
781 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
782 }
783 else {
784 /* The found string does not prohibit matching at strpos,
785 - no optimization of calling REx engine can be performed,
786 unless it was an MBOL and we are not after MBOL,
787 or a future STCLASS check will fail this. */
788 try_at_start:
789 /* Even in this situation we may use MBOL flag if strpos is offset
790 wrt the start of the string. */
791 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
792 && (strpos != strbeg) && strpos[-1] != '\n'
793 /* May be due to an implicit anchor of m{.*foo} */
794 && !(prog->reganch & ROPT_IMPLICIT))
795 {
796 t = strpos;
797 goto find_anchor;
798 }
799 DEBUG_r( if (ml_anch)
800 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
801 (long)(strpos - i_strpos), PL_colors[0],PL_colors[1]);
802 );
803 success_at_start:
804 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
805 && (do_utf8 ? (
806 prog->check_utf8 /* Could be deleted already */
807 && --BmUSEFUL(prog->check_utf8) < 0
808 && (prog->check_utf8 == prog->float_utf8)
809 ) : (
810 prog->check_substr /* Could be deleted already */
811 && --BmUSEFUL(prog->check_substr) < 0
812 && (prog->check_substr == prog->float_substr)
813 )))
814 {
815 /* If flags & SOMETHING - do not do it many times on the same match */
816 DEBUG_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
817 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
818 if (do_utf8 ? prog->check_substr : prog->check_utf8)
819 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
820 prog->check_substr = prog->check_utf8 = Nullsv; /* disable */
821 prog->float_substr = prog->float_utf8 = Nullsv; /* clear */
822 check = Nullsv; /* abort */
823 s = strpos;
824 /* XXXX This is a remnant of the old implementation. It
825 looks wasteful, since now INTUIT can use many
826 other heuristics. */
827 prog->reganch &= ~RE_USE_INTUIT;
828 }
829 else
830 s = strpos;
831 }
832
833 /* Last resort... */
834 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
835 if (prog->regstclass) {
836 /* minlen == 0 is possible if regstclass is \b or \B,
837 and the fixed substr is ''$.
838 Since minlen is already taken into account, s+1 is before strend;
839 accidentally, minlen >= 1 guaranties no false positives at s + 1
840 even for \b or \B. But (minlen? 1 : 0) below assumes that
841 regstclass does not come from lookahead... */
842 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
843 This leaves EXACTF only, which is dealt with in find_byclass(). */
844 U8* str = (U8*)STRING(prog->regstclass);
845 int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
846 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
847 : 1);
848 char *endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
849 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
850 : (prog->float_substr || prog->float_utf8
851 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
852 cl_l, strend)
853 : strend);
854 char *startpos = strbeg;
855
856 t = s;
857 cache_re(prog);
858 s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1);
859 if (!s) {
860 #ifdef DEBUGGING
861 char *what = 0;
862 #endif
863 if (endpos == strend) {
864 DEBUG_r( PerlIO_printf(Perl_debug_log,
865 "Could not match STCLASS...\n") );
866 goto fail;
867 }
868 DEBUG_r( PerlIO_printf(Perl_debug_log,
869 "This position contradicts STCLASS...\n") );
870 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
871 goto fail;
872 /* Contradict one of substrings */
873 if (prog->anchored_substr || prog->anchored_utf8) {
874 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
875 DEBUG_r( what = "anchored" );
876 hop_and_restart:
877 s = HOP3c(t, 1, strend);
878 if (s + start_shift + end_shift > strend) {
879 /* XXXX Should be taken into account earlier? */
880 DEBUG_r( PerlIO_printf(Perl_debug_log,
881 "Could not match STCLASS...\n") );
882 goto fail;
883 }
884 if (!check)
885 goto giveup;
886 DEBUG_r( PerlIO_printf(Perl_debug_log,
887 "Looking for %s substr starting at offset %ld...\n",
888 what, (long)(s + start_shift - i_strpos)) );
889 goto restart;
890 }
891 /* Have both, check_string is floating */
892 if (t + start_shift >= check_at) /* Contradicts floating=check */
893 goto retry_floating_check;
894 /* Recheck anchored substring, but not floating... */
895 s = check_at;
896 if (!check)
897 goto giveup;
898 DEBUG_r( PerlIO_printf(Perl_debug_log,
899 "Looking for anchored substr starting at offset %ld...\n",
900 (long)(other_last - i_strpos)) );
901 goto do_other_anchored;
902 }
903 /* Another way we could have checked stclass at the
904 current position only: */
905 if (ml_anch) {
906 s = t = t + 1;
907 if (!check)
908 goto giveup;
909 DEBUG_r( PerlIO_printf(Perl_debug_log,
910 "Looking for /%s^%s/m starting at offset %ld...\n",
911 PL_colors[0],PL_colors[1], (long)(t - i_strpos)) );
912 goto try_at_offset;
913 }
914 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
915 goto fail;
916 /* Check is floating subtring. */
917 retry_floating_check:
918 t = check_at - start_shift;
919 DEBUG_r( what = "floating" );
920 goto hop_and_restart;
921 }
922 if (t != s) {
923 DEBUG_r(PerlIO_printf(Perl_debug_log,
924 "By STCLASS: moving %ld --> %ld\n",
925 (long)(t - i_strpos), (long)(s - i_strpos))
926 );
927 }
928 else {
929 DEBUG_r(PerlIO_printf(Perl_debug_log,
930 "Does not contradict STCLASS...\n");
931 );
932 }
933 }
934 giveup:
935 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
936 PL_colors[4], (check ? "Guessed" : "Giving up"),
937 PL_colors[5], (long)(s - i_strpos)) );
938 return s;
939
940 fail_finish: /* Substring not found */
941 if (prog->check_substr || prog->check_utf8) /* could be removed already */
942 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
943 fail:
944 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
945 PL_colors[4],PL_colors[5]));
946 return Nullch;
947 }
948
949 /* We know what class REx starts with. Try to find this position... */
950 STATIC char *
S_find_byclass(pTHX_ regexp * prog,regnode * c,char * s,char * strend,char * startpos,I32 norun)951 S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun)
952 {
953 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
954 char *m;
955 STRLEN ln;
956 STRLEN lnc;
957 register STRLEN uskip;
958 unsigned int c1;
959 unsigned int c2;
960 char *e;
961 register I32 tmp = 1; /* Scratch variable? */
962 register bool do_utf8 = PL_reg_match_utf8;
963
964 /* We know what class it must start with. */
965 switch (OP(c)) {
966 case ANYOF:
967 if (do_utf8) {
968 while (s + (uskip = UTF8SKIP(s)) <= strend) {
969 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
970 !UTF8_IS_INVARIANT((U8)s[0]) ?
971 reginclass(c, (U8*)s, 0, do_utf8) :
972 REGINCLASS(c, (U8*)s)) {
973 if (tmp && (norun || regtry(prog, s)))
974 goto got_it;
975 else
976 tmp = doevery;
977 }
978 else
979 tmp = 1;
980 s += uskip;
981 }
982 }
983 else {
984 while (s < strend) {
985 STRLEN skip = 1;
986
987 if (REGINCLASS(c, (U8*)s) ||
988 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
989 /* The assignment of 2 is intentional:
990 * for the folded sharp s, the skip is 2. */
991 (skip = SHARP_S_SKIP))) {
992 if (tmp && (norun || regtry(prog, s)))
993 goto got_it;
994 else
995 tmp = doevery;
996 }
997 else
998 tmp = 1;
999 s += skip;
1000 }
1001 }
1002 break;
1003 case CANY:
1004 while (s < strend) {
1005 if (tmp && (norun || regtry(prog, s)))
1006 goto got_it;
1007 else
1008 tmp = doevery;
1009 s++;
1010 }
1011 break;
1012 case EXACTF:
1013 m = STRING(c);
1014 ln = STR_LEN(c); /* length to match in octets/bytes */
1015 lnc = (I32) ln; /* length to match in characters */
1016 if (UTF) {
1017 STRLEN ulen1, ulen2;
1018 U8 *sm = (U8 *) m;
1019 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
1020 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
1021
1022 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1023 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1024
1025 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN_UCLC,
1026 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
1027 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN_UCLC,
1028 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
1029 lnc = 0;
1030 while (sm < ((U8 *) m + ln)) {
1031 lnc++;
1032 sm += UTF8SKIP(sm);
1033 }
1034 }
1035 else {
1036 c1 = *(U8*)m;
1037 c2 = PL_fold[c1];
1038 }
1039 goto do_exactf;
1040 case EXACTFL:
1041 m = STRING(c);
1042 ln = STR_LEN(c);
1043 lnc = (I32) ln;
1044 c1 = *(U8*)m;
1045 c2 = PL_fold_locale[c1];
1046 do_exactf:
1047 e = HOP3c(strend, -((I32)lnc), s);
1048
1049 if (norun && e < s)
1050 e = s; /* Due to minlen logic of intuit() */
1051
1052 /* The idea in the EXACTF* cases is to first find the
1053 * first character of the EXACTF* node and then, if
1054 * necessary, case-insensitively compare the full
1055 * text of the node. The c1 and c2 are the first
1056 * characters (though in Unicode it gets a bit
1057 * more complicated because there are more cases
1058 * than just upper and lower: one needs to use
1059 * the so-called folding case for case-insensitive
1060 * matching (called "loose matching" in Unicode).
1061 * ibcmp_utf8() will do just that. */
1062
1063 if (do_utf8) {
1064 UV c, f;
1065 U8 tmpbuf [UTF8_MAXLEN+1];
1066 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
1067 STRLEN len, foldlen;
1068
1069 if (c1 == c2) {
1070 /* Upper and lower of 1st char are equal -
1071 * probably not a "letter". */
1072 while (s <= e) {
1073 c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
1074 ckWARN(WARN_UTF8) ?
1075 0 : UTF8_ALLOW_ANY);
1076 if ( c == c1
1077 && (ln == len ||
1078 ibcmp_utf8(s, (char **)0, 0, do_utf8,
1079 m, (char **)0, ln, (bool)UTF))
1080 && (norun || regtry(prog, s)) )
1081 goto got_it;
1082 else {
1083 uvchr_to_utf8(tmpbuf, c);
1084 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
1085 if ( f != c
1086 && (f == c1 || f == c2)
1087 && (ln == foldlen ||
1088 !ibcmp_utf8((char *) foldbuf,
1089 (char **)0, foldlen, do_utf8,
1090 m,
1091 (char **)0, ln, (bool)UTF))
1092 && (norun || regtry(prog, s)) )
1093 goto got_it;
1094 }
1095 s += len;
1096 }
1097 }
1098 else {
1099 while (s <= e) {
1100 c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
1101 ckWARN(WARN_UTF8) ?
1102 0 : UTF8_ALLOW_ANY);
1103
1104 /* Handle some of the three Greek sigmas cases.
1105 * Note that not all the possible combinations
1106 * are handled here: some of them are handled
1107 * by the standard folding rules, and some of
1108 * them (the character class or ANYOF cases)
1109 * are handled during compiletime in
1110 * regexec.c:S_regclass(). */
1111 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1112 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1113 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1114
1115 if ( (c == c1 || c == c2)
1116 && (ln == len ||
1117 ibcmp_utf8(s, (char **)0, 0, do_utf8,
1118 m, (char **)0, ln, (bool)UTF))
1119 && (norun || regtry(prog, s)) )
1120 goto got_it;
1121 else {
1122 uvchr_to_utf8(tmpbuf, c);
1123 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
1124 if ( f != c
1125 && (f == c1 || f == c2)
1126 && (ln == foldlen ||
1127 !ibcmp_utf8((char *) foldbuf,
1128 (char **)0, foldlen, do_utf8,
1129 m,
1130 (char **)0, ln, (bool)UTF))
1131 && (norun || regtry(prog, s)) )
1132 goto got_it;
1133 }
1134 s += len;
1135 }
1136 }
1137 }
1138 else {
1139 if (c1 == c2)
1140 while (s <= e) {
1141 if ( *(U8*)s == c1
1142 && (ln == 1 || !(OP(c) == EXACTF
1143 ? ibcmp(s, m, ln)
1144 : ibcmp_locale(s, m, ln)))
1145 && (norun || regtry(prog, s)) )
1146 goto got_it;
1147 s++;
1148 }
1149 else
1150 while (s <= e) {
1151 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1152 && (ln == 1 || !(OP(c) == EXACTF
1153 ? ibcmp(s, m, ln)
1154 : ibcmp_locale(s, m, ln)))
1155 && (norun || regtry(prog, s)) )
1156 goto got_it;
1157 s++;
1158 }
1159 }
1160 break;
1161 case BOUNDL:
1162 PL_reg_flags |= RF_tainted;
1163 /* FALL THROUGH */
1164 case BOUND:
1165 if (do_utf8) {
1166 if (s == PL_bostr)
1167 tmp = '\n';
1168 else {
1169 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1170
1171 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
1172 }
1173 tmp = ((OP(c) == BOUND ?
1174 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1175 LOAD_UTF8_CHARCLASS(alnum,"a");
1176 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1177 if (tmp == !(OP(c) == BOUND ?
1178 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1179 isALNUM_LC_utf8((U8*)s)))
1180 {
1181 tmp = !tmp;
1182 if ((norun || regtry(prog, s)))
1183 goto got_it;
1184 }
1185 s += uskip;
1186 }
1187 }
1188 else {
1189 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1190 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1191 while (s < strend) {
1192 if (tmp ==
1193 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1194 tmp = !tmp;
1195 if ((norun || regtry(prog, s)))
1196 goto got_it;
1197 }
1198 s++;
1199 }
1200 }
1201 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
1202 goto got_it;
1203 break;
1204 case NBOUNDL:
1205 PL_reg_flags |= RF_tainted;
1206 /* FALL THROUGH */
1207 case NBOUND:
1208 if (do_utf8) {
1209 if (s == PL_bostr)
1210 tmp = '\n';
1211 else {
1212 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1213
1214 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
1215 }
1216 tmp = ((OP(c) == NBOUND ?
1217 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1218 LOAD_UTF8_CHARCLASS(alnum,"a");
1219 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1220 if (tmp == !(OP(c) == NBOUND ?
1221 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1222 isALNUM_LC_utf8((U8*)s)))
1223 tmp = !tmp;
1224 else if ((norun || regtry(prog, s)))
1225 goto got_it;
1226 s += uskip;
1227 }
1228 }
1229 else {
1230 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1231 tmp = ((OP(c) == NBOUND ?
1232 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1233 while (s < strend) {
1234 if (tmp ==
1235 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1236 tmp = !tmp;
1237 else if ((norun || regtry(prog, s)))
1238 goto got_it;
1239 s++;
1240 }
1241 }
1242 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
1243 goto got_it;
1244 break;
1245 case ALNUM:
1246 if (do_utf8) {
1247 LOAD_UTF8_CHARCLASS(alnum,"a");
1248 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1249 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
1250 if (tmp && (norun || regtry(prog, s)))
1251 goto got_it;
1252 else
1253 tmp = doevery;
1254 }
1255 else
1256 tmp = 1;
1257 s += uskip;
1258 }
1259 }
1260 else {
1261 while (s < strend) {
1262 if (isALNUM(*s)) {
1263 if (tmp && (norun || regtry(prog, s)))
1264 goto got_it;
1265 else
1266 tmp = doevery;
1267 }
1268 else
1269 tmp = 1;
1270 s++;
1271 }
1272 }
1273 break;
1274 case ALNUML:
1275 PL_reg_flags |= RF_tainted;
1276 if (do_utf8) {
1277 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1278 if (isALNUM_LC_utf8((U8*)s)) {
1279 if (tmp && (norun || regtry(prog, s)))
1280 goto got_it;
1281 else
1282 tmp = doevery;
1283 }
1284 else
1285 tmp = 1;
1286 s += uskip;
1287 }
1288 }
1289 else {
1290 while (s < strend) {
1291 if (isALNUM_LC(*s)) {
1292 if (tmp && (norun || regtry(prog, s)))
1293 goto got_it;
1294 else
1295 tmp = doevery;
1296 }
1297 else
1298 tmp = 1;
1299 s++;
1300 }
1301 }
1302 break;
1303 case NALNUM:
1304 if (do_utf8) {
1305 LOAD_UTF8_CHARCLASS(alnum,"a");
1306 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1307 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
1308 if (tmp && (norun || regtry(prog, s)))
1309 goto got_it;
1310 else
1311 tmp = doevery;
1312 }
1313 else
1314 tmp = 1;
1315 s += uskip;
1316 }
1317 }
1318 else {
1319 while (s < strend) {
1320 if (!isALNUM(*s)) {
1321 if (tmp && (norun || regtry(prog, s)))
1322 goto got_it;
1323 else
1324 tmp = doevery;
1325 }
1326 else
1327 tmp = 1;
1328 s++;
1329 }
1330 }
1331 break;
1332 case NALNUML:
1333 PL_reg_flags |= RF_tainted;
1334 if (do_utf8) {
1335 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1336 if (!isALNUM_LC_utf8((U8*)s)) {
1337 if (tmp && (norun || regtry(prog, s)))
1338 goto got_it;
1339 else
1340 tmp = doevery;
1341 }
1342 else
1343 tmp = 1;
1344 s += uskip;
1345 }
1346 }
1347 else {
1348 while (s < strend) {
1349 if (!isALNUM_LC(*s)) {
1350 if (tmp && (norun || regtry(prog, s)))
1351 goto got_it;
1352 else
1353 tmp = doevery;
1354 }
1355 else
1356 tmp = 1;
1357 s++;
1358 }
1359 }
1360 break;
1361 case SPACE:
1362 if (do_utf8) {
1363 LOAD_UTF8_CHARCLASS(space," ");
1364 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1365 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
1366 if (tmp && (norun || regtry(prog, s)))
1367 goto got_it;
1368 else
1369 tmp = doevery;
1370 }
1371 else
1372 tmp = 1;
1373 s += uskip;
1374 }
1375 }
1376 else {
1377 while (s < strend) {
1378 if (isSPACE(*s)) {
1379 if (tmp && (norun || regtry(prog, s)))
1380 goto got_it;
1381 else
1382 tmp = doevery;
1383 }
1384 else
1385 tmp = 1;
1386 s++;
1387 }
1388 }
1389 break;
1390 case SPACEL:
1391 PL_reg_flags |= RF_tainted;
1392 if (do_utf8) {
1393 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1394 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1395 if (tmp && (norun || regtry(prog, s)))
1396 goto got_it;
1397 else
1398 tmp = doevery;
1399 }
1400 else
1401 tmp = 1;
1402 s += uskip;
1403 }
1404 }
1405 else {
1406 while (s < strend) {
1407 if (isSPACE_LC(*s)) {
1408 if (tmp && (norun || regtry(prog, s)))
1409 goto got_it;
1410 else
1411 tmp = doevery;
1412 }
1413 else
1414 tmp = 1;
1415 s++;
1416 }
1417 }
1418 break;
1419 case NSPACE:
1420 if (do_utf8) {
1421 LOAD_UTF8_CHARCLASS(space," ");
1422 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1423 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
1424 if (tmp && (norun || regtry(prog, s)))
1425 goto got_it;
1426 else
1427 tmp = doevery;
1428 }
1429 else
1430 tmp = 1;
1431 s += uskip;
1432 }
1433 }
1434 else {
1435 while (s < strend) {
1436 if (!isSPACE(*s)) {
1437 if (tmp && (norun || regtry(prog, s)))
1438 goto got_it;
1439 else
1440 tmp = doevery;
1441 }
1442 else
1443 tmp = 1;
1444 s++;
1445 }
1446 }
1447 break;
1448 case NSPACEL:
1449 PL_reg_flags |= RF_tainted;
1450 if (do_utf8) {
1451 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1452 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1453 if (tmp && (norun || regtry(prog, s)))
1454 goto got_it;
1455 else
1456 tmp = doevery;
1457 }
1458 else
1459 tmp = 1;
1460 s += uskip;
1461 }
1462 }
1463 else {
1464 while (s < strend) {
1465 if (!isSPACE_LC(*s)) {
1466 if (tmp && (norun || regtry(prog, s)))
1467 goto got_it;
1468 else
1469 tmp = doevery;
1470 }
1471 else
1472 tmp = 1;
1473 s++;
1474 }
1475 }
1476 break;
1477 case DIGIT:
1478 if (do_utf8) {
1479 LOAD_UTF8_CHARCLASS(digit,"0");
1480 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1481 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
1482 if (tmp && (norun || regtry(prog, s)))
1483 goto got_it;
1484 else
1485 tmp = doevery;
1486 }
1487 else
1488 tmp = 1;
1489 s += uskip;
1490 }
1491 }
1492 else {
1493 while (s < strend) {
1494 if (isDIGIT(*s)) {
1495 if (tmp && (norun || regtry(prog, s)))
1496 goto got_it;
1497 else
1498 tmp = doevery;
1499 }
1500 else
1501 tmp = 1;
1502 s++;
1503 }
1504 }
1505 break;
1506 case DIGITL:
1507 PL_reg_flags |= RF_tainted;
1508 if (do_utf8) {
1509 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1510 if (isDIGIT_LC_utf8((U8*)s)) {
1511 if (tmp && (norun || regtry(prog, s)))
1512 goto got_it;
1513 else
1514 tmp = doevery;
1515 }
1516 else
1517 tmp = 1;
1518 s += uskip;
1519 }
1520 }
1521 else {
1522 while (s < strend) {
1523 if (isDIGIT_LC(*s)) {
1524 if (tmp && (norun || regtry(prog, s)))
1525 goto got_it;
1526 else
1527 tmp = doevery;
1528 }
1529 else
1530 tmp = 1;
1531 s++;
1532 }
1533 }
1534 break;
1535 case NDIGIT:
1536 if (do_utf8) {
1537 LOAD_UTF8_CHARCLASS(digit,"0");
1538 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1539 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
1540 if (tmp && (norun || regtry(prog, s)))
1541 goto got_it;
1542 else
1543 tmp = doevery;
1544 }
1545 else
1546 tmp = 1;
1547 s += uskip;
1548 }
1549 }
1550 else {
1551 while (s < strend) {
1552 if (!isDIGIT(*s)) {
1553 if (tmp && (norun || regtry(prog, s)))
1554 goto got_it;
1555 else
1556 tmp = doevery;
1557 }
1558 else
1559 tmp = 1;
1560 s++;
1561 }
1562 }
1563 break;
1564 case NDIGITL:
1565 PL_reg_flags |= RF_tainted;
1566 if (do_utf8) {
1567 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1568 if (!isDIGIT_LC_utf8((U8*)s)) {
1569 if (tmp && (norun || regtry(prog, s)))
1570 goto got_it;
1571 else
1572 tmp = doevery;
1573 }
1574 else
1575 tmp = 1;
1576 s += uskip;
1577 }
1578 }
1579 else {
1580 while (s < strend) {
1581 if (!isDIGIT_LC(*s)) {
1582 if (tmp && (norun || regtry(prog, s)))
1583 goto got_it;
1584 else
1585 tmp = doevery;
1586 }
1587 else
1588 tmp = 1;
1589 s++;
1590 }
1591 }
1592 break;
1593 default:
1594 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1595 break;
1596 }
1597 return 0;
1598 got_it:
1599 return s;
1600 }
1601
1602 /*
1603 - regexec_flags - match a regexp against a string
1604 */
1605 I32
Perl_regexec_flags(pTHX_ register regexp * prog,char * stringarg,register char * strend,char * strbeg,I32 minend,SV * sv,void * data,U32 flags)1606 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1607 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1608 /* strend: pointer to null at end of string */
1609 /* strbeg: real beginning of string */
1610 /* minend: end of match must be >=minend after stringarg. */
1611 /* data: May be used for some additional optimizations. */
1612 /* nosave: For optimizations. */
1613 {
1614 register char *s;
1615 register regnode *c;
1616 register char *startpos = stringarg;
1617 I32 minlen; /* must match at least this many chars */
1618 I32 dontbother = 0; /* how many characters not to try at end */
1619 /* I32 start_shift = 0; */ /* Offset of the start to find
1620 constant substr. */ /* CC */
1621 I32 end_shift = 0; /* Same for the end. */ /* CC */
1622 I32 scream_pos = -1; /* Internal iterator of scream. */
1623 char *scream_olds;
1624 SV* oreplsv = GvSV(PL_replgv);
1625 bool do_utf8 = DO_UTF8(sv);
1626 #ifdef DEBUGGING
1627 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
1628 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
1629 #endif
1630 RX_MATCH_UTF8_set(prog,do_utf8);
1631
1632 PL_regcc = 0;
1633
1634 cache_re(prog);
1635 #ifdef DEBUGGING
1636 PL_regnarrate = DEBUG_r_TEST;
1637 #endif
1638
1639 /* Be paranoid... */
1640 if (prog == NULL || startpos == NULL) {
1641 Perl_croak(aTHX_ "NULL regexp parameter");
1642 return 0;
1643 }
1644
1645 minlen = prog->minlen;
1646 if (strend - startpos < minlen) {
1647 DEBUG_r(PerlIO_printf(Perl_debug_log,
1648 "String too short [regexec_flags]...\n"));
1649 goto phooey;
1650 }
1651
1652 /* Check validity of program. */
1653 if (UCHARAT(prog->program) != REG_MAGIC) {
1654 Perl_croak(aTHX_ "corrupted regexp program");
1655 }
1656
1657 PL_reg_flags = 0;
1658 PL_reg_eval_set = 0;
1659 PL_reg_maxiter = 0;
1660
1661 if (prog->reganch & ROPT_UTF8)
1662 PL_reg_flags |= RF_utf8;
1663
1664 /* Mark beginning of line for ^ and lookbehind. */
1665 PL_regbol = startpos;
1666 PL_bostr = strbeg;
1667 PL_reg_sv = sv;
1668
1669 /* Mark end of line for $ (and such) */
1670 PL_regeol = strend;
1671
1672 /* see how far we have to get to not match where we matched before */
1673 PL_regtill = startpos+minend;
1674
1675 /* We start without call_cc context. */
1676 PL_reg_call_cc = 0;
1677
1678 /* If there is a "must appear" string, look for it. */
1679 s = startpos;
1680
1681 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1682 MAGIC *mg;
1683
1684 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1685 PL_reg_ganch = startpos;
1686 else if (sv && SvTYPE(sv) >= SVt_PVMG
1687 && SvMAGIC(sv)
1688 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1689 && mg->mg_len >= 0) {
1690 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1691 if (prog->reganch & ROPT_ANCH_GPOS) {
1692 if (s > PL_reg_ganch)
1693 goto phooey;
1694 s = PL_reg_ganch;
1695 }
1696 }
1697 else /* pos() not defined */
1698 PL_reg_ganch = strbeg;
1699 }
1700
1701 if (!(flags & REXEC_CHECKED) && (prog->check_substr != Nullsv || prog->check_utf8 != Nullsv)) {
1702 re_scream_pos_data d;
1703
1704 d.scream_olds = &scream_olds;
1705 d.scream_pos = &scream_pos;
1706 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1707 if (!s) {
1708 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
1709 goto phooey; /* not present */
1710 }
1711 }
1712
1713 DEBUG_r({
1714 char *s0 = UTF ?
1715 pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
1716 UNI_DISPLAY_REGEX) :
1717 prog->precomp;
1718 int len0 = UTF ? SvCUR(dsv0) : prog->prelen;
1719 char *s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
1720 UNI_DISPLAY_REGEX) : startpos;
1721 int len1 = do_utf8 ? SvCUR(dsv1) : strend - startpos;
1722 if (!PL_colorset)
1723 reginitcolors();
1724 PerlIO_printf(Perl_debug_log,
1725 "%sMatching REx%s `%s%*.*s%s%s' against `%s%.*s%s%s'\n",
1726 PL_colors[4],PL_colors[5],PL_colors[0],
1727 len0, len0, s0,
1728 PL_colors[1],
1729 len0 > 60 ? "..." : "",
1730 PL_colors[0],
1731 (int)(len1 > 60 ? 60 : len1),
1732 s1, PL_colors[1],
1733 (len1 > 60 ? "..." : "")
1734 );
1735 });
1736
1737 /* Simplest case: anchored match need be tried only once. */
1738 /* [unless only anchor is BOL and multiline is set] */
1739 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1740 if (s == startpos && regtry(prog, startpos))
1741 goto got_it;
1742 else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT)
1743 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1744 {
1745 char *end;
1746
1747 if (minlen)
1748 dontbother = minlen - 1;
1749 end = HOP3c(strend, -dontbother, strbeg) - 1;
1750 /* for multiline we only have to try after newlines */
1751 if (prog->check_substr || prog->check_utf8) {
1752 if (s == startpos)
1753 goto after_try;
1754 while (1) {
1755 if (regtry(prog, s))
1756 goto got_it;
1757 after_try:
1758 if (s >= end)
1759 goto phooey;
1760 if (prog->reganch & RE_USE_INTUIT) {
1761 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1762 if (!s)
1763 goto phooey;
1764 }
1765 else
1766 s++;
1767 }
1768 } else {
1769 if (s > startpos)
1770 s--;
1771 while (s < end) {
1772 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1773 if (regtry(prog, s))
1774 goto got_it;
1775 }
1776 }
1777 }
1778 }
1779 goto phooey;
1780 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1781 if (regtry(prog, PL_reg_ganch))
1782 goto got_it;
1783 goto phooey;
1784 }
1785
1786 /* Messy cases: unanchored match. */
1787 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
1788 /* we have /x+whatever/ */
1789 /* it must be a one character string (XXXX Except UTF?) */
1790 char ch;
1791 #ifdef DEBUGGING
1792 int did_match = 0;
1793 #endif
1794 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1795 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1796 ch = SvPVX(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
1797
1798 if (do_utf8) {
1799 while (s < strend) {
1800 if (*s == ch) {
1801 DEBUG_r( did_match = 1 );
1802 if (regtry(prog, s)) goto got_it;
1803 s += UTF8SKIP(s);
1804 while (s < strend && *s == ch)
1805 s += UTF8SKIP(s);
1806 }
1807 s += UTF8SKIP(s);
1808 }
1809 }
1810 else {
1811 while (s < strend) {
1812 if (*s == ch) {
1813 DEBUG_r( did_match = 1 );
1814 if (regtry(prog, s)) goto got_it;
1815 s++;
1816 while (s < strend && *s == ch)
1817 s++;
1818 }
1819 s++;
1820 }
1821 }
1822 DEBUG_r(if (!did_match)
1823 PerlIO_printf(Perl_debug_log,
1824 "Did not find anchored character...\n")
1825 );
1826 }
1827 /*SUPPRESS 560*/
1828 else if (prog->anchored_substr != Nullsv
1829 || prog->anchored_utf8 != Nullsv
1830 || ((prog->float_substr != Nullsv || prog->float_utf8 != Nullsv)
1831 && prog->float_max_offset < strend - s)) {
1832 SV *must;
1833 I32 back_max;
1834 I32 back_min;
1835 char *last;
1836 char *last1; /* Last position checked before */
1837 #ifdef DEBUGGING
1838 int did_match = 0;
1839 #endif
1840 if (prog->anchored_substr || prog->anchored_utf8) {
1841 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1842 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1843 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1844 back_max = back_min = prog->anchored_offset;
1845 } else {
1846 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1847 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1848 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1849 back_max = prog->float_max_offset;
1850 back_min = prog->float_min_offset;
1851 }
1852 if (must == &PL_sv_undef)
1853 /* could not downgrade utf8 check substring, so must fail */
1854 goto phooey;
1855
1856 last = HOP3c(strend, /* Cannot start after this */
1857 -(I32)(CHR_SVLEN(must)
1858 - (SvTAIL(must) != 0) + back_min), strbeg);
1859
1860 if (s > PL_bostr)
1861 last1 = HOPc(s, -1);
1862 else
1863 last1 = s - 1; /* bogus */
1864
1865 /* XXXX check_substr already used to find `s', can optimize if
1866 check_substr==must. */
1867 scream_pos = -1;
1868 dontbother = end_shift;
1869 strend = HOPc(strend, -dontbother);
1870 while ( (s <= last) &&
1871 ((flags & REXEC_SCREAM)
1872 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
1873 end_shift, &scream_pos, 0))
1874 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
1875 (unsigned char*)strend, must,
1876 PL_multiline ? FBMrf_MULTILINE : 0))) ) {
1877 /* we may be pointing at the wrong string */
1878 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
1879 s = strbeg + (s - SvPVX(sv));
1880 DEBUG_r( did_match = 1 );
1881 if (HOPc(s, -back_max) > last1) {
1882 last1 = HOPc(s, -back_min);
1883 s = HOPc(s, -back_max);
1884 }
1885 else {
1886 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1887
1888 last1 = HOPc(s, -back_min);
1889 s = t;
1890 }
1891 if (do_utf8) {
1892 while (s <= last1) {
1893 if (regtry(prog, s))
1894 goto got_it;
1895 s += UTF8SKIP(s);
1896 }
1897 }
1898 else {
1899 while (s <= last1) {
1900 if (regtry(prog, s))
1901 goto got_it;
1902 s++;
1903 }
1904 }
1905 }
1906 DEBUG_r(if (!did_match)
1907 PerlIO_printf(Perl_debug_log,
1908 "Did not find %s substr `%s%.*s%s'%s...\n",
1909 ((must == prog->anchored_substr || must == prog->anchored_utf8)
1910 ? "anchored" : "floating"),
1911 PL_colors[0],
1912 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1913 SvPVX(must),
1914 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1915 );
1916 goto phooey;
1917 }
1918 else if ((c = prog->regstclass)) {
1919 if (minlen) {
1920 I32 op = (U8)OP(prog->regstclass);
1921 /* don't bother with what can't match */
1922 if (PL_regkind[op] != EXACT && op != CANY)
1923 strend = HOPc(strend, -(minlen - 1));
1924 }
1925 DEBUG_r({
1926 SV *prop = sv_newmortal();
1927 char *s0;
1928 char *s1;
1929 int len0;
1930 int len1;
1931
1932 regprop(prop, c);
1933 s0 = UTF ?
1934 pv_uni_display(dsv0, (U8*)SvPVX(prop), SvCUR(prop), 60,
1935 UNI_DISPLAY_REGEX) :
1936 SvPVX(prop);
1937 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
1938 s1 = UTF ?
1939 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
1940 len1 = UTF ? SvCUR(dsv1) : strend - s;
1941 PerlIO_printf(Perl_debug_log,
1942 "Matching stclass `%*.*s' against `%*.*s'\n",
1943 len0, len0, s0,
1944 len1, len1, s1);
1945 });
1946 if (find_byclass(prog, c, s, strend, startpos, 0))
1947 goto got_it;
1948 DEBUG_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
1949 }
1950 else {
1951 dontbother = 0;
1952 if (prog->float_substr != Nullsv || prog->float_utf8 != Nullsv) {
1953 /* Trim the end. */
1954 char *last;
1955 SV* float_real;
1956
1957 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1958 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1959 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
1960
1961 if (flags & REXEC_SCREAM) {
1962 last = screaminstr(sv, float_real, s - strbeg,
1963 end_shift, &scream_pos, 1); /* last one */
1964 if (!last)
1965 last = scream_olds; /* Only one occurrence. */
1966 /* we may be pointing at the wrong string */
1967 else if (RX_MATCH_COPIED(prog))
1968 s = strbeg + (s - SvPVX(sv));
1969 }
1970 else {
1971 STRLEN len;
1972 char *little = SvPV(float_real, len);
1973
1974 if (SvTAIL(float_real)) {
1975 if (memEQ(strend - len + 1, little, len - 1))
1976 last = strend - len + 1;
1977 else if (!PL_multiline)
1978 last = memEQ(strend - len, little, len)
1979 ? strend - len : Nullch;
1980 else
1981 goto find_last;
1982 } else {
1983 find_last:
1984 if (len)
1985 last = rninstr(s, strend, little, little + len);
1986 else
1987 last = strend; /* matching `$' */
1988 }
1989 }
1990 if (last == NULL) {
1991 DEBUG_r(PerlIO_printf(Perl_debug_log,
1992 "%sCan't trim the tail, match fails (should not happen)%s\n",
1993 PL_colors[4],PL_colors[5]));
1994 goto phooey; /* Should not happen! */
1995 }
1996 dontbother = strend - last + prog->float_min_offset;
1997 }
1998 if (minlen && (dontbother < minlen))
1999 dontbother = minlen - 1;
2000 strend -= dontbother; /* this one's always in bytes! */
2001 /* We don't know much -- general case. */
2002 if (do_utf8) {
2003 for (;;) {
2004 if (regtry(prog, s))
2005 goto got_it;
2006 if (s >= strend)
2007 break;
2008 s += UTF8SKIP(s);
2009 };
2010 }
2011 else {
2012 do {
2013 if (regtry(prog, s))
2014 goto got_it;
2015 } while (s++ < strend);
2016 }
2017 }
2018
2019 /* Failure. */
2020 goto phooey;
2021
2022 got_it:
2023 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2024
2025 if (PL_reg_eval_set) {
2026 /* Preserve the current value of $^R */
2027 if (oreplsv != GvSV(PL_replgv))
2028 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2029 restored, the value remains
2030 the same. */
2031 restore_pos(aTHX_ 0);
2032 }
2033
2034 /* make sure $`, $&, $', and $digit will work later */
2035 if ( !(flags & REXEC_NOT_FIRST) ) {
2036 if (RX_MATCH_COPIED(prog)) {
2037 Safefree(prog->subbeg);
2038 RX_MATCH_COPIED_off(prog);
2039 }
2040 if (flags & REXEC_COPY_STR) {
2041 I32 i = PL_regeol - startpos + (stringarg - strbeg);
2042
2043 s = savepvn(strbeg, i);
2044 prog->subbeg = s;
2045 prog->sublen = i;
2046 RX_MATCH_COPIED_on(prog);
2047 }
2048 else {
2049 prog->subbeg = strbeg;
2050 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2051 }
2052 }
2053
2054 return 1;
2055
2056 phooey:
2057 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2058 PL_colors[4],PL_colors[5]));
2059 if (PL_reg_eval_set)
2060 restore_pos(aTHX_ 0);
2061 return 0;
2062 }
2063
2064 /*
2065 - regtry - try match at specific point
2066 */
2067 STATIC I32 /* 0 failure, 1 success */
S_regtry(pTHX_ regexp * prog,char * startpos)2068 S_regtry(pTHX_ regexp *prog, char *startpos)
2069 {
2070 register I32 i;
2071 register I32 *sp;
2072 register I32 *ep;
2073 CHECKPOINT lastcp;
2074
2075 #ifdef DEBUGGING
2076 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2077 #endif
2078 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2079 MAGIC *mg;
2080
2081 PL_reg_eval_set = RS_init;
2082 DEBUG_r(DEBUG_s(
2083 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2084 (IV)(PL_stack_sp - PL_stack_base));
2085 ));
2086 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
2087 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2088 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2089 SAVETMPS;
2090 /* Apparently this is not needed, judging by wantarray. */
2091 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2092 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2093
2094 if (PL_reg_sv) {
2095 /* Make $_ available to executed code. */
2096 if (PL_reg_sv != DEFSV) {
2097 /* SAVE_DEFSV does *not* suffice here for USE_5005THREADS */
2098 SAVESPTR(DEFSV);
2099 DEFSV = PL_reg_sv;
2100 }
2101
2102 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
2103 && (mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global)))) {
2104 /* prepare for quick setting of pos */
2105 sv_magic(PL_reg_sv, (SV*)0,
2106 PERL_MAGIC_regex_global, Nullch, 0);
2107 mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global);
2108 mg->mg_len = -1;
2109 }
2110 PL_reg_magic = mg;
2111 PL_reg_oldpos = mg->mg_len;
2112 SAVEDESTRUCTOR_X(restore_pos, 0);
2113 }
2114 if (!PL_reg_curpm) {
2115 Newz(22,PL_reg_curpm, 1, PMOP);
2116 #ifdef USE_ITHREADS
2117 {
2118 SV* repointer = newSViv(0);
2119 /* so we know which PL_regex_padav element is PL_reg_curpm */
2120 SvFLAGS(repointer) |= SVf_BREAK;
2121 av_push(PL_regex_padav,repointer);
2122 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2123 PL_regex_pad = AvARRAY(PL_regex_padav);
2124 }
2125 #endif
2126 }
2127 PM_SETRE(PL_reg_curpm, prog);
2128 PL_reg_oldcurpm = PL_curpm;
2129 PL_curpm = PL_reg_curpm;
2130 if (RX_MATCH_COPIED(prog)) {
2131 /* Here is a serious problem: we cannot rewrite subbeg,
2132 since it may be needed if this match fails. Thus
2133 $` inside (?{}) could fail... */
2134 PL_reg_oldsaved = prog->subbeg;
2135 PL_reg_oldsavedlen = prog->sublen;
2136 RX_MATCH_COPIED_off(prog);
2137 }
2138 else
2139 PL_reg_oldsaved = Nullch;
2140 prog->subbeg = PL_bostr;
2141 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2142 }
2143 prog->startp[0] = startpos - PL_bostr;
2144 PL_reginput = startpos;
2145 PL_regstartp = prog->startp;
2146 PL_regendp = prog->endp;
2147 PL_reglastparen = &prog->lastparen;
2148 PL_reglastcloseparen = &prog->lastcloseparen;
2149 prog->lastparen = 0;
2150 prog->lastcloseparen = 0;
2151 PL_regsize = 0;
2152 DEBUG_r(PL_reg_starttry = startpos);
2153 if (PL_reg_start_tmpl <= prog->nparens) {
2154 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2155 if(PL_reg_start_tmp)
2156 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2157 else
2158 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2159 }
2160
2161 /* XXXX What this code is doing here?!!! There should be no need
2162 to do this again and again, PL_reglastparen should take care of
2163 this! --ilya*/
2164
2165 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2166 * Actually, the code in regcppop() (which Ilya may be meaning by
2167 * PL_reglastparen), is not needed at all by the test suite
2168 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2169 * enough, for building DynaLoader, or otherwise this
2170 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2171 * will happen. Meanwhile, this code *is* needed for the
2172 * above-mentioned test suite tests to succeed. The common theme
2173 * on those tests seems to be returning null fields from matches.
2174 * --jhi */
2175 #if 1
2176 sp = prog->startp;
2177 ep = prog->endp;
2178 if (prog->nparens) {
2179 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2180 *++sp = -1;
2181 *++ep = -1;
2182 }
2183 }
2184 #endif
2185 REGCP_SET(lastcp);
2186 if (regmatch(prog->program + 1)) {
2187 prog->endp[0] = PL_reginput - PL_bostr;
2188 return 1;
2189 }
2190 REGCP_UNWIND(lastcp);
2191 return 0;
2192 }
2193
2194 #define RE_UNWIND_BRANCH 1
2195 #define RE_UNWIND_BRANCHJ 2
2196
2197 union re_unwind_t;
2198
2199 typedef struct { /* XX: makes sense to enlarge it... */
2200 I32 type;
2201 I32 prev;
2202 CHECKPOINT lastcp;
2203 } re_unwind_generic_t;
2204
2205 typedef struct {
2206 I32 type;
2207 I32 prev;
2208 CHECKPOINT lastcp;
2209 I32 lastparen;
2210 regnode *next;
2211 char *locinput;
2212 I32 nextchr;
2213 #ifdef DEBUGGING
2214 int regindent;
2215 #endif
2216 } re_unwind_branch_t;
2217
2218 typedef union re_unwind_t {
2219 I32 type;
2220 re_unwind_generic_t generic;
2221 re_unwind_branch_t branch;
2222 } re_unwind_t;
2223
2224 #define sayYES goto yes
2225 #define sayNO goto no
2226 #define sayNO_ANYOF goto no_anyof
2227 #define sayYES_FINAL goto yes_final
2228 #define sayYES_LOUD goto yes_loud
2229 #define sayNO_FINAL goto no_final
2230 #define sayNO_SILENT goto do_no
2231 #define saySAME(x) if (x) goto yes; else goto no
2232
2233 #define REPORT_CODE_OFF 24
2234
2235 /*
2236 - regmatch - main matching routine
2237 *
2238 * Conceptually the strategy is simple: check to see whether the current
2239 * node matches, call self recursively to see whether the rest matches,
2240 * and then act accordingly. In practice we make some effort to avoid
2241 * recursion, in particular by going through "ordinary" nodes (that don't
2242 * need to know whether the rest of the match failed) by a loop instead of
2243 * by recursion.
2244 */
2245 /* [lwall] I've hoisted the register declarations to the outer block in order to
2246 * maybe save a little bit of pushing and popping on the stack. It also takes
2247 * advantage of machines that use a register save mask on subroutine entry.
2248 */
2249 STATIC I32 /* 0 failure, 1 success */
S_regmatch(pTHX_ regnode * prog)2250 S_regmatch(pTHX_ regnode *prog)
2251 {
2252 register regnode *scan; /* Current node. */
2253 regnode *next; /* Next node. */
2254 regnode *inner; /* Next node in internal branch. */
2255 register I32 nextchr; /* renamed nextchr - nextchar colides with
2256 function of same name */
2257 register I32 n; /* no or next */
2258 register I32 ln = 0; /* len or last */
2259 register char *s = Nullch; /* operand or save */
2260 register char *locinput = PL_reginput;
2261 register I32 c1 = 0, c2 = 0, paren; /* case fold search, parenth */
2262 int minmod = 0, sw = 0, logical = 0;
2263 I32 unwind = 0;
2264 #if 0
2265 I32 firstcp = PL_savestack_ix;
2266 #endif
2267 register bool do_utf8 = PL_reg_match_utf8;
2268 #ifdef DEBUGGING
2269 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
2270 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2271 SV *dsv2 = PERL_DEBUG_PAD_ZERO(2);
2272 #endif
2273
2274 #ifdef DEBUGGING
2275 PL_regindent++;
2276 #endif
2277
2278 /* Note that nextchr is a byte even in UTF */
2279 nextchr = UCHARAT(locinput);
2280 scan = prog;
2281 while (scan != NULL) {
2282
2283 DEBUG_r( {
2284 SV *prop = sv_newmortal();
2285 int docolor = *PL_colors[0];
2286 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2287 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
2288 /* The part of the string before starttry has one color
2289 (pref0_len chars), between starttry and current
2290 position another one (pref_len - pref0_len chars),
2291 after the current position the third one.
2292 We assume that pref0_len <= pref_len, otherwise we
2293 decrease pref0_len. */
2294 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
2295 ? (5 + taill) - l : locinput - PL_bostr;
2296 int pref0_len;
2297
2298 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2299 pref_len++;
2300 pref0_len = pref_len - (locinput - PL_reg_starttry);
2301 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
2302 l = ( PL_regeol - locinput > (5 + taill) - pref_len
2303 ? (5 + taill) - pref_len : PL_regeol - locinput);
2304 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2305 l--;
2306 if (pref0_len < 0)
2307 pref0_len = 0;
2308 if (pref0_len > pref_len)
2309 pref0_len = pref_len;
2310 regprop(prop, scan);
2311 {
2312 char *s0 =
2313 do_utf8 && OP(scan) != CANY ?
2314 pv_uni_display(dsv0, (U8*)(locinput - pref_len),
2315 pref0_len, 60, UNI_DISPLAY_REGEX) :
2316 locinput - pref_len;
2317 int len0 = do_utf8 ? strlen(s0) : pref0_len;
2318 char *s1 = do_utf8 && OP(scan) != CANY ?
2319 pv_uni_display(dsv1, (U8*)(locinput - pref_len + pref0_len),
2320 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2321 locinput - pref_len + pref0_len;
2322 int len1 = do_utf8 ? strlen(s1) : pref_len - pref0_len;
2323 char *s2 = do_utf8 && OP(scan) != CANY ?
2324 pv_uni_display(dsv2, (U8*)locinput,
2325 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2326 locinput;
2327 int len2 = do_utf8 ? strlen(s2) : l;
2328 PerlIO_printf(Perl_debug_log,
2329 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2330 (IV)(locinput - PL_bostr),
2331 PL_colors[4],
2332 len0, s0,
2333 PL_colors[5],
2334 PL_colors[2],
2335 len1, s1,
2336 PL_colors[3],
2337 (docolor ? "" : "> <"),
2338 PL_colors[0],
2339 len2, s2,
2340 PL_colors[1],
2341 15 - l - pref_len + 1,
2342 "",
2343 (IV)(scan - PL_regprogram), PL_regindent*2, "",
2344 SvPVX(prop));
2345 }
2346 });
2347
2348 next = scan + NEXT_OFF(scan);
2349 if (next == scan)
2350 next = NULL;
2351
2352 switch (OP(scan)) {
2353 case BOL:
2354 if (locinput == PL_bostr || (PL_multiline &&
2355 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
2356 {
2357 /* regtill = regbol; */
2358 break;
2359 }
2360 sayNO;
2361 case MBOL:
2362 if (locinput == PL_bostr ||
2363 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2364 {
2365 break;
2366 }
2367 sayNO;
2368 case SBOL:
2369 if (locinput == PL_bostr)
2370 break;
2371 sayNO;
2372 case GPOS:
2373 if (locinput == PL_reg_ganch)
2374 break;
2375 sayNO;
2376 case EOL:
2377 if (PL_multiline)
2378 goto meol;
2379 else
2380 goto seol;
2381 case MEOL:
2382 meol:
2383 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2384 sayNO;
2385 break;
2386 case SEOL:
2387 seol:
2388 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2389 sayNO;
2390 if (PL_regeol - locinput > 1)
2391 sayNO;
2392 break;
2393 case EOS:
2394 if (PL_regeol != locinput)
2395 sayNO;
2396 break;
2397 case SANY:
2398 if (!nextchr && locinput >= PL_regeol)
2399 sayNO;
2400 if (do_utf8) {
2401 locinput += PL_utf8skip[nextchr];
2402 if (locinput > PL_regeol)
2403 sayNO;
2404 nextchr = UCHARAT(locinput);
2405 }
2406 else
2407 nextchr = UCHARAT(++locinput);
2408 break;
2409 case CANY:
2410 if (!nextchr && locinput >= PL_regeol)
2411 sayNO;
2412 nextchr = UCHARAT(++locinput);
2413 break;
2414 case REG_ANY:
2415 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2416 sayNO;
2417 if (do_utf8) {
2418 locinput += PL_utf8skip[nextchr];
2419 if (locinput > PL_regeol)
2420 sayNO;
2421 nextchr = UCHARAT(locinput);
2422 }
2423 else
2424 nextchr = UCHARAT(++locinput);
2425 break;
2426 case EXACT:
2427 s = STRING(scan);
2428 ln = STR_LEN(scan);
2429 if (do_utf8 != UTF) {
2430 /* The target and the pattern have differing utf8ness. */
2431 char *l = locinput;
2432 char *e = s + ln;
2433 STRLEN ulen;
2434
2435 if (do_utf8) {
2436 /* The target is utf8, the pattern is not utf8. */
2437 while (s < e) {
2438 if (l >= PL_regeol)
2439 sayNO;
2440 if (NATIVE_TO_UNI(*(U8*)s) !=
2441 utf8n_to_uvuni((U8*)l, UTF8_MAXLEN, &ulen,
2442 ckWARN(WARN_UTF8) ?
2443 0 : UTF8_ALLOW_ANY))
2444 sayNO;
2445 l += ulen;
2446 s ++;
2447 }
2448 }
2449 else {
2450 /* The target is not utf8, the pattern is utf8. */
2451 while (s < e) {
2452 if (l >= PL_regeol)
2453 sayNO;
2454 if (NATIVE_TO_UNI(*((U8*)l)) !=
2455 utf8n_to_uvuni((U8*)s, UTF8_MAXLEN, &ulen,
2456 ckWARN(WARN_UTF8) ?
2457 0 : UTF8_ALLOW_ANY))
2458 sayNO;
2459 s += ulen;
2460 l ++;
2461 }
2462 }
2463 locinput = l;
2464 nextchr = UCHARAT(locinput);
2465 break;
2466 }
2467 /* The target and the pattern have the same utf8ness. */
2468 /* Inline the first character, for speed. */
2469 if (UCHARAT(s) != nextchr)
2470 sayNO;
2471 if (PL_regeol - locinput < ln)
2472 sayNO;
2473 if (ln > 1 && memNE(s, locinput, ln))
2474 sayNO;
2475 locinput += ln;
2476 nextchr = UCHARAT(locinput);
2477 break;
2478 case EXACTFL:
2479 PL_reg_flags |= RF_tainted;
2480 /* FALL THROUGH */
2481 case EXACTF:
2482 s = STRING(scan);
2483 ln = STR_LEN(scan);
2484
2485 if (do_utf8 || UTF) {
2486 /* Either target or the pattern are utf8. */
2487 char *l = locinput;
2488 char *e = PL_regeol;
2489
2490 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
2491 l, &e, 0, do_utf8)) {
2492 /* One more case for the sharp s:
2493 * pack("U0U*", 0xDF) =~ /ss/i,
2494 * the 0xC3 0x9F are the UTF-8
2495 * byte sequence for the U+00DF. */
2496 if (!(do_utf8 &&
2497 toLOWER(s[0]) == 's' &&
2498 ln >= 2 &&
2499 toLOWER(s[1]) == 's' &&
2500 (U8)l[0] == 0xC3 &&
2501 e - l >= 2 &&
2502 (U8)l[1] == 0x9F))
2503 sayNO;
2504 }
2505 locinput = e;
2506 nextchr = UCHARAT(locinput);
2507 break;
2508 }
2509
2510 /* Neither the target and the pattern are utf8. */
2511
2512 /* Inline the first character, for speed. */
2513 if (UCHARAT(s) != nextchr &&
2514 UCHARAT(s) != ((OP(scan) == EXACTF)
2515 ? PL_fold : PL_fold_locale)[nextchr])
2516 sayNO;
2517 if (PL_regeol - locinput < ln)
2518 sayNO;
2519 if (ln > 1 && (OP(scan) == EXACTF
2520 ? ibcmp(s, locinput, ln)
2521 : ibcmp_locale(s, locinput, ln)))
2522 sayNO;
2523 locinput += ln;
2524 nextchr = UCHARAT(locinput);
2525 break;
2526 case ANYOF:
2527 if (do_utf8) {
2528 STRLEN inclasslen = PL_regeol - locinput;
2529
2530 if (!reginclass(scan, (U8*)locinput, &inclasslen, do_utf8))
2531 sayNO_ANYOF;
2532 if (locinput >= PL_regeol)
2533 sayNO;
2534 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
2535 nextchr = UCHARAT(locinput);
2536 break;
2537 }
2538 else {
2539 if (nextchr < 0)
2540 nextchr = UCHARAT(locinput);
2541 if (!REGINCLASS(scan, (U8*)locinput))
2542 sayNO_ANYOF;
2543 if (!nextchr && locinput >= PL_regeol)
2544 sayNO;
2545 nextchr = UCHARAT(++locinput);
2546 break;
2547 }
2548 no_anyof:
2549 /* If we might have the case of the German sharp s
2550 * in a casefolding Unicode character class. */
2551
2552 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
2553 locinput += SHARP_S_SKIP;
2554 nextchr = UCHARAT(locinput);
2555 }
2556 else
2557 sayNO;
2558 break;
2559 case ALNUML:
2560 PL_reg_flags |= RF_tainted;
2561 /* FALL THROUGH */
2562 case ALNUM:
2563 if (!nextchr)
2564 sayNO;
2565 if (do_utf8) {
2566 LOAD_UTF8_CHARCLASS(alnum,"a");
2567 if (!(OP(scan) == ALNUM
2568 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
2569 : isALNUM_LC_utf8((U8*)locinput)))
2570 {
2571 sayNO;
2572 }
2573 locinput += PL_utf8skip[nextchr];
2574 nextchr = UCHARAT(locinput);
2575 break;
2576 }
2577 if (!(OP(scan) == ALNUM
2578 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
2579 sayNO;
2580 nextchr = UCHARAT(++locinput);
2581 break;
2582 case NALNUML:
2583 PL_reg_flags |= RF_tainted;
2584 /* FALL THROUGH */
2585 case NALNUM:
2586 if (!nextchr && locinput >= PL_regeol)
2587 sayNO;
2588 if (do_utf8) {
2589 LOAD_UTF8_CHARCLASS(alnum,"a");
2590 if (OP(scan) == NALNUM
2591 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
2592 : isALNUM_LC_utf8((U8*)locinput))
2593 {
2594 sayNO;
2595 }
2596 locinput += PL_utf8skip[nextchr];
2597 nextchr = UCHARAT(locinput);
2598 break;
2599 }
2600 if (OP(scan) == NALNUM
2601 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
2602 sayNO;
2603 nextchr = UCHARAT(++locinput);
2604 break;
2605 case BOUNDL:
2606 case NBOUNDL:
2607 PL_reg_flags |= RF_tainted;
2608 /* FALL THROUGH */
2609 case BOUND:
2610 case NBOUND:
2611 /* was last char in word? */
2612 if (do_utf8) {
2613 if (locinput == PL_bostr)
2614 ln = '\n';
2615 else {
2616 U8 *r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
2617
2618 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
2619 }
2620 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2621 ln = isALNUM_uni(ln);
2622 LOAD_UTF8_CHARCLASS(alnum,"a");
2623 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
2624 }
2625 else {
2626 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
2627 n = isALNUM_LC_utf8((U8*)locinput);
2628 }
2629 }
2630 else {
2631 ln = (locinput != PL_bostr) ?
2632 UCHARAT(locinput - 1) : '\n';
2633 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2634 ln = isALNUM(ln);
2635 n = isALNUM(nextchr);
2636 }
2637 else {
2638 ln = isALNUM_LC(ln);
2639 n = isALNUM_LC(nextchr);
2640 }
2641 }
2642 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
2643 OP(scan) == BOUNDL))
2644 sayNO;
2645 break;
2646 case SPACEL:
2647 PL_reg_flags |= RF_tainted;
2648 /* FALL THROUGH */
2649 case SPACE:
2650 if (!nextchr)
2651 sayNO;
2652 if (do_utf8) {
2653 if (UTF8_IS_CONTINUED(nextchr)) {
2654 LOAD_UTF8_CHARCLASS(space," ");
2655 if (!(OP(scan) == SPACE
2656 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
2657 : isSPACE_LC_utf8((U8*)locinput)))
2658 {
2659 sayNO;
2660 }
2661 locinput += PL_utf8skip[nextchr];
2662 nextchr = UCHARAT(locinput);
2663 break;
2664 }
2665 if (!(OP(scan) == SPACE
2666 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2667 sayNO;
2668 nextchr = UCHARAT(++locinput);
2669 }
2670 else {
2671 if (!(OP(scan) == SPACE
2672 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2673 sayNO;
2674 nextchr = UCHARAT(++locinput);
2675 }
2676 break;
2677 case NSPACEL:
2678 PL_reg_flags |= RF_tainted;
2679 /* FALL THROUGH */
2680 case NSPACE:
2681 if (!nextchr && locinput >= PL_regeol)
2682 sayNO;
2683 if (do_utf8) {
2684 LOAD_UTF8_CHARCLASS(space," ");
2685 if (OP(scan) == NSPACE
2686 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
2687 : isSPACE_LC_utf8((U8*)locinput))
2688 {
2689 sayNO;
2690 }
2691 locinput += PL_utf8skip[nextchr];
2692 nextchr = UCHARAT(locinput);
2693 break;
2694 }
2695 if (OP(scan) == NSPACE
2696 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
2697 sayNO;
2698 nextchr = UCHARAT(++locinput);
2699 break;
2700 case DIGITL:
2701 PL_reg_flags |= RF_tainted;
2702 /* FALL THROUGH */
2703 case DIGIT:
2704 if (!nextchr)
2705 sayNO;
2706 if (do_utf8) {
2707 LOAD_UTF8_CHARCLASS(digit,"0");
2708 if (!(OP(scan) == DIGIT
2709 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
2710 : isDIGIT_LC_utf8((U8*)locinput)))
2711 {
2712 sayNO;
2713 }
2714 locinput += PL_utf8skip[nextchr];
2715 nextchr = UCHARAT(locinput);
2716 break;
2717 }
2718 if (!(OP(scan) == DIGIT
2719 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
2720 sayNO;
2721 nextchr = UCHARAT(++locinput);
2722 break;
2723 case NDIGITL:
2724 PL_reg_flags |= RF_tainted;
2725 /* FALL THROUGH */
2726 case NDIGIT:
2727 if (!nextchr && locinput >= PL_regeol)
2728 sayNO;
2729 if (do_utf8) {
2730 LOAD_UTF8_CHARCLASS(digit,"0");
2731 if (OP(scan) == NDIGIT
2732 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
2733 : isDIGIT_LC_utf8((U8*)locinput))
2734 {
2735 sayNO;
2736 }
2737 locinput += PL_utf8skip[nextchr];
2738 nextchr = UCHARAT(locinput);
2739 break;
2740 }
2741 if (OP(scan) == NDIGIT
2742 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
2743 sayNO;
2744 nextchr = UCHARAT(++locinput);
2745 break;
2746 case CLUMP:
2747 if (locinput >= PL_regeol)
2748 sayNO;
2749 if (do_utf8) {
2750 LOAD_UTF8_CHARCLASS(mark,"~");
2751 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2752 sayNO;
2753 locinput += PL_utf8skip[nextchr];
2754 while (locinput < PL_regeol &&
2755 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2756 locinput += UTF8SKIP(locinput);
2757 if (locinput > PL_regeol)
2758 sayNO;
2759 }
2760 else
2761 locinput++;
2762 nextchr = UCHARAT(locinput);
2763 break;
2764 case REFFL:
2765 PL_reg_flags |= RF_tainted;
2766 /* FALL THROUGH */
2767 case REF:
2768 case REFF:
2769 n = ARG(scan); /* which paren pair */
2770 ln = PL_regstartp[n];
2771 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2772 if ((I32)*PL_reglastparen < n || ln == -1)
2773 sayNO; /* Do not match unless seen CLOSEn. */
2774 if (ln == PL_regendp[n])
2775 break;
2776
2777 s = PL_bostr + ln;
2778 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
2779 char *l = locinput;
2780 char *e = PL_bostr + PL_regendp[n];
2781 /*
2782 * Note that we can't do the "other character" lookup trick as
2783 * in the 8-bit case (no pun intended) because in Unicode we
2784 * have to map both upper and title case to lower case.
2785 */
2786 if (OP(scan) == REFF) {
2787 STRLEN ulen1, ulen2;
2788 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
2789 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
2790 while (s < e) {
2791 if (l >= PL_regeol)
2792 sayNO;
2793 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
2794 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
2795 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
2796 sayNO;
2797 s += ulen1;
2798 l += ulen2;
2799 }
2800 }
2801 locinput = l;
2802 nextchr = UCHARAT(locinput);
2803 break;
2804 }
2805
2806 /* Inline the first character, for speed. */
2807 if (UCHARAT(s) != nextchr &&
2808 (OP(scan) == REF ||
2809 (UCHARAT(s) != ((OP(scan) == REFF
2810 ? PL_fold : PL_fold_locale)[nextchr]))))
2811 sayNO;
2812 ln = PL_regendp[n] - ln;
2813 if (locinput + ln > PL_regeol)
2814 sayNO;
2815 if (ln > 1 && (OP(scan) == REF
2816 ? memNE(s, locinput, ln)
2817 : (OP(scan) == REFF
2818 ? ibcmp(s, locinput, ln)
2819 : ibcmp_locale(s, locinput, ln))))
2820 sayNO;
2821 locinput += ln;
2822 nextchr = UCHARAT(locinput);
2823 break;
2824
2825 case NOTHING:
2826 case TAIL:
2827 break;
2828 case BACK:
2829 break;
2830 case EVAL:
2831 {
2832 dSP;
2833 OP_4tree *oop = PL_op;
2834 COP *ocurcop = PL_curcop;
2835 PAD *old_comppad;
2836 SV *ret;
2837 struct regexp *oreg = PL_reg_re;
2838
2839 n = ARG(scan);
2840 PL_op = (OP_4tree*)PL_regdata->data[n];
2841 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
2842 PAD_SAVE_LOCAL(old_comppad, (PAD*)PL_regdata->data[n + 2]);
2843 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
2844
2845 {
2846 SV **before = SP;
2847 CALLRUNOPS(aTHX); /* Scalar context. */
2848 SPAGAIN;
2849 if (SP == before)
2850 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
2851 else {
2852 ret = POPs;
2853 PUTBACK;
2854 }
2855 }
2856
2857 PL_op = oop;
2858 PAD_RESTORE_LOCAL(old_comppad);
2859 PL_curcop = ocurcop;
2860 if (logical) {
2861 if (logical == 2) { /* Postponed subexpression. */
2862 regexp *re;
2863 MAGIC *mg = Null(MAGIC*);
2864 re_cc_state state;
2865 CHECKPOINT cp, lastcp;
2866 int toggleutf;
2867 register SV *sv;
2868
2869 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
2870 mg = mg_find(sv, PERL_MAGIC_qr);
2871 else if (SvSMAGICAL(ret)) {
2872 if (SvGMAGICAL(ret))
2873 sv_unmagic(ret, PERL_MAGIC_qr);
2874 else
2875 mg = mg_find(ret, PERL_MAGIC_qr);
2876 }
2877
2878 if (mg) {
2879 re = (regexp *)mg->mg_obj;
2880 (void)ReREFCNT_inc(re);
2881 }
2882 else {
2883 STRLEN len;
2884 char *t = SvPV(ret, len);
2885 PMOP pm;
2886 char *oprecomp = PL_regprecomp;
2887 I32 osize = PL_regsize;
2888 I32 onpar = PL_regnpar;
2889
2890 Zero(&pm, 1, PMOP);
2891 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
2892 re = CALLREGCOMP(aTHX_ t, t + len, &pm);
2893 if (!(SvFLAGS(ret)
2894 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
2895 | SVs_GMG)))
2896 sv_magic(ret,(SV*)ReREFCNT_inc(re),
2897 PERL_MAGIC_qr,0,0);
2898 PL_regprecomp = oprecomp;
2899 PL_regsize = osize;
2900 PL_regnpar = onpar;
2901 }
2902 DEBUG_r(
2903 PerlIO_printf(Perl_debug_log,
2904 "Entering embedded `%s%.60s%s%s'\n",
2905 PL_colors[0],
2906 re->precomp,
2907 PL_colors[1],
2908 (strlen(re->precomp) > 60 ? "..." : ""))
2909 );
2910 state.node = next;
2911 state.prev = PL_reg_call_cc;
2912 state.cc = PL_regcc;
2913 state.re = PL_reg_re;
2914
2915 PL_regcc = 0;
2916
2917 cp = regcppush(0); /* Save *all* the positions. */
2918 REGCP_SET(lastcp);
2919 cache_re(re);
2920 state.ss = PL_savestack_ix;
2921 *PL_reglastparen = 0;
2922 *PL_reglastcloseparen = 0;
2923 PL_reg_call_cc = &state;
2924 PL_reginput = locinput;
2925 toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
2926 ((re->reganch & ROPT_UTF8) != 0);
2927 if (toggleutf) PL_reg_flags ^= RF_utf8;
2928
2929 /* XXXX This is too dramatic a measure... */
2930 PL_reg_maxiter = 0;
2931
2932 if (regmatch(re->program + 1)) {
2933 /* Even though we succeeded, we need to restore
2934 global variables, since we may be wrapped inside
2935 SUSPEND, thus the match may be not finished yet. */
2936
2937 /* XXXX Do this only if SUSPENDed? */
2938 PL_reg_call_cc = state.prev;
2939 PL_regcc = state.cc;
2940 PL_reg_re = state.re;
2941 cache_re(PL_reg_re);
2942 if (toggleutf) PL_reg_flags ^= RF_utf8;
2943
2944 /* XXXX This is too dramatic a measure... */
2945 PL_reg_maxiter = 0;
2946
2947 /* These are needed even if not SUSPEND. */
2948 ReREFCNT_dec(re);
2949 regcpblow(cp);
2950 sayYES;
2951 }
2952 ReREFCNT_dec(re);
2953 REGCP_UNWIND(lastcp);
2954 regcppop();
2955 PL_reg_call_cc = state.prev;
2956 PL_regcc = state.cc;
2957 PL_reg_re = state.re;
2958 cache_re(PL_reg_re);
2959 if (toggleutf) PL_reg_flags ^= RF_utf8;
2960
2961 /* XXXX This is too dramatic a measure... */
2962 PL_reg_maxiter = 0;
2963
2964 logical = 0;
2965 sayNO;
2966 }
2967 sw = SvTRUE(ret);
2968 logical = 0;
2969 }
2970 else {
2971 sv_setsv(save_scalar(PL_replgv), ret);
2972 cache_re(oreg);
2973 }
2974 break;
2975 }
2976 case OPEN:
2977 n = ARG(scan); /* which paren pair */
2978 PL_reg_start_tmp[n] = locinput;
2979 if (n > PL_regsize)
2980 PL_regsize = n;
2981 break;
2982 case CLOSE:
2983 n = ARG(scan); /* which paren pair */
2984 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
2985 PL_regendp[n] = locinput - PL_bostr;
2986 if (n > (I32)*PL_reglastparen)
2987 *PL_reglastparen = n;
2988 *PL_reglastcloseparen = n;
2989 break;
2990 case GROUPP:
2991 n = ARG(scan); /* which paren pair */
2992 sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
2993 break;
2994 case IFTHEN:
2995 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2996 if (sw)
2997 next = NEXTOPER(NEXTOPER(scan));
2998 else {
2999 next = scan + ARG(scan);
3000 if (OP(next) == IFTHEN) /* Fake one. */
3001 next = NEXTOPER(NEXTOPER(next));
3002 }
3003 break;
3004 case LOGICAL:
3005 logical = scan->flags;
3006 break;
3007 /*******************************************************************
3008 PL_regcc contains infoblock about the innermost (...)* loop, and
3009 a pointer to the next outer infoblock.
3010
3011 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3012
3013 1) After matching X, regnode for CURLYX is processed;
3014
3015 2) This regnode creates infoblock on the stack, and calls
3016 regmatch() recursively with the starting point at WHILEM node;
3017
3018 3) Each hit of WHILEM node tries to match A and Z (in the order
3019 depending on the current iteration, min/max of {min,max} and
3020 greediness). The information about where are nodes for "A"
3021 and "Z" is read from the infoblock, as is info on how many times "A"
3022 was already matched, and greediness.
3023
3024 4) After A matches, the same WHILEM node is hit again.
3025
3026 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
3027 of the same pair. Thus when WHILEM tries to match Z, it temporarily
3028 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
3029 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3030 of the external loop.
3031
3032 Currently present infoblocks form a tree with a stem formed by PL_curcc
3033 and whatever it mentions via ->next, and additional attached trees
3034 corresponding to temporarily unset infoblocks as in "5" above.
3035
3036 In the following picture infoblocks for outer loop of
3037 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3038 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3039 infoblocks are drawn below the "reset" infoblock.
3040
3041 In fact in the picture below we do not show failed matches for Z and T
3042 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3043 more obvious *why* one needs to *temporary* unset infoblocks.]
3044
3045 Matched REx position InfoBlocks Comment
3046 (Y(A)*?Z)*?T x
3047 Y(A)*?Z)*?T x <- O
3048 Y (A)*?Z)*?T x <- O
3049 Y A)*?Z)*?T x <- O <- I
3050 YA )*?Z)*?T x <- O <- I
3051 YA A)*?Z)*?T x <- O <- I
3052 YAA )*?Z)*?T x <- O <- I
3053 YAA Z)*?T x <- O # Temporary unset I
3054 I
3055
3056 YAAZ Y(A)*?Z)*?T x <- O
3057 I
3058
3059 YAAZY (A)*?Z)*?T x <- O
3060 I
3061
3062 YAAZY A)*?Z)*?T x <- O <- I
3063 I
3064
3065 YAAZYA )*?Z)*?T x <- O <- I
3066 I
3067
3068 YAAZYA Z)*?T x <- O # Temporary unset I
3069 I,I
3070
3071 YAAZYAZ )*?T x <- O
3072 I,I
3073
3074 YAAZYAZ T x # Temporary unset O
3075 O
3076 I,I
3077
3078 YAAZYAZT x
3079 O
3080 I,I
3081 *******************************************************************/
3082 case CURLYX: {
3083 CURCUR cc;
3084 CHECKPOINT cp = PL_savestack_ix;
3085 /* No need to save/restore up to this paren */
3086 I32 parenfloor = scan->flags;
3087
3088 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3089 next += ARG(next);
3090 cc.oldcc = PL_regcc;
3091 PL_regcc = &cc;
3092 /* XXXX Probably it is better to teach regpush to support
3093 parenfloor > PL_regsize... */
3094 if (parenfloor > (I32)*PL_reglastparen)
3095 parenfloor = *PL_reglastparen; /* Pessimization... */
3096 cc.parenfloor = parenfloor;
3097 cc.cur = -1;
3098 cc.min = ARG1(scan);
3099 cc.max = ARG2(scan);
3100 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3101 cc.next = next;
3102 cc.minmod = minmod;
3103 cc.lastloc = 0;
3104 PL_reginput = locinput;
3105 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
3106 regcpblow(cp);
3107 PL_regcc = cc.oldcc;
3108 saySAME(n);
3109 }
3110 /* NOT REACHED */
3111 case WHILEM: {
3112 /*
3113 * This is really hard to understand, because after we match
3114 * what we're trying to match, we must make sure the rest of
3115 * the REx is going to match for sure, and to do that we have
3116 * to go back UP the parse tree by recursing ever deeper. And
3117 * if it fails, we have to reset our parent's current state
3118 * that we can try again after backing off.
3119 */
3120
3121 CHECKPOINT cp, lastcp;
3122 CURCUR* cc = PL_regcc;
3123 char *lastloc = cc->lastloc; /* Detection of 0-len. */
3124
3125 n = cc->cur + 1; /* how many we know we matched */
3126 PL_reginput = locinput;
3127
3128 DEBUG_r(
3129 PerlIO_printf(Perl_debug_log,
3130 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3131 REPORT_CODE_OFF+PL_regindent*2, "",
3132 (long)n, (long)cc->min,
3133 (long)cc->max, PTR2UV(cc))
3134 );
3135
3136 /* If degenerate scan matches "", assume scan done. */
3137
3138 if (locinput == cc->lastloc && n >= cc->min) {
3139 PL_regcc = cc->oldcc;
3140 if (PL_regcc)
3141 ln = PL_regcc->cur;
3142 DEBUG_r(
3143 PerlIO_printf(Perl_debug_log,
3144 "%*s empty match detected, try continuation...\n",
3145 REPORT_CODE_OFF+PL_regindent*2, "")
3146 );
3147 if (regmatch(cc->next))
3148 sayYES;
3149 if (PL_regcc)
3150 PL_regcc->cur = ln;
3151 PL_regcc = cc;
3152 sayNO;
3153 }
3154
3155 /* First just match a string of min scans. */
3156
3157 if (n < cc->min) {
3158 cc->cur = n;
3159 cc->lastloc = locinput;
3160 if (regmatch(cc->scan))
3161 sayYES;
3162 cc->cur = n - 1;
3163 cc->lastloc = lastloc;
3164 sayNO;
3165 }
3166
3167 if (scan->flags) {
3168 /* Check whether we already were at this position.
3169 Postpone detection until we know the match is not
3170 *that* much linear. */
3171 if (!PL_reg_maxiter) {
3172 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
3173 PL_reg_leftiter = PL_reg_maxiter;
3174 }
3175 if (PL_reg_leftiter-- == 0) {
3176 I32 size = (PL_reg_maxiter + 7)/8;
3177 if (PL_reg_poscache) {
3178 if ((I32)PL_reg_poscache_size < size) {
3179 Renew(PL_reg_poscache, size, char);
3180 PL_reg_poscache_size = size;
3181 }
3182 Zero(PL_reg_poscache, size, char);
3183 }
3184 else {
3185 PL_reg_poscache_size = size;
3186 Newz(29, PL_reg_poscache, size, char);
3187 }
3188 DEBUG_r(
3189 PerlIO_printf(Perl_debug_log,
3190 "%sDetected a super-linear match, switching on caching%s...\n",
3191 PL_colors[4], PL_colors[5])
3192 );
3193 }
3194 if (PL_reg_leftiter < 0) {
3195 I32 o = locinput - PL_bostr, b;
3196
3197 o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4);
3198 b = o % 8;
3199 o /= 8;
3200 if (PL_reg_poscache[o] & (1<<b)) {
3201 DEBUG_r(
3202 PerlIO_printf(Perl_debug_log,
3203 "%*s already tried at this position...\n",
3204 REPORT_CODE_OFF+PL_regindent*2, "")
3205 );
3206 if (PL_reg_flags & RF_false)
3207 sayYES;
3208 else
3209 sayNO_SILENT;
3210 }
3211 PL_reg_poscache[o] |= (1<<b);
3212 }
3213 }
3214
3215 /* Prefer next over scan for minimal matching. */
3216
3217 if (cc->minmod) {
3218 PL_regcc = cc->oldcc;
3219 if (PL_regcc)
3220 ln = PL_regcc->cur;
3221 cp = regcppush(cc->parenfloor);
3222 REGCP_SET(lastcp);
3223 if (regmatch(cc->next)) {
3224 regcpblow(cp);
3225 sayYES; /* All done. */
3226 }
3227 REGCP_UNWIND(lastcp);
3228 regcppop();
3229 if (PL_regcc)
3230 PL_regcc->cur = ln;
3231 PL_regcc = cc;
3232
3233 if (n >= cc->max) { /* Maximum greed exceeded? */
3234 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3235 && !(PL_reg_flags & RF_warned)) {
3236 PL_reg_flags |= RF_warned;
3237 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
3238 "Complex regular subexpression recursion",
3239 REG_INFTY - 1);
3240 }
3241 sayNO;
3242 }
3243
3244 DEBUG_r(
3245 PerlIO_printf(Perl_debug_log,
3246 "%*s trying longer...\n",
3247 REPORT_CODE_OFF+PL_regindent*2, "")
3248 );
3249 /* Try scanning more and see if it helps. */
3250 PL_reginput = locinput;
3251 cc->cur = n;
3252 cc->lastloc = locinput;
3253 cp = regcppush(cc->parenfloor);
3254 REGCP_SET(lastcp);
3255 if (regmatch(cc->scan)) {
3256 regcpblow(cp);
3257 sayYES;
3258 }
3259 REGCP_UNWIND(lastcp);
3260 regcppop();
3261 cc->cur = n - 1;
3262 cc->lastloc = lastloc;
3263 sayNO;
3264 }
3265
3266 /* Prefer scan over next for maximal matching. */
3267
3268 if (n < cc->max) { /* More greed allowed? */
3269 cp = regcppush(cc->parenfloor);
3270 cc->cur = n;
3271 cc->lastloc = locinput;
3272 REGCP_SET(lastcp);
3273 if (regmatch(cc->scan)) {
3274 regcpblow(cp);
3275 sayYES;
3276 }
3277 REGCP_UNWIND(lastcp);
3278 regcppop(); /* Restore some previous $<digit>s? */
3279 PL_reginput = locinput;
3280 DEBUG_r(
3281 PerlIO_printf(Perl_debug_log,
3282 "%*s failed, try continuation...\n",
3283 REPORT_CODE_OFF+PL_regindent*2, "")
3284 );
3285 }
3286 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3287 && !(PL_reg_flags & RF_warned)) {
3288 PL_reg_flags |= RF_warned;
3289 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
3290 "Complex regular subexpression recursion",
3291 REG_INFTY - 1);
3292 }
3293
3294 /* Failed deeper matches of scan, so see if this one works. */
3295 PL_regcc = cc->oldcc;
3296 if (PL_regcc)
3297 ln = PL_regcc->cur;
3298 if (regmatch(cc->next))
3299 sayYES;
3300 if (PL_regcc)
3301 PL_regcc->cur = ln;
3302 PL_regcc = cc;
3303 cc->cur = n - 1;
3304 cc->lastloc = lastloc;
3305 sayNO;
3306 }
3307 /* NOT REACHED */
3308 case BRANCHJ:
3309 next = scan + ARG(scan);
3310 if (next == scan)
3311 next = NULL;
3312 inner = NEXTOPER(NEXTOPER(scan));
3313 goto do_branch;
3314 case BRANCH:
3315 inner = NEXTOPER(scan);
3316 do_branch:
3317 {
3318 c1 = OP(scan);
3319 if (OP(next) != c1) /* No choice. */
3320 next = inner; /* Avoid recursion. */
3321 else {
3322 I32 lastparen = *PL_reglastparen;
3323 I32 unwind1;
3324 re_unwind_branch_t *uw;
3325
3326 /* Put unwinding data on stack */
3327 unwind1 = SSNEWt(1,re_unwind_branch_t);
3328 uw = SSPTRt(unwind1,re_unwind_branch_t);
3329 uw->prev = unwind;
3330 unwind = unwind1;
3331 uw->type = ((c1 == BRANCH)
3332 ? RE_UNWIND_BRANCH
3333 : RE_UNWIND_BRANCHJ);
3334 uw->lastparen = lastparen;
3335 uw->next = next;
3336 uw->locinput = locinput;
3337 uw->nextchr = nextchr;
3338 #ifdef DEBUGGING
3339 uw->regindent = ++PL_regindent;
3340 #endif
3341
3342 REGCP_SET(uw->lastcp);
3343
3344 /* Now go into the first branch */
3345 next = inner;
3346 }
3347 }
3348 break;
3349 case MINMOD:
3350 minmod = 1;
3351 break;
3352 case CURLYM:
3353 {
3354 I32 l = 0;
3355 CHECKPOINT lastcp;
3356
3357 /* We suppose that the next guy does not need
3358 backtracking: in particular, it is of constant length,
3359 and has no parenths to influence future backrefs. */
3360 ln = ARG1(scan); /* min to match */
3361 n = ARG2(scan); /* max to match */
3362 paren = scan->flags;
3363 if (paren) {
3364 if (paren > PL_regsize)
3365 PL_regsize = paren;
3366 if (paren > (I32)*PL_reglastparen)
3367 *PL_reglastparen = paren;
3368 }
3369 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3370 if (paren)
3371 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3372 PL_reginput = locinput;
3373 if (minmod) {
3374 minmod = 0;
3375 if (ln && regrepeat_hard(scan, ln, &l) < ln)
3376 sayNO;
3377 /* if we matched something zero-length we don't need to
3378 backtrack - capturing parens are already defined, so
3379 the caveat in the maximal case doesn't apply
3380
3381 XXXX if ln == 0, we can redo this check first time
3382 through the following loop
3383 */
3384 if (ln && l == 0)
3385 n = ln; /* don't backtrack */
3386 locinput = PL_reginput;
3387 if (HAS_TEXT(next) || JUMPABLE(next)) {
3388 regnode *text_node = next;
3389
3390 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
3391
3392 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
3393 else {
3394 if (PL_regkind[(U8)OP(text_node)] == REF) {
3395 c1 = c2 = -1000;
3396 goto assume_ok_MM;
3397 }
3398 else { c1 = (U8)*STRING(text_node); }
3399 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
3400 c2 = PL_fold[c1];
3401 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3402 c2 = PL_fold_locale[c1];
3403 else
3404 c2 = c1;
3405 }
3406 }
3407 else
3408 c1 = c2 = -1000;
3409 assume_ok_MM:
3410 REGCP_SET(lastcp);
3411 /* This may be improved if l == 0. */
3412 while (n >= ln || (n == REG_INFTY && ln > 0 && l)) { /* ln overflow ? */
3413 /* If it could work, try it. */
3414 if (c1 == -1000 ||
3415 UCHARAT(PL_reginput) == c1 ||
3416 UCHARAT(PL_reginput) == c2)
3417 {
3418 if (paren) {
3419 if (ln) {
3420 PL_regstartp[paren] =
3421 HOPc(PL_reginput, -l) - PL_bostr;
3422 PL_regendp[paren] = PL_reginput - PL_bostr;
3423 }
3424 else
3425 PL_regendp[paren] = -1;
3426 }
3427 if (regmatch(next))
3428 sayYES;
3429 REGCP_UNWIND(lastcp);
3430 }
3431 /* Couldn't or didn't -- move forward. */
3432 PL_reginput = locinput;
3433 if (regrepeat_hard(scan, 1, &l)) {
3434 ln++;
3435 locinput = PL_reginput;
3436 }
3437 else
3438 sayNO;
3439 }
3440 }
3441 else {
3442 n = regrepeat_hard(scan, n, &l);
3443 /* if we matched something zero-length we don't need to
3444 backtrack, unless the minimum count is zero and we
3445 are capturing the result - in that case the capture
3446 being defined or not may affect later execution
3447 */
3448 if (n != 0 && l == 0 && !(paren && ln == 0))
3449 ln = n; /* don't backtrack */
3450 locinput = PL_reginput;
3451 DEBUG_r(
3452 PerlIO_printf(Perl_debug_log,
3453 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
3454 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
3455 (IV) n, (IV)l)
3456 );
3457 if (n >= ln) {
3458 if (HAS_TEXT(next) || JUMPABLE(next)) {
3459 regnode *text_node = next;
3460
3461 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
3462
3463 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
3464 else {
3465 if (PL_regkind[(U8)OP(text_node)] == REF) {
3466 c1 = c2 = -1000;
3467 goto assume_ok_REG;
3468 }
3469 else { c1 = (U8)*STRING(text_node); }
3470
3471 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
3472 c2 = PL_fold[c1];
3473 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3474 c2 = PL_fold_locale[c1];
3475 else
3476 c2 = c1;
3477 }
3478 }
3479 else
3480 c1 = c2 = -1000;
3481 }
3482 assume_ok_REG:
3483 REGCP_SET(lastcp);
3484 while (n >= ln) {
3485 /* If it could work, try it. */
3486 if (c1 == -1000 ||
3487 UCHARAT(PL_reginput) == c1 ||
3488 UCHARAT(PL_reginput) == c2)
3489 {
3490 DEBUG_r(
3491 PerlIO_printf(Perl_debug_log,
3492 "%*s trying tail with n=%"IVdf"...\n",
3493 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
3494 );
3495 if (paren) {
3496 if (n) {
3497 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
3498 PL_regendp[paren] = PL_reginput - PL_bostr;
3499 }
3500 else
3501 PL_regendp[paren] = -1;
3502 }
3503 if (regmatch(next))
3504 sayYES;
3505 REGCP_UNWIND(lastcp);
3506 }
3507 /* Couldn't or didn't -- back up. */
3508 n--;
3509 locinput = HOPc(locinput, -l);
3510 PL_reginput = locinput;
3511 }
3512 }
3513 sayNO;
3514 break;
3515 }
3516 case CURLYN:
3517 paren = scan->flags; /* Which paren to set */
3518 if (paren > PL_regsize)
3519 PL_regsize = paren;
3520 if (paren > (I32)*PL_reglastparen)
3521 *PL_reglastparen = paren;
3522 ln = ARG1(scan); /* min to match */
3523 n = ARG2(scan); /* max to match */
3524 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
3525 goto repeat;
3526 case CURLY:
3527 paren = 0;
3528 ln = ARG1(scan); /* min to match */
3529 n = ARG2(scan); /* max to match */
3530 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3531 goto repeat;
3532 case STAR:
3533 ln = 0;
3534 n = REG_INFTY;
3535 scan = NEXTOPER(scan);
3536 paren = 0;
3537 goto repeat;
3538 case PLUS:
3539 ln = 1;
3540 n = REG_INFTY;
3541 scan = NEXTOPER(scan);
3542 paren = 0;
3543 repeat:
3544 /*
3545 * Lookahead to avoid useless match attempts
3546 * when we know what character comes next.
3547 */
3548
3549 /*
3550 * Used to only do .*x and .*?x, but now it allows
3551 * for )'s, ('s and (?{ ... })'s to be in the way
3552 * of the quantifier and the EXACT-like node. -- japhy
3553 */
3554
3555 if (HAS_TEXT(next) || JUMPABLE(next)) {
3556 U8 *s;
3557 regnode *text_node = next;
3558
3559 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
3560
3561 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
3562 else {
3563 if (PL_regkind[(U8)OP(text_node)] == REF) {
3564 c1 = c2 = -1000;
3565 goto assume_ok_easy;
3566 }
3567 else { s = (U8*)STRING(text_node); }
3568
3569 if (!UTF) {
3570 c2 = c1 = *s;
3571 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
3572 c2 = PL_fold[c1];
3573 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3574 c2 = PL_fold_locale[c1];
3575 }
3576 else { /* UTF */
3577 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
3578 STRLEN ulen1, ulen2;
3579 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
3580 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
3581
3582 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
3583 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
3584
3585 c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXLEN, 0,
3586 ckWARN(WARN_UTF8) ?
3587 0 : UTF8_ALLOW_ANY);
3588 c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXLEN, 0,
3589 ckWARN(WARN_UTF8) ?
3590 0 : UTF8_ALLOW_ANY);
3591 }
3592 else {
3593 c2 = c1 = utf8n_to_uvchr(s, UTF8_MAXLEN, 0,
3594 ckWARN(WARN_UTF8) ?
3595 0 : UTF8_ALLOW_ANY);
3596 }
3597 }
3598 }
3599 }
3600 else
3601 c1 = c2 = -1000;
3602 assume_ok_easy:
3603 PL_reginput = locinput;
3604 if (minmod) {
3605 CHECKPOINT lastcp;
3606 minmod = 0;
3607 if (ln && regrepeat(scan, ln) < ln)
3608 sayNO;
3609 locinput = PL_reginput;
3610 REGCP_SET(lastcp);
3611 if (c1 != -1000) {
3612 char *e; /* Should not check after this */
3613 char *old = locinput;
3614 int count = 0;
3615
3616 if (n == REG_INFTY) {
3617 e = PL_regeol - 1;
3618 if (do_utf8)
3619 while (UTF8_IS_CONTINUATION(*(U8*)e))
3620 e--;
3621 }
3622 else if (do_utf8) {
3623 int m = n - ln;
3624 for (e = locinput;
3625 m >0 && e + UTF8SKIP(e) <= PL_regeol; m--)
3626 e += UTF8SKIP(e);
3627 }
3628 else {
3629 e = locinput + n - ln;
3630 if (e >= PL_regeol)
3631 e = PL_regeol - 1;
3632 }
3633 while (1) {
3634 /* Find place 'next' could work */
3635 if (!do_utf8) {
3636 if (c1 == c2) {
3637 while (locinput <= e &&
3638 UCHARAT(locinput) != c1)
3639 locinput++;
3640 } else {
3641 while (locinput <= e
3642 && UCHARAT(locinput) != c1
3643 && UCHARAT(locinput) != c2)
3644 locinput++;
3645 }
3646 count = locinput - old;
3647 }
3648 else {
3649 STRLEN len;
3650 if (c1 == c2) {
3651 /* count initialised to
3652 * utf8_distance(old, locinput) */
3653 while (locinput <= e &&
3654 utf8n_to_uvchr((U8*)locinput,
3655 UTF8_MAXLEN, &len,
3656 ckWARN(WARN_UTF8) ?
3657 0 : UTF8_ALLOW_ANY) != (UV)c1) {
3658 locinput += len;
3659 count++;
3660 }
3661 } else {
3662 /* count initialised to
3663 * utf8_distance(old, locinput) */
3664 while (locinput <= e) {
3665 UV c = utf8n_to_uvchr((U8*)locinput,
3666 UTF8_MAXLEN, &len,
3667 ckWARN(WARN_UTF8) ?
3668 0 : UTF8_ALLOW_ANY);
3669 if (c == (UV)c1 || c == (UV)c2)
3670 break;
3671 locinput += len;
3672 count++;
3673 }
3674 }
3675 }
3676 if (locinput > e)
3677 sayNO;
3678 /* PL_reginput == old now */
3679 if (locinput != old) {
3680 ln = 1; /* Did some */
3681 if (regrepeat(scan, count) < count)
3682 sayNO;
3683 }
3684 /* PL_reginput == locinput now */
3685 TRYPAREN(paren, ln, locinput);
3686 PL_reginput = locinput; /* Could be reset... */
3687 REGCP_UNWIND(lastcp);
3688 /* Couldn't or didn't -- move forward. */
3689 old = locinput;
3690 if (do_utf8)
3691 locinput += UTF8SKIP(locinput);
3692 else
3693 locinput++;
3694 count = 1;
3695 }
3696 }
3697 else
3698 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
3699 UV c;
3700 if (c1 != -1000) {
3701 if (do_utf8)
3702 c = utf8n_to_uvchr((U8*)PL_reginput,
3703 UTF8_MAXLEN, 0,
3704 ckWARN(WARN_UTF8) ?
3705 0 : UTF8_ALLOW_ANY);
3706 else
3707 c = UCHARAT(PL_reginput);
3708 /* If it could work, try it. */
3709 if (c == (UV)c1 || c == (UV)c2)
3710 {
3711 TRYPAREN(paren, ln, PL_reginput);
3712 REGCP_UNWIND(lastcp);
3713 }
3714 }
3715 /* If it could work, try it. */
3716 else if (c1 == -1000)
3717 {
3718 TRYPAREN(paren, ln, PL_reginput);
3719 REGCP_UNWIND(lastcp);
3720 }
3721 /* Couldn't or didn't -- move forward. */
3722 PL_reginput = locinput;
3723 if (regrepeat(scan, 1)) {
3724 ln++;
3725 locinput = PL_reginput;
3726 }
3727 else
3728 sayNO;
3729 }
3730 }
3731 else {
3732 CHECKPOINT lastcp;
3733 n = regrepeat(scan, n);
3734 locinput = PL_reginput;
3735 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
3736 ((!PL_multiline && OP(next) != MEOL) ||
3737 OP(next) == SEOL || OP(next) == EOS))
3738 {
3739 ln = n; /* why back off? */
3740 /* ...because $ and \Z can match before *and* after
3741 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
3742 We should back off by one in this case. */
3743 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
3744 ln--;
3745 }
3746 REGCP_SET(lastcp);
3747 if (paren) {
3748 UV c = 0;
3749 while (n >= ln) {
3750 if (c1 != -1000) {
3751 if (do_utf8)
3752 c = utf8n_to_uvchr((U8*)PL_reginput,
3753 UTF8_MAXLEN, 0,
3754 ckWARN(WARN_UTF8) ?
3755 0 : UTF8_ALLOW_ANY);
3756 else
3757 c = UCHARAT(PL_reginput);
3758 }
3759 /* If it could work, try it. */
3760 if (c1 == -1000 || c == (UV)c1 || c == (UV)c2)
3761 {
3762 TRYPAREN(paren, n, PL_reginput);
3763 REGCP_UNWIND(lastcp);
3764 }
3765 /* Couldn't or didn't -- back up. */
3766 n--;
3767 PL_reginput = locinput = HOPc(locinput, -1);
3768 }
3769 }
3770 else {
3771 UV c = 0;
3772 while (n >= ln) {
3773 if (c1 != -1000) {
3774 if (do_utf8)
3775 c = utf8n_to_uvchr((U8*)PL_reginput,
3776 UTF8_MAXLEN, 0,
3777 ckWARN(WARN_UTF8) ?
3778 0 : UTF8_ALLOW_ANY);
3779 else
3780 c = UCHARAT(PL_reginput);
3781 }
3782 /* If it could work, try it. */
3783 if (c1 == -1000 || c == (UV)c1 || c == (UV)c2)
3784 {
3785 TRYPAREN(paren, n, PL_reginput);
3786 REGCP_UNWIND(lastcp);
3787 }
3788 /* Couldn't or didn't -- back up. */
3789 n--;
3790 PL_reginput = locinput = HOPc(locinput, -1);
3791 }
3792 }
3793 }
3794 sayNO;
3795 break;
3796 case END:
3797 if (PL_reg_call_cc) {
3798 re_cc_state *cur_call_cc = PL_reg_call_cc;
3799 CURCUR *cctmp = PL_regcc;
3800 regexp *re = PL_reg_re;
3801 CHECKPOINT cp, lastcp;
3802
3803 cp = regcppush(0); /* Save *all* the positions. */
3804 REGCP_SET(lastcp);
3805 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
3806 the caller. */
3807 PL_reginput = locinput; /* Make position available to
3808 the callcc. */
3809 cache_re(PL_reg_call_cc->re);
3810 PL_regcc = PL_reg_call_cc->cc;
3811 PL_reg_call_cc = PL_reg_call_cc->prev;
3812 if (regmatch(cur_call_cc->node)) {
3813 PL_reg_call_cc = cur_call_cc;
3814 regcpblow(cp);
3815 sayYES;
3816 }
3817 REGCP_UNWIND(lastcp);
3818 regcppop();
3819 PL_reg_call_cc = cur_call_cc;
3820 PL_regcc = cctmp;
3821 PL_reg_re = re;
3822 cache_re(re);
3823
3824 DEBUG_r(
3825 PerlIO_printf(Perl_debug_log,
3826 "%*s continuation failed...\n",
3827 REPORT_CODE_OFF+PL_regindent*2, "")
3828 );
3829 sayNO_SILENT;
3830 }
3831 if (locinput < PL_regtill) {
3832 DEBUG_r(PerlIO_printf(Perl_debug_log,
3833 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
3834 PL_colors[4],
3835 (long)(locinput - PL_reg_starttry),
3836 (long)(PL_regtill - PL_reg_starttry),
3837 PL_colors[5]));
3838 sayNO_FINAL; /* Cannot match: too short. */
3839 }
3840 PL_reginput = locinput; /* put where regtry can find it */
3841 sayYES_FINAL; /* Success! */
3842 case SUCCEED:
3843 PL_reginput = locinput; /* put where regtry can find it */
3844 sayYES_LOUD; /* Success! */
3845 case SUSPEND:
3846 n = 1;
3847 PL_reginput = locinput;
3848 goto do_ifmatch;
3849 case UNLESSM:
3850 n = 0;
3851 if (scan->flags) {
3852 s = HOPBACKc(locinput, scan->flags);
3853 if (!s)
3854 goto say_yes;
3855 PL_reginput = s;
3856 }
3857 else
3858 PL_reginput = locinput;
3859 PL_reg_flags ^= RF_false;
3860 goto do_ifmatch;
3861 case IFMATCH:
3862 n = 1;
3863 if (scan->flags) {
3864 s = HOPBACKc(locinput, scan->flags);
3865 if (!s)
3866 goto say_no;
3867 PL_reginput = s;
3868 }
3869 else
3870 PL_reginput = locinput;
3871
3872 do_ifmatch:
3873 inner = NEXTOPER(NEXTOPER(scan));
3874 if (regmatch(inner) != n) {
3875 if (n == 0)
3876 PL_reg_flags ^= RF_false;
3877 say_no:
3878 if (logical) {
3879 logical = 0;
3880 sw = 0;
3881 goto do_longjump;
3882 }
3883 else
3884 sayNO;
3885 }
3886 if (n == 0)
3887 PL_reg_flags ^= RF_false;
3888 say_yes:
3889 if (logical) {
3890 logical = 0;
3891 sw = 1;
3892 }
3893 if (OP(scan) == SUSPEND) {
3894 locinput = PL_reginput;
3895 nextchr = UCHARAT(locinput);
3896 }
3897 /* FALL THROUGH. */
3898 case LONGJMP:
3899 do_longjump:
3900 next = scan + ARG(scan);
3901 if (next == scan)
3902 next = NULL;
3903 break;
3904 default:
3905 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
3906 PTR2UV(scan), OP(scan));
3907 Perl_croak(aTHX_ "regexp memory corruption");
3908 }
3909 reenter:
3910 scan = next;
3911 }
3912
3913 /*
3914 * We get here only if there's trouble -- normally "case END" is
3915 * the terminating point.
3916 */
3917 Perl_croak(aTHX_ "corrupted regexp pointers");
3918 /*NOTREACHED*/
3919 sayNO;
3920
3921 yes_loud:
3922 DEBUG_r(
3923 PerlIO_printf(Perl_debug_log,
3924 "%*s %scould match...%s\n",
3925 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],PL_colors[5])
3926 );
3927 goto yes;
3928 yes_final:
3929 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
3930 PL_colors[4],PL_colors[5]));
3931 yes:
3932 #ifdef DEBUGGING
3933 PL_regindent--;
3934 #endif
3935
3936 #if 0 /* Breaks $^R */
3937 if (unwind)
3938 regcpblow(firstcp);
3939 #endif
3940 return 1;
3941
3942 no:
3943 DEBUG_r(
3944 PerlIO_printf(Perl_debug_log,
3945 "%*s %sfailed...%s\n",
3946 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],PL_colors[5])
3947 );
3948 goto do_no;
3949 no_final:
3950 do_no:
3951 if (unwind) {
3952 re_unwind_t *uw = SSPTRt(unwind,re_unwind_t);
3953
3954 switch (uw->type) {
3955 case RE_UNWIND_BRANCH:
3956 case RE_UNWIND_BRANCHJ:
3957 {
3958 re_unwind_branch_t *uwb = &(uw->branch);
3959 I32 lastparen = uwb->lastparen;
3960
3961 REGCP_UNWIND(uwb->lastcp);
3962 for (n = *PL_reglastparen; n > lastparen; n--)
3963 PL_regendp[n] = -1;
3964 *PL_reglastparen = n;
3965 scan = next = uwb->next;
3966 if ( !scan ||
3967 OP(scan) != (uwb->type == RE_UNWIND_BRANCH
3968 ? BRANCH : BRANCHJ) ) { /* Failure */
3969 unwind = uwb->prev;
3970 #ifdef DEBUGGING
3971 PL_regindent--;
3972 #endif
3973 goto do_no;
3974 }
3975 /* Have more choice yet. Reuse the same uwb. */
3976 /*SUPPRESS 560*/
3977 if ((n = (uwb->type == RE_UNWIND_BRANCH
3978 ? NEXT_OFF(next) : ARG(next))))
3979 next += n;
3980 else
3981 next = NULL; /* XXXX Needn't unwinding in this case... */
3982 uwb->next = next;
3983 next = NEXTOPER(scan);
3984 if (uwb->type == RE_UNWIND_BRANCHJ)
3985 next = NEXTOPER(next);
3986 locinput = uwb->locinput;
3987 nextchr = uwb->nextchr;
3988 #ifdef DEBUGGING
3989 PL_regindent = uwb->regindent;
3990 #endif
3991
3992 goto reenter;
3993 }
3994 /* NOT REACHED */
3995 default:
3996 Perl_croak(aTHX_ "regexp unwind memory corruption");
3997 }
3998 /* NOT REACHED */
3999 }
4000 #ifdef DEBUGGING
4001 PL_regindent--;
4002 #endif
4003 return 0;
4004 }
4005
4006 /*
4007 - regrepeat - repeatedly match something simple, report how many
4008 */
4009 /*
4010 * [This routine now assumes that it will only match on things of length 1.
4011 * That was true before, but now we assume scan - reginput is the count,
4012 * rather than incrementing count on every character. [Er, except utf8.]]
4013 */
4014 STATIC I32
S_regrepeat(pTHX_ regnode * p,I32 max)4015 S_regrepeat(pTHX_ regnode *p, I32 max)
4016 {
4017 register char *scan;
4018 register I32 c;
4019 register char *loceol = PL_regeol;
4020 register I32 hardcount = 0;
4021 register bool do_utf8 = PL_reg_match_utf8;
4022
4023 scan = PL_reginput;
4024 if (max == REG_INFTY)
4025 max = I32_MAX;
4026 else if (max < loceol - scan)
4027 loceol = scan + max;
4028 switch (OP(p)) {
4029 case REG_ANY:
4030 if (do_utf8) {
4031 loceol = PL_regeol;
4032 while (scan < loceol && hardcount < max && *scan != '\n') {
4033 scan += UTF8SKIP(scan);
4034 hardcount++;
4035 }
4036 } else {
4037 while (scan < loceol && *scan != '\n')
4038 scan++;
4039 }
4040 break;
4041 case SANY:
4042 if (do_utf8) {
4043 loceol = PL_regeol;
4044 while (scan < loceol && hardcount < max) {
4045 scan += UTF8SKIP(scan);
4046 hardcount++;
4047 }
4048 }
4049 else
4050 scan = loceol;
4051 break;
4052 case CANY:
4053 scan = loceol;
4054 break;
4055 case EXACT: /* length of string is 1 */
4056 c = (U8)*STRING(p);
4057 while (scan < loceol && UCHARAT(scan) == c)
4058 scan++;
4059 break;
4060 case EXACTF: /* length of string is 1 */
4061 c = (U8)*STRING(p);
4062 while (scan < loceol &&
4063 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
4064 scan++;
4065 break;
4066 case EXACTFL: /* length of string is 1 */
4067 PL_reg_flags |= RF_tainted;
4068 c = (U8)*STRING(p);
4069 while (scan < loceol &&
4070 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
4071 scan++;
4072 break;
4073 case ANYOF:
4074 if (do_utf8) {
4075 loceol = PL_regeol;
4076 while (hardcount < max && scan < loceol &&
4077 reginclass(p, (U8*)scan, 0, do_utf8)) {
4078 scan += UTF8SKIP(scan);
4079 hardcount++;
4080 }
4081 } else {
4082 while (scan < loceol && REGINCLASS(p, (U8*)scan))
4083 scan++;
4084 }
4085 break;
4086 case ALNUM:
4087 if (do_utf8) {
4088 loceol = PL_regeol;
4089 LOAD_UTF8_CHARCLASS(alnum,"a");
4090 while (hardcount < max && scan < loceol &&
4091 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
4092 scan += UTF8SKIP(scan);
4093 hardcount++;
4094 }
4095 } else {
4096 while (scan < loceol && isALNUM(*scan))
4097 scan++;
4098 }
4099 break;
4100 case ALNUML:
4101 PL_reg_flags |= RF_tainted;
4102 if (do_utf8) {
4103 loceol = PL_regeol;
4104 while (hardcount < max && scan < loceol &&
4105 isALNUM_LC_utf8((U8*)scan)) {
4106 scan += UTF8SKIP(scan);
4107 hardcount++;
4108 }
4109 } else {
4110 while (scan < loceol && isALNUM_LC(*scan))
4111 scan++;
4112 }
4113 break;
4114 case NALNUM:
4115 if (do_utf8) {
4116 loceol = PL_regeol;
4117 LOAD_UTF8_CHARCLASS(alnum,"a");
4118 while (hardcount < max && scan < loceol &&
4119 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
4120 scan += UTF8SKIP(scan);
4121 hardcount++;
4122 }
4123 } else {
4124 while (scan < loceol && !isALNUM(*scan))
4125 scan++;
4126 }
4127 break;
4128 case NALNUML:
4129 PL_reg_flags |= RF_tainted;
4130 if (do_utf8) {
4131 loceol = PL_regeol;
4132 while (hardcount < max && scan < loceol &&
4133 !isALNUM_LC_utf8((U8*)scan)) {
4134 scan += UTF8SKIP(scan);
4135 hardcount++;
4136 }
4137 } else {
4138 while (scan < loceol && !isALNUM_LC(*scan))
4139 scan++;
4140 }
4141 break;
4142 case SPACE:
4143 if (do_utf8) {
4144 loceol = PL_regeol;
4145 LOAD_UTF8_CHARCLASS(space," ");
4146 while (hardcount < max && scan < loceol &&
4147 (*scan == ' ' ||
4148 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
4149 scan += UTF8SKIP(scan);
4150 hardcount++;
4151 }
4152 } else {
4153 while (scan < loceol && isSPACE(*scan))
4154 scan++;
4155 }
4156 break;
4157 case SPACEL:
4158 PL_reg_flags |= RF_tainted;
4159 if (do_utf8) {
4160 loceol = PL_regeol;
4161 while (hardcount < max && scan < loceol &&
4162 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4163 scan += UTF8SKIP(scan);
4164 hardcount++;
4165 }
4166 } else {
4167 while (scan < loceol && isSPACE_LC(*scan))
4168 scan++;
4169 }
4170 break;
4171 case NSPACE:
4172 if (do_utf8) {
4173 loceol = PL_regeol;
4174 LOAD_UTF8_CHARCLASS(space," ");
4175 while (hardcount < max && scan < loceol &&
4176 !(*scan == ' ' ||
4177 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
4178 scan += UTF8SKIP(scan);
4179 hardcount++;
4180 }
4181 } else {
4182 while (scan < loceol && !isSPACE(*scan))
4183 scan++;
4184 break;
4185 }
4186 case NSPACEL:
4187 PL_reg_flags |= RF_tainted;
4188 if (do_utf8) {
4189 loceol = PL_regeol;
4190 while (hardcount < max && scan < loceol &&
4191 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4192 scan += UTF8SKIP(scan);
4193 hardcount++;
4194 }
4195 } else {
4196 while (scan < loceol && !isSPACE_LC(*scan))
4197 scan++;
4198 }
4199 break;
4200 case DIGIT:
4201 if (do_utf8) {
4202 loceol = PL_regeol;
4203 LOAD_UTF8_CHARCLASS(digit,"0");
4204 while (hardcount < max && scan < loceol &&
4205 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
4206 scan += UTF8SKIP(scan);
4207 hardcount++;
4208 }
4209 } else {
4210 while (scan < loceol && isDIGIT(*scan))
4211 scan++;
4212 }
4213 break;
4214 case NDIGIT:
4215 if (do_utf8) {
4216 loceol = PL_regeol;
4217 LOAD_UTF8_CHARCLASS(digit,"0");
4218 while (hardcount < max && scan < loceol &&
4219 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
4220 scan += UTF8SKIP(scan);
4221 hardcount++;
4222 }
4223 } else {
4224 while (scan < loceol && !isDIGIT(*scan))
4225 scan++;
4226 }
4227 break;
4228 default: /* Called on something of 0 width. */
4229 break; /* So match right here or not at all. */
4230 }
4231
4232 if (hardcount)
4233 c = hardcount;
4234 else
4235 c = scan - PL_reginput;
4236 PL_reginput = scan;
4237
4238 DEBUG_r(
4239 {
4240 SV *prop = sv_newmortal();
4241
4242 regprop(prop, p);
4243 PerlIO_printf(Perl_debug_log,
4244 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
4245 REPORT_CODE_OFF+1, "", SvPVX(prop),(IV)c,(IV)max);
4246 });
4247
4248 return(c);
4249 }
4250
4251 /*
4252 - regrepeat_hard - repeatedly match something, report total lenth and length
4253 *
4254 * The repeater is supposed to have constant length.
4255 */
4256
4257 STATIC I32
S_regrepeat_hard(pTHX_ regnode * p,I32 max,I32 * lp)4258 S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
4259 {
4260 register char *scan = Nullch;
4261 register char *start;
4262 register char *loceol = PL_regeol;
4263 I32 l = 0;
4264 I32 count = 0, res = 1;
4265
4266 if (!max)
4267 return 0;
4268
4269 start = PL_reginput;
4270 if (PL_reg_match_utf8) {
4271 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
4272 if (!count++) {
4273 l = 0;
4274 while (start < PL_reginput) {
4275 l++;
4276 start += UTF8SKIP(start);
4277 }
4278 *lp = l;
4279 if (l == 0)
4280 return max;
4281 }
4282 if (count == max)
4283 return count;
4284 }
4285 }
4286 else {
4287 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
4288 if (!count++) {
4289 *lp = l = PL_reginput - start;
4290 if (max != REG_INFTY && l*max < loceol - scan)
4291 loceol = scan + l*max;
4292 if (l == 0)
4293 return max;
4294 }
4295 }
4296 }
4297 if (!res)
4298 PL_reginput = scan;
4299
4300 return count;
4301 }
4302
4303 /*
4304 - regclass_swash - prepare the utf8 swash
4305 */
4306
4307 SV *
Perl_regclass_swash(pTHX_ register regnode * node,bool doinit,SV ** listsvp,SV ** altsvp)4308 Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** listsvp, SV **altsvp)
4309 {
4310 SV *sw = NULL;
4311 SV *si = NULL;
4312 SV *alt = NULL;
4313
4314 if (PL_regdata && PL_regdata->count) {
4315 U32 n = ARG(node);
4316
4317 if (PL_regdata->what[n] == 's') {
4318 SV *rv = (SV*)PL_regdata->data[n];
4319 AV *av = (AV*)SvRV((SV*)rv);
4320 SV **ary = AvARRAY(av);
4321 SV **a, **b;
4322
4323 /* See the end of regcomp.c:S_reglass() for
4324 * documentation of these array elements. */
4325
4326 si = *ary;
4327 a = SvTYPE(ary[1]) == SVt_RV ? &ary[1] : 0;
4328 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
4329
4330 if (a)
4331 sw = *a;
4332 else if (si && doinit) {
4333 sw = swash_init("utf8", "", si, 1, 0);
4334 (void)av_store(av, 1, sw);
4335 }
4336 if (b)
4337 alt = *b;
4338 }
4339 }
4340
4341 if (listsvp)
4342 *listsvp = si;
4343 if (altsvp)
4344 *altsvp = alt;
4345
4346 return sw;
4347 }
4348
4349 /*
4350 - reginclass - determine if a character falls into a character class
4351
4352 The n is the ANYOF regnode, the p is the target string, lenp
4353 is pointer to the maximum length of how far to go in the p
4354 (if the lenp is zero, UTF8SKIP(p) is used),
4355 do_utf8 tells whether the target string is in UTF-8.
4356
4357 */
4358
4359 STATIC bool
S_reginclass(pTHX_ register regnode * n,register U8 * p,STRLEN * lenp,register bool do_utf8)4360 S_reginclass(pTHX_ register regnode *n, register U8* p, STRLEN* lenp, register bool do_utf8)
4361 {
4362 char flags = ANYOF_FLAGS(n);
4363 bool match = FALSE;
4364 UV c = *p;
4365 STRLEN len = 0;
4366 STRLEN plen;
4367
4368 if (do_utf8 && !UTF8_IS_INVARIANT(c))
4369 c = utf8n_to_uvchr(p, UTF8_MAXLEN, &len,
4370 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
4371
4372 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
4373 if (do_utf8 || (flags & ANYOF_UNICODE)) {
4374 if (lenp)
4375 *lenp = 0;
4376 if (do_utf8 && !ANYOF_RUNTIME(n)) {
4377 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
4378 match = TRUE;
4379 }
4380 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
4381 match = TRUE;
4382 if (!match) {
4383 AV *av;
4384 SV *sw = regclass_swash(n, TRUE, 0, (SV**)&av);
4385
4386 if (sw) {
4387 if (swash_fetch(sw, p, do_utf8))
4388 match = TRUE;
4389 else if (flags & ANYOF_FOLD) {
4390 if (!match && lenp && av) {
4391 I32 i;
4392
4393 for (i = 0; i <= av_len(av); i++) {
4394 SV* sv = *av_fetch(av, i, FALSE);
4395 STRLEN len;
4396 char *s = SvPV(sv, len);
4397
4398 if (len <= plen && memEQ(s, (char*)p, len)) {
4399 *lenp = len;
4400 match = TRUE;
4401 break;
4402 }
4403 }
4404 }
4405 if (!match) {
4406 U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
4407 STRLEN tmplen;
4408
4409 to_utf8_fold(p, tmpbuf, &tmplen);
4410 if (swash_fetch(sw, tmpbuf, do_utf8))
4411 match = TRUE;
4412 }
4413 }
4414 }
4415 }
4416 if (match && lenp && *lenp == 0)
4417 *lenp = UNISKIP(NATIVE_TO_UNI(c));
4418 }
4419 if (!match && c < 256) {
4420 if (ANYOF_BITMAP_TEST(n, c))
4421 match = TRUE;
4422 else if (flags & ANYOF_FOLD) {
4423 U8 f;
4424
4425 if (flags & ANYOF_LOCALE) {
4426 PL_reg_flags |= RF_tainted;
4427 f = PL_fold_locale[c];
4428 }
4429 else
4430 f = PL_fold[c];
4431 if (f != c && ANYOF_BITMAP_TEST(n, f))
4432 match = TRUE;
4433 }
4434
4435 if (!match && (flags & ANYOF_CLASS)) {
4436 PL_reg_flags |= RF_tainted;
4437 if (
4438 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
4439 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
4440 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
4441 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
4442 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
4443 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
4444 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
4445 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
4446 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
4447 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
4448 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
4449 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
4450 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
4451 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
4452 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
4453 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
4454 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
4455 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
4456 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
4457 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
4458 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
4459 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
4460 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
4461 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
4462 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
4463 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
4464 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
4465 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
4466 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
4467 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
4468 ) /* How's that for a conditional? */
4469 {
4470 match = TRUE;
4471 }
4472 }
4473 }
4474
4475 return (flags & ANYOF_INVERT) ? !match : match;
4476 }
4477
4478 STATIC U8 *
S_reghop(pTHX_ U8 * s,I32 off)4479 S_reghop(pTHX_ U8 *s, I32 off)
4480 {
4481 return S_reghop3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4482 }
4483
4484 STATIC U8 *
S_reghop3(pTHX_ U8 * s,I32 off,U8 * lim)4485 S_reghop3(pTHX_ U8 *s, I32 off, U8* lim)
4486 {
4487 if (off >= 0) {
4488 while (off-- && s < lim) {
4489 /* XXX could check well-formedness here */
4490 s += UTF8SKIP(s);
4491 }
4492 }
4493 else {
4494 while (off++) {
4495 if (s > lim) {
4496 s--;
4497 if (UTF8_IS_CONTINUED(*s)) {
4498 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
4499 s--;
4500 }
4501 /* XXX could check well-formedness here */
4502 }
4503 }
4504 }
4505 return s;
4506 }
4507
4508 STATIC U8 *
S_reghopmaybe(pTHX_ U8 * s,I32 off)4509 S_reghopmaybe(pTHX_ U8 *s, I32 off)
4510 {
4511 return S_reghopmaybe3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4512 }
4513
4514 STATIC U8 *
S_reghopmaybe3(pTHX_ U8 * s,I32 off,U8 * lim)4515 S_reghopmaybe3(pTHX_ U8* s, I32 off, U8* lim)
4516 {
4517 if (off >= 0) {
4518 while (off-- && s < lim) {
4519 /* XXX could check well-formedness here */
4520 s += UTF8SKIP(s);
4521 }
4522 if (off >= 0)
4523 return 0;
4524 }
4525 else {
4526 while (off++) {
4527 if (s > lim) {
4528 s--;
4529 if (UTF8_IS_CONTINUED(*s)) {
4530 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
4531 s--;
4532 }
4533 /* XXX could check well-formedness here */
4534 }
4535 else
4536 break;
4537 }
4538 if (off <= 0)
4539 return 0;
4540 }
4541 return s;
4542 }
4543
4544 static void
restore_pos(pTHX_ void * arg)4545 restore_pos(pTHX_ void *arg)
4546 {
4547 if (PL_reg_eval_set) {
4548 if (PL_reg_oldsaved) {
4549 PL_reg_re->subbeg = PL_reg_oldsaved;
4550 PL_reg_re->sublen = PL_reg_oldsavedlen;
4551 RX_MATCH_COPIED_on(PL_reg_re);
4552 }
4553 PL_reg_magic->mg_len = PL_reg_oldpos;
4554 PL_reg_eval_set = 0;
4555 PL_curpm = PL_reg_oldcurpm;
4556 }
4557 }
4558
4559 STATIC void
S_to_utf8_substr(pTHX_ register regexp * prog)4560 S_to_utf8_substr(pTHX_ register regexp *prog)
4561 {
4562 SV* sv;
4563 if (prog->float_substr && !prog->float_utf8) {
4564 prog->float_utf8 = sv = NEWSV(117, 0);
4565 SvSetSV(sv, prog->float_substr);
4566 sv_utf8_upgrade(sv);
4567 if (SvTAIL(prog->float_substr))
4568 SvTAIL_on(sv);
4569 if (prog->float_substr == prog->check_substr)
4570 prog->check_utf8 = sv;
4571 }
4572 if (prog->anchored_substr && !prog->anchored_utf8) {
4573 prog->anchored_utf8 = sv = NEWSV(118, 0);
4574 SvSetSV(sv, prog->anchored_substr);
4575 sv_utf8_upgrade(sv);
4576 if (SvTAIL(prog->anchored_substr))
4577 SvTAIL_on(sv);
4578 if (prog->anchored_substr == prog->check_substr)
4579 prog->check_utf8 = sv;
4580 }
4581 }
4582
4583 STATIC void
S_to_byte_substr(pTHX_ register regexp * prog)4584 S_to_byte_substr(pTHX_ register regexp *prog)
4585 {
4586 SV* sv;
4587 if (prog->float_utf8 && !prog->float_substr) {
4588 prog->float_substr = sv = NEWSV(117, 0);
4589 SvSetSV(sv, prog->float_utf8);
4590 if (sv_utf8_downgrade(sv, TRUE)) {
4591 if (SvTAIL(prog->float_utf8))
4592 SvTAIL_on(sv);
4593 } else {
4594 SvREFCNT_dec(sv);
4595 prog->float_substr = sv = &PL_sv_undef;
4596 }
4597 if (prog->float_utf8 == prog->check_utf8)
4598 prog->check_substr = sv;
4599 }
4600 if (prog->anchored_utf8 && !prog->anchored_substr) {
4601 prog->anchored_substr = sv = NEWSV(118, 0);
4602 SvSetSV(sv, prog->anchored_utf8);
4603 if (sv_utf8_downgrade(sv, TRUE)) {
4604 if (SvTAIL(prog->anchored_utf8))
4605 SvTAIL_on(sv);
4606 } else {
4607 SvREFCNT_dec(sv);
4608 prog->anchored_substr = sv = &PL_sv_undef;
4609 }
4610 if (prog->anchored_utf8 == prog->check_utf8)
4611 prog->check_substr = sv;
4612 }
4613 }
4614