xref: /openbsd-src/gnu/usr.bin/perl/regexec.c (revision a0747c9f67a4ae71ccb71e62a28d1ea19e06a63c)
1 /*    regexec.c
2  */
3 
4 /*
5  *	One Ring to rule them all, One Ring to find them
6  *
7  *     [p.v of _The Lord of the Rings_, opening poem]
8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
10  */
11 
12 /* This file contains functions for executing a regular expression.  See
13  * also regcomp.c which funnily enough, contains functions for compiling
14  * a regular expression.
15  *
16  * This file is also copied at build time to ext/re/re_exec.c, where
17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18  * This causes the main functions to be compiled under new names and with
19  * debugging support added, which makes "use re 'debug'" work.
20  */
21 
22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
23  * confused with the original package (see point 3 below).  Thanks, Henry!
24  */
25 
26 /* Additional note: this code is very heavily munged from Henry's version
27  * in places.  In some spots I've traded clarity for efficiency, so don't
28  * blame Henry for some of the lack of readability.
29  */
30 
31 /* The names of the functions have been changed from regcomp and
32  * regexec to  pregcomp and pregexec in order to avoid conflicts
33  * with the POSIX routines of the same names.
34 */
35 
36 #ifdef PERL_EXT_RE_BUILD
37 #include "re_top.h"
38 #endif
39 
40 /*
41  * pregcomp and pregexec -- regsub and regerror are not used in perl
42  *
43  *	Copyright (c) 1986 by University of Toronto.
44  *	Written by Henry Spencer.  Not derived from licensed software.
45  *
46  *	Permission is granted to anyone to use this software for any
47  *	purpose on any computer system, and to redistribute it freely,
48  *	subject to the following restrictions:
49  *
50  *	1. The author is not responsible for the consequences of use of
51  *		this software, no matter how awful, even if they arise
52  *		from defects in it.
53  *
54  *	2. The origin of this software must not be misrepresented, either
55  *		by explicit claim or by omission.
56  *
57  *	3. Altered versions must be plainly marked as such, and must not
58  *		be misrepresented as being the original software.
59  *
60  ****    Alterations to Henry's code are...
61  ****
62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64  ****    by Larry Wall and others
65  ****
66  ****    You may distribute under the terms of either the GNU General Public
67  ****    License or the Artistic License, as specified in the README file.
68  *
69  * Beware that some of this code is subtly aware of the way operator
70  * precedence is structured in regular expressions.  Serious changes in
71  * regular-expression syntax might require a total rethink.
72  */
73 #include "EXTERN.h"
74 #define PERL_IN_REGEXEC_C
75 #include "perl.h"
76 
77 #ifdef PERL_IN_XSUB_RE
78 #  include "re_comp.h"
79 #else
80 #  include "regcomp.h"
81 #endif
82 
83 #include "invlist_inline.h"
84 #include "unicode_constants.h"
85 
86 #define B_ON_NON_UTF8_LOCALE_IS_WRONG            \
87  "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale"
88 
89 static const char utf8_locale_required[] =
90       "Use of (?[ ]) for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale";
91 
92 #ifdef DEBUGGING
93 /* At least one required character in the target string is expressible only in
94  * UTF-8. */
95 static const char non_utf8_target_but_utf8_required[]
96                 = "Can't match, because target string needs to be in UTF-8\n";
97 #endif
98 
99 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START {           \
100     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%s", non_utf8_target_but_utf8_required));\
101     goto target;                                                         \
102 } STMT_END
103 
104 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
105 
106 #ifndef STATIC
107 #define	STATIC	static
108 #endif
109 
110 /*
111  * Forwards.
112  */
113 
114 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
115 
116 #define HOPc(pos,off) \
117 	(char *)(reginfo->is_utf8_target \
118 	    ? reghop3((U8*)pos, off, \
119                     (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
120 	    : (U8*)(pos + off))
121 
122 /* like HOPMAYBE3 but backwards. lim must be +ve. Returns NULL on overshoot */
123 #define HOPBACK3(pos, off, lim) \
124 	(reginfo->is_utf8_target                          \
125 	    ? reghopmaybe3((U8*)pos, (SSize_t)0-off, (U8*)(lim)) \
126 	    : (pos - off >= lim)	                         \
127 		? (U8*)pos - off		                 \
128 		: NULL)
129 
130 #define HOPBACKc(pos, off) ((char*)HOPBACK3(pos, off, reginfo->strbeg))
131 
132 #define HOP3(pos,off,lim) (reginfo->is_utf8_target  ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
133 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
134 
135 /* lim must be +ve. Returns NULL on overshoot */
136 #define HOPMAYBE3(pos,off,lim) \
137 	(reginfo->is_utf8_target                        \
138 	    ? reghopmaybe3((U8*)pos, off, (U8*)(lim))   \
139 	    : ((U8*)pos + off <= lim)                   \
140 		? (U8*)pos + off                        \
141 		: NULL)
142 
143 /* like HOP3, but limits the result to <= lim even for the non-utf8 case.
144  * off must be >=0; args should be vars rather than expressions */
145 #define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
146     ? reghop3((U8*)(pos), off, (U8*)(lim)) \
147     : (U8*)((pos + off) > lim ? lim : (pos + off)))
148 #define HOP3clim(pos,off,lim) ((char*)HOP3lim(pos,off,lim))
149 
150 #define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
151     ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
152     : (U8*)(pos + off))
153 #define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
154 
155 #define PLACEHOLDER	/* Something for the preprocessor to grab onto */
156 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
157 
158 /* for use after a quantifier and before an EXACT-like node -- japhy */
159 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
160  *
161  * NOTE that *nothing* that affects backtracking should be in here, specifically
162  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
163  * node that is in between two EXACT like nodes when ascertaining what the required
164  * "follow" character is. This should probably be moved to regex compile time
165  * although it may be done at run time beause of the REF possibility - more
166  * investigation required. -- demerphq
167 */
168 #define JUMPABLE(rn) (                                                             \
169     OP(rn) == OPEN ||                                                              \
170     (OP(rn) == CLOSE &&                                                            \
171      !EVAL_CLOSE_PAREN_IS(cur_eval,ARG(rn)) ) ||                                   \
172     OP(rn) == EVAL ||                                                              \
173     OP(rn) == SUSPEND || OP(rn) == IFMATCH ||                                      \
174     OP(rn) == PLUS || OP(rn) == MINMOD ||                                          \
175     OP(rn) == KEEPS ||                                                             \
176     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0)                                  \
177 )
178 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
179 
180 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
181 
182 /*
183   Search for mandatory following text node; for lookahead, the text must
184   follow but for lookbehind (rn->flags != 0) we skip to the next step.
185 */
186 #define FIND_NEXT_IMPT(rn) STMT_START {                                   \
187     while (JUMPABLE(rn)) { \
188 	const OPCODE type = OP(rn); \
189 	if (type == SUSPEND || PL_regkind[type] == CURLY) \
190 	    rn = NEXTOPER(NEXTOPER(rn)); \
191 	else if (type == PLUS) \
192 	    rn = NEXTOPER(rn); \
193 	else if (type == IFMATCH) \
194 	    rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
195 	else rn += NEXT_OFF(rn); \
196     } \
197 } STMT_END
198 
199 #define SLAB_FIRST(s) (&(s)->states[0])
200 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
201 
202 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
203 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
204 static regmatch_state * S_push_slab(pTHX);
205 
206 #define REGCP_PAREN_ELEMS 3
207 #define REGCP_OTHER_ELEMS 3
208 #define REGCP_FRAME_ELEMS 1
209 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
210  * are needed for the regexp context stack bookkeeping. */
211 
212 STATIC CHECKPOINT
213 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen _pDEPTH)
214 {
215     const int retval = PL_savestack_ix;
216     const int paren_elems_to_push =
217                 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
218     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
219     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
220     I32 p;
221     DECLARE_AND_GET_RE_DEBUG_FLAGS;
222 
223     PERL_ARGS_ASSERT_REGCPPUSH;
224 
225     if (paren_elems_to_push < 0)
226         Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u",
227                    (int)paren_elems_to_push, (int)maxopenparen,
228                    (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS);
229 
230     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
231 	Perl_croak(aTHX_ "panic: paren_elems_to_push offset %" UVuf
232 		   " out of range (%lu-%ld)",
233 		   total_elems,
234                    (unsigned long)maxopenparen,
235                    (long)parenfloor);
236 
237     SSGROW(total_elems + REGCP_FRAME_ELEMS);
238 
239     DEBUG_BUFFERS_r(
240 	if ((int)maxopenparen > (int)parenfloor)
241             Perl_re_exec_indentf( aTHX_
242 		"rex=0x%" UVxf " offs=0x%" UVxf ": saving capture indices:\n",
243 		depth,
244                 PTR2UV(rex),
245 		PTR2UV(rex->offs)
246 	    );
247     );
248     for (p = parenfloor+1; p <= (I32)maxopenparen;  p++) {
249 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
250 	SSPUSHIV(rex->offs[p].end);
251 	SSPUSHIV(rex->offs[p].start);
252 	SSPUSHINT(rex->offs[p].start_tmp);
253         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
254 	    "    \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "\n",
255 	    depth,
256             (UV)p,
257 	    (IV)rex->offs[p].start,
258 	    (IV)rex->offs[p].start_tmp,
259 	    (IV)rex->offs[p].end
260 	));
261     }
262 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
263     SSPUSHINT(maxopenparen);
264     SSPUSHINT(rex->lastparen);
265     SSPUSHINT(rex->lastcloseparen);
266     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
267 
268     return retval;
269 }
270 
271 /* These are needed since we do not localize EVAL nodes: */
272 #define REGCP_SET(cp)                                           \
273     DEBUG_STATE_r(                                              \
274         Perl_re_exec_indentf( aTHX_                             \
275             "Setting an EVAL scope, savestack=%" IVdf ",\n",    \
276             depth, (IV)PL_savestack_ix                          \
277         )                                                       \
278     );                                                          \
279     cp = PL_savestack_ix
280 
281 #define REGCP_UNWIND(cp)                                        \
282     DEBUG_STATE_r(                                              \
283         if (cp != PL_savestack_ix)                              \
284             Perl_re_exec_indentf( aTHX_                         \
285                 "Clearing an EVAL scope, savestack=%"           \
286                 IVdf "..%" IVdf "\n",                           \
287                 depth, (IV)(cp), (IV)PL_savestack_ix            \
288             )                                                   \
289     );                                                          \
290     regcpblow(cp)
291 
292 /* set the start and end positions of capture ix */
293 #define CLOSE_CAPTURE(ix, s, e)                                            \
294     rex->offs[ix].start = s;                                               \
295     rex->offs[ix].end = e;                                                 \
296     if (ix > rex->lastparen)                                               \
297         rex->lastparen = ix;                                               \
298     rex->lastcloseparen = ix;                                              \
299     DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_                            \
300         "CLOSE: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf "..%" IVdf " max: %" UVuf "\n", \
301         depth,                                                             \
302         PTR2UV(rex),                                                       \
303         PTR2UV(rex->offs),                                                 \
304         (UV)ix,                                                            \
305         (IV)rex->offs[ix].start,                                           \
306         (IV)rex->offs[ix].end,                                             \
307         (UV)rex->lastparen                                                 \
308     ))
309 
310 #define UNWIND_PAREN(lp, lcp)               \
311     DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_  \
312         "UNWIND_PAREN: rex=0x%" UVxf " offs=0x%" UVxf ": invalidate (%" UVuf "..%" UVuf "] set lcp: %" UVuf "\n", \
313         depth,                              \
314         PTR2UV(rex),                        \
315         PTR2UV(rex->offs),                  \
316         (UV)(lp),                           \
317         (UV)(rex->lastparen),               \
318         (UV)(lcp)                           \
319     ));                                     \
320     for (n = rex->lastparen; n > lp; n--)   \
321         rex->offs[n].end = -1;              \
322     rex->lastparen = n;                     \
323     rex->lastcloseparen = lcp;
324 
325 
326 STATIC void
327 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p _pDEPTH)
328 {
329     UV i;
330     U32 paren;
331     DECLARE_AND_GET_RE_DEBUG_FLAGS;
332 
333     PERL_ARGS_ASSERT_REGCPPOP;
334 
335     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
336     i = SSPOPUV;
337     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
338     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
339     rex->lastcloseparen = SSPOPINT;
340     rex->lastparen = SSPOPINT;
341     *maxopenparen_p = SSPOPINT;
342 
343     i -= REGCP_OTHER_ELEMS;
344     /* Now restore the parentheses context. */
345     DEBUG_BUFFERS_r(
346 	if (i || rex->lastparen + 1 <= rex->nparens)
347             Perl_re_exec_indentf( aTHX_
348 		"rex=0x%" UVxf " offs=0x%" UVxf ": restoring capture indices to:\n",
349 		depth,
350                 PTR2UV(rex),
351 		PTR2UV(rex->offs)
352 	    );
353     );
354     paren = *maxopenparen_p;
355     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
356 	SSize_t tmps;
357 	rex->offs[paren].start_tmp = SSPOPINT;
358 	rex->offs[paren].start = SSPOPIV;
359 	tmps = SSPOPIV;
360 	if (paren <= rex->lastparen)
361 	    rex->offs[paren].end = tmps;
362         DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
363 	    "    \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "%s\n",
364 	    depth,
365             (UV)paren,
366 	    (IV)rex->offs[paren].start,
367 	    (IV)rex->offs[paren].start_tmp,
368 	    (IV)rex->offs[paren].end,
369 	    (paren > rex->lastparen ? "(skipped)" : ""));
370 	);
371 	paren--;
372     }
373 #if 1
374     /* It would seem that the similar code in regtry()
375      * already takes care of this, and in fact it is in
376      * a better location to since this code can #if 0-ed out
377      * but the code in regtry() is needed or otherwise tests
378      * requiring null fields (pat.t#187 and split.t#{13,14}
379      * (as of patchlevel 7877)  will fail.  Then again,
380      * this code seems to be necessary or otherwise
381      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
382      * --jhi updated by dapm */
383     for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
384 	if (i > *maxopenparen_p)
385 	    rex->offs[i].start = -1;
386 	rex->offs[i].end = -1;
387         DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
388 	    "    \\%" UVuf ": %s   ..-1 undeffing\n",
389 	    depth,
390             (UV)i,
391 	    (i > *maxopenparen_p) ? "-1" : "  "
392 	));
393     }
394 #endif
395 }
396 
397 /* restore the parens and associated vars at savestack position ix,
398  * but without popping the stack */
399 
400 STATIC void
401 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p _pDEPTH)
402 {
403     I32 tmpix = PL_savestack_ix;
404     PERL_ARGS_ASSERT_REGCP_RESTORE;
405 
406     PL_savestack_ix = ix;
407     regcppop(rex, maxopenparen_p);
408     PL_savestack_ix = tmpix;
409 }
410 
411 #define regcpblow(cp) LEAVE_SCOPE(cp)	/* Ignores regcppush()ed data. */
412 
413 #ifndef PERL_IN_XSUB_RE
414 
415 bool
416 Perl_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
417 {
418     /* Returns a boolean as to whether or not 'character' is a member of the
419      * Posix character class given by 'classnum' that should be equivalent to a
420      * value in the typedef '_char_class_number'.
421      *
422      * Ideally this could be replaced by a just an array of function pointers
423      * to the C library functions that implement the macros this calls.
424      * However, to compile, the precise function signatures are required, and
425      * these may vary from platform to platform.  To avoid having to figure
426      * out what those all are on each platform, I (khw) am using this method,
427      * which adds an extra layer of function call overhead (unless the C
428      * optimizer strips it away).  But we don't particularly care about
429      * performance with locales anyway. */
430 
431     switch ((_char_class_number) classnum) {
432         case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
433         case _CC_ENUM_ALPHA:     return isALPHA_LC(character);
434         case _CC_ENUM_ASCII:     return isASCII_LC(character);
435         case _CC_ENUM_BLANK:     return isBLANK_LC(character);
436         case _CC_ENUM_CASED:     return    isLOWER_LC(character)
437                                         || isUPPER_LC(character);
438         case _CC_ENUM_CNTRL:     return isCNTRL_LC(character);
439         case _CC_ENUM_DIGIT:     return isDIGIT_LC(character);
440         case _CC_ENUM_GRAPH:     return isGRAPH_LC(character);
441         case _CC_ENUM_LOWER:     return isLOWER_LC(character);
442         case _CC_ENUM_PRINT:     return isPRINT_LC(character);
443         case _CC_ENUM_PUNCT:     return isPUNCT_LC(character);
444         case _CC_ENUM_SPACE:     return isSPACE_LC(character);
445         case _CC_ENUM_UPPER:     return isUPPER_LC(character);
446         case _CC_ENUM_WORDCHAR:  return isWORDCHAR_LC(character);
447         case _CC_ENUM_XDIGIT:    return isXDIGIT_LC(character);
448         default:    /* VERTSPACE should never occur in locales */
449             Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
450     }
451 
452     NOT_REACHED; /* NOTREACHED */
453     return FALSE;
454 }
455 
456 #endif
457 
458 PERL_STATIC_INLINE I32
459 S_foldEQ_latin1_s2_folded(const char *s1, const char *s2, I32 len)
460 {
461     /* Compare non-UTF-8 using Unicode (Latin1) semantics.  s2 must already be
462      * folded.  Works on all folds representable without UTF-8, except for
463      * LATIN_SMALL_LETTER_SHARP_S, and does not check for this.  Nor does it
464      * check that the strings each have at least 'len' characters.
465      *
466      * There is almost an identical API function where s2 need not be folded:
467      * Perl_foldEQ_latin1() */
468 
469     const U8 *a = (const U8 *)s1;
470     const U8 *b = (const U8 *)s2;
471 
472     PERL_ARGS_ASSERT_FOLDEQ_LATIN1_S2_FOLDED;
473 
474     assert(len >= 0);
475 
476     while (len--) {
477         assert(! isUPPER_L1(*b));
478         if (toLOWER_L1(*a) != *b) {
479             return 0;
480         }
481         a++, b++;
482     }
483     return 1;
484 }
485 
486 STATIC bool
487 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character, const U8* e)
488 {
489     /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
490      * 'character' is a member of the Posix character class given by 'classnum'
491      * that should be equivalent to a value in the typedef
492      * '_char_class_number'.
493      *
494      * This just calls isFOO_lc on the code point for the character if it is in
495      * the range 0-255.  Outside that range, all characters use Unicode
496      * rules, ignoring any locale.  So use the Unicode function if this class
497      * requires an inversion list, and use the Unicode macro otherwise. */
498 
499     dVAR;
500 
501     PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
502 
503     if (UTF8_IS_INVARIANT(*character)) {
504         return isFOO_lc(classnum, *character);
505     }
506     else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
507         return isFOO_lc(classnum,
508                         EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1)));
509     }
510 
511     _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, e);
512 
513     switch ((_char_class_number) classnum) {
514         case _CC_ENUM_SPACE:     return is_XPERLSPACE_high(character);
515         case _CC_ENUM_BLANK:     return is_HORIZWS_high(character);
516         case _CC_ENUM_XDIGIT:    return is_XDIGIT_high(character);
517         case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
518         default:
519             return _invlist_contains_cp(PL_XPosix_ptrs[classnum],
520                                         utf8_to_uvchr_buf(character, e, NULL));
521     }
522 
523     return FALSE; /* Things like CNTRL are always below 256 */
524 }
525 
526 STATIC U8 *
527 S_find_span_end(U8 * s, const U8 * send, const U8 span_byte)
528 {
529     /* Returns the position of the first byte in the sequence between 's' and
530      * 'send-1' inclusive that isn't 'span_byte'; returns 'send' if none found.
531      * */
532 
533     PERL_ARGS_ASSERT_FIND_SPAN_END;
534 
535     assert(send >= s);
536 
537     if ((STRLEN) (send - s) >= PERL_WORDSIZE
538                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
539                           - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
540     {
541         PERL_UINTMAX_T span_word;
542 
543         /* Process per-byte until reach word boundary.  XXX This loop could be
544          * eliminated if we knew that this platform had fast unaligned reads */
545         while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
546             if (*s != span_byte) {
547                 return s;
548             }
549             s++;
550         }
551 
552         /* Create a word filled with the bytes we are spanning */
553         span_word = PERL_COUNT_MULTIPLIER * span_byte;
554 
555         /* Process per-word as long as we have at least a full word left */
556         do {
557 
558             /* Keep going if the whole word is composed of 'span_byte's */
559             if ((* (PERL_UINTMAX_T *) s) == span_word)  {
560                 s += PERL_WORDSIZE;
561                 continue;
562             }
563 
564             /* Here, at least one byte in the word isn't 'span_byte'. */
565 
566 #ifdef EBCDIC
567 
568             break;
569 
570 #else
571 
572             /* This xor leaves 1 bits only in those non-matching bytes */
573             span_word ^= * (PERL_UINTMAX_T *) s;
574 
575             /* Make sure the upper bit of each non-matching byte is set.  This
576              * makes each such byte look like an ASCII platform variant byte */
577             span_word |= span_word << 1;
578             span_word |= span_word << 2;
579             span_word |= span_word << 4;
580 
581             /* That reduces the problem to what this function solves */
582             return s + variant_byte_number(span_word);
583 
584 #endif
585 
586         } while (s + PERL_WORDSIZE <= send);
587     }
588 
589     /* Process the straggler bytes beyond the final word boundary */
590     while (s < send) {
591         if (*s != span_byte) {
592             return s;
593         }
594         s++;
595     }
596 
597     return s;
598 }
599 
600 STATIC U8 *
601 S_find_next_masked(U8 * s, const U8 * send, const U8 byte, const U8 mask)
602 {
603     /* Returns the position of the first byte in the sequence between 's'
604      * and 'send-1' inclusive that when ANDed with 'mask' yields 'byte';
605      * returns 'send' if none found.  It uses word-level operations instead of
606      * byte to speed up the process */
607 
608     PERL_ARGS_ASSERT_FIND_NEXT_MASKED;
609 
610     assert(send >= s);
611     assert((byte & mask) == byte);
612 
613 #ifndef EBCDIC
614 
615     if ((STRLEN) (send - s) >= PERL_WORDSIZE
616                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
617                           - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
618     {
619         PERL_UINTMAX_T word, mask_word;
620 
621         while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
622             if (((*s) & mask) == byte) {
623                 return s;
624             }
625             s++;
626         }
627 
628         word      = PERL_COUNT_MULTIPLIER * byte;
629         mask_word = PERL_COUNT_MULTIPLIER * mask;
630 
631         do {
632             PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
633 
634             /* If 'masked' contains bytes with the bit pattern of 'byte' within
635              * it, xoring with 'word' will leave each of the 8 bits in such
636              * bytes be 0, and no byte containing any other bit pattern will be
637              * 0. */
638             masked ^= word;
639 
640             /* This causes the most significant bit to be set to 1 for any
641              * bytes in the word that aren't completely 0 */
642             masked |= masked << 1;
643             masked |= masked << 2;
644             masked |= masked << 4;
645 
646             /* The msbits are the same as what marks a byte as variant, so we
647              * can use this mask.  If all msbits are 1, the word doesn't
648              * contain 'byte' */
649             if ((masked & PERL_VARIANTS_WORD_MASK) == PERL_VARIANTS_WORD_MASK) {
650                 s += PERL_WORDSIZE;
651                 continue;
652             }
653 
654             /* Here, the msbit of bytes in the word that aren't 'byte' are 1,
655              * and any that are, are 0.  Complement and re-AND to swap that */
656             masked = ~ masked;
657             masked &= PERL_VARIANTS_WORD_MASK;
658 
659             /* This reduces the problem to that solved by this function */
660             s += variant_byte_number(masked);
661             return s;
662 
663         } while (s + PERL_WORDSIZE <= send);
664     }
665 
666 #endif
667 
668     while (s < send) {
669         if (((*s) & mask) == byte) {
670             return s;
671         }
672         s++;
673     }
674 
675     return s;
676 }
677 
678 STATIC U8 *
679 S_find_span_end_mask(U8 * s, const U8 * send, const U8 span_byte, const U8 mask)
680 {
681     /* Returns the position of the first byte in the sequence between 's' and
682      * 'send-1' inclusive that when ANDed with 'mask' isn't 'span_byte'.
683      * 'span_byte' should have been ANDed with 'mask' in the call of this
684      * function.  Returns 'send' if none found.  Works like find_span_end(),
685      * except for the AND */
686 
687     PERL_ARGS_ASSERT_FIND_SPAN_END_MASK;
688 
689     assert(send >= s);
690     assert((span_byte & mask) == span_byte);
691 
692     if ((STRLEN) (send - s) >= PERL_WORDSIZE
693                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
694                           - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
695     {
696         PERL_UINTMAX_T span_word, mask_word;
697 
698         while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
699             if (((*s) & mask) != span_byte) {
700                 return s;
701             }
702             s++;
703         }
704 
705         span_word = PERL_COUNT_MULTIPLIER * span_byte;
706         mask_word = PERL_COUNT_MULTIPLIER * mask;
707 
708         do {
709             PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
710 
711             if (masked == span_word) {
712                 s += PERL_WORDSIZE;
713                 continue;
714             }
715 
716 #ifdef EBCDIC
717 
718             break;
719 
720 #else
721 
722             masked ^= span_word;
723             masked |= masked << 1;
724             masked |= masked << 2;
725             masked |= masked << 4;
726             return s + variant_byte_number(masked);
727 
728 #endif
729 
730         } while (s + PERL_WORDSIZE <= send);
731     }
732 
733     while (s < send) {
734         if (((*s) & mask) != span_byte) {
735             return s;
736         }
737         s++;
738     }
739 
740     return s;
741 }
742 
743 /*
744  * pregexec and friends
745  */
746 
747 #ifndef PERL_IN_XSUB_RE
748 /*
749  - pregexec - match a regexp against a string
750  */
751 I32
752 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
753 	 char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
754 /* stringarg: the point in the string at which to begin matching */
755 /* strend:    pointer to null at end of string */
756 /* strbeg:    real beginning of string */
757 /* minend:    end of match must be >= minend bytes after stringarg. */
758 /* screamer:  SV being matched: only used for utf8 flag, pos() etc; string
759  *            itself is accessed via the pointers above */
760 /* nosave:    For optimizations. */
761 {
762     PERL_ARGS_ASSERT_PREGEXEC;
763 
764     return
765 	regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
766 		      nosave ? 0 : REXEC_COPY_STR);
767 }
768 #endif
769 
770 
771 
772 /* re_intuit_start():
773  *
774  * Based on some optimiser hints, try to find the earliest position in the
775  * string where the regex could match.
776  *
777  *   rx:     the regex to match against
778  *   sv:     the SV being matched: only used for utf8 flag; the string
779  *           itself is accessed via the pointers below. Note that on
780  *           something like an overloaded SV, SvPOK(sv) may be false
781  *           and the string pointers may point to something unrelated to
782  *           the SV itself.
783  *   strbeg: real beginning of string
784  *   strpos: the point in the string at which to begin matching
785  *   strend: pointer to the byte following the last char of the string
786  *   flags   currently unused; set to 0
787  *   data:   currently unused; set to NULL
788  *
789  * The basic idea of re_intuit_start() is to use some known information
790  * about the pattern, namely:
791  *
792  *   a) the longest known anchored substring (i.e. one that's at a
793  *      constant offset from the beginning of the pattern; but not
794  *      necessarily at a fixed offset from the beginning of the
795  *      string);
796  *   b) the longest floating substring (i.e. one that's not at a constant
797  *      offset from the beginning of the pattern);
798  *   c) Whether the pattern is anchored to the string; either
799  *      an absolute anchor: /^../, or anchored to \n: /^.../m,
800  *      or anchored to pos(): /\G/;
801  *   d) A start class: a real or synthetic character class which
802  *      represents which characters are legal at the start of the pattern;
803  *
804  * to either quickly reject the match, or to find the earliest position
805  * within the string at which the pattern might match, thus avoiding
806  * running the full NFA engine at those earlier locations, only to
807  * eventually fail and retry further along.
808  *
809  * Returns NULL if the pattern can't match, or returns the address within
810  * the string which is the earliest place the match could occur.
811  *
812  * The longest of the anchored and floating substrings is called 'check'
813  * and is checked first. The other is called 'other' and is checked
814  * second. The 'other' substring may not be present.  For example,
815  *
816  *    /(abc|xyz)ABC\d{0,3}DEFG/
817  *
818  * will have
819  *
820  *   check substr (float)    = "DEFG", offset 6..9 chars
821  *   other substr (anchored) = "ABC",  offset 3..3 chars
822  *   stclass = [ax]
823  *
824  * Be aware that during the course of this function, sometimes 'anchored'
825  * refers to a substring being anchored relative to the start of the
826  * pattern, and sometimes to the pattern itself being anchored relative to
827  * the string. For example:
828  *
829  *   /\dabc/:   "abc" is anchored to the pattern;
830  *   /^\dabc/:  "abc" is anchored to the pattern and the string;
831  *   /\d+abc/:  "abc" is anchored to neither the pattern nor the string;
832  *   /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
833  *                    but the pattern is anchored to the string.
834  */
835 
836 char *
837 Perl_re_intuit_start(pTHX_
838                     REGEXP * const rx,
839                     SV *sv,
840                     const char * const strbeg,
841                     char *strpos,
842                     char *strend,
843                     const U32 flags,
844                     re_scream_pos_data *data)
845 {
846     struct regexp *const prog = ReANY(rx);
847     SSize_t start_shift = prog->check_offset_min;
848     /* Should be nonnegative! */
849     SSize_t end_shift   = 0;
850     /* current lowest pos in string where the regex can start matching */
851     char *rx_origin = strpos;
852     SV *check;
853     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
854     U8   other_ix = 1 - prog->substrs->check_ix;
855     bool ml_anch = 0;
856     char *other_last = strpos;/* latest pos 'other' substr already checked to */
857     char *check_at = NULL;		/* check substr found at this pos */
858     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
859     RXi_GET_DECL(prog,progi);
860     regmatch_info reginfo_buf;  /* create some info to pass to find_byclass */
861     regmatch_info *const reginfo = &reginfo_buf;
862     DECLARE_AND_GET_RE_DEBUG_FLAGS;
863 
864     PERL_ARGS_ASSERT_RE_INTUIT_START;
865     PERL_UNUSED_ARG(flags);
866     PERL_UNUSED_ARG(data);
867 
868     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
869                 "Intuit: trying to determine minimum start position...\n"));
870 
871     /* for now, assume that all substr offsets are positive. If at some point
872      * in the future someone wants to do clever things with lookbehind and
873      * -ve offsets, they'll need to fix up any code in this function
874      * which uses these offsets. See the thread beginning
875      * <20140113145929.GF27210@iabyn.com>
876      */
877     assert(prog->substrs->data[0].min_offset >= 0);
878     assert(prog->substrs->data[0].max_offset >= 0);
879     assert(prog->substrs->data[1].min_offset >= 0);
880     assert(prog->substrs->data[1].max_offset >= 0);
881     assert(prog->substrs->data[2].min_offset >= 0);
882     assert(prog->substrs->data[2].max_offset >= 0);
883 
884     /* for now, assume that if both present, that the floating substring
885      * doesn't start before the anchored substring.
886      * If you break this assumption (e.g. doing better optimisations
887      * with lookahead/behind), then you'll need to audit the code in this
888      * function carefully first
889      */
890     assert(
891             ! (  (prog->anchored_utf8 || prog->anchored_substr)
892               && (prog->float_utf8    || prog->float_substr))
893            || (prog->float_min_offset >= prog->anchored_offset));
894 
895     /* byte rather than char calculation for efficiency. It fails
896      * to quickly reject some cases that can't match, but will reject
897      * them later after doing full char arithmetic */
898     if (prog->minlen > strend - strpos) {
899         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
900 			      "  String too short...\n"));
901 	goto fail;
902     }
903 
904     RXp_MATCH_UTF8_set(prog, utf8_target);
905     reginfo->is_utf8_target = cBOOL(utf8_target);
906     reginfo->info_aux = NULL;
907     reginfo->strbeg = strbeg;
908     reginfo->strend = strend;
909     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
910     reginfo->intuit = 1;
911     /* not actually used within intuit, but zero for safety anyway */
912     reginfo->poscache_maxiter = 0;
913 
914     if (utf8_target) {
915         if ((!prog->anchored_utf8 && prog->anchored_substr)
916                 || (!prog->float_utf8 && prog->float_substr))
917 	    to_utf8_substr(prog);
918 	check = prog->check_utf8;
919     } else {
920 	if (!prog->check_substr && prog->check_utf8) {
921 	    if (! to_byte_substr(prog)) {
922                 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
923             }
924         }
925 	check = prog->check_substr;
926     }
927 
928     /* dump the various substring data */
929     DEBUG_OPTIMISE_MORE_r({
930         int i;
931         for (i=0; i<=2; i++) {
932             SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
933                                   : prog->substrs->data[i].substr);
934             if (!sv)
935                 continue;
936 
937             Perl_re_printf( aTHX_
938                 "  substrs[%d]: min=%" IVdf " max=%" IVdf " end shift=%" IVdf
939                 " useful=%" IVdf " utf8=%d [%s]\n",
940                 i,
941                 (IV)prog->substrs->data[i].min_offset,
942                 (IV)prog->substrs->data[i].max_offset,
943                 (IV)prog->substrs->data[i].end_shift,
944                 BmUSEFUL(sv),
945                 utf8_target ? 1 : 0,
946                 SvPEEK(sv));
947         }
948     });
949 
950     if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
951 
952         /* ml_anch: check after \n?
953          *
954          * A note about PREGf_IMPLICIT: on an un-anchored pattern beginning
955          * with /.*.../, these flags will have been added by the
956          * compiler:
957          *   /.*abc/, /.*abc/m:  PREGf_IMPLICIT | PREGf_ANCH_MBOL
958          *   /.*abc/s:           PREGf_IMPLICIT | PREGf_ANCH_SBOL
959          */
960 	ml_anch =      (prog->intflags & PREGf_ANCH_MBOL)
961                    && !(prog->intflags & PREGf_IMPLICIT);
962 
963 	if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
964             /* we are only allowed to match at BOS or \G */
965 
966             /* trivially reject if there's a BOS anchor and we're not at BOS.
967              *
968              * Note that we don't try to do a similar quick reject for
969              * \G, since generally the caller will have calculated strpos
970              * based on pos() and gofs, so the string is already correctly
971              * anchored by definition; and handling the exceptions would
972              * be too fiddly (e.g. REXEC_IGNOREPOS).
973              */
974             if (   strpos != strbeg
975                 && (prog->intflags & PREGf_ANCH_SBOL))
976             {
977                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
978                                 "  Not at start...\n"));
979 	        goto fail;
980 	    }
981 
982             /* in the presence of an anchor, the anchored (relative to the
983              * start of the regex) substr must also be anchored relative
984              * to strpos. So quickly reject if substr isn't found there.
985              * This works for \G too, because the caller will already have
986              * subtracted gofs from pos, and gofs is the offset from the
987              * \G to the start of the regex. For example, in /.abc\Gdef/,
988              * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
989              * caller will have set strpos=pos()-4; we look for the substr
990              * at position pos()-4+1, which lines up with the "a" */
991 
992 	    if (prog->check_offset_min == prog->check_offset_max) {
993 	        /* Substring at constant offset from beg-of-str... */
994 	        SSize_t slen = SvCUR(check);
995                 char *s = HOP3c(strpos, prog->check_offset_min, strend);
996 
997                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
998                     "  Looking for check substr at fixed offset %" IVdf "...\n",
999                     (IV)prog->check_offset_min));
1000 
1001 	        if (SvTAIL(check)) {
1002                     /* In this case, the regex is anchored at the end too.
1003                      * Unless it's a multiline match, the lengths must match
1004                      * exactly, give or take a \n.  NB: slen >= 1 since
1005                      * the last char of check is \n */
1006 		    if (!multiline
1007                         && (   strend - s > slen
1008                             || strend - s < slen - 1
1009                             || (strend - s == slen && strend[-1] != '\n')))
1010                     {
1011                         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1012                                             "  String too long...\n"));
1013                         goto fail_finish;
1014                     }
1015                     /* Now should match s[0..slen-2] */
1016                     slen--;
1017                 }
1018                 if (slen && (strend - s < slen
1019                     || *SvPVX_const(check) != *s
1020                     || (slen > 1 && (memNE(SvPVX_const(check), s, slen)))))
1021                 {
1022                     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1023                                     "  String not equal...\n"));
1024                     goto fail_finish;
1025                 }
1026 
1027                 check_at = s;
1028                 goto success_at_start;
1029 	    }
1030 	}
1031     }
1032 
1033     end_shift = prog->check_end_shift;
1034 
1035 #ifdef DEBUGGING	/* 7/99: reports of failure (with the older version) */
1036     if (end_shift < 0)
1037 	Perl_croak(aTHX_ "panic: end_shift: %" IVdf " pattern:\n%s\n ",
1038 		   (IV)end_shift, RX_PRECOMP(rx));
1039 #endif
1040 
1041   restart:
1042 
1043     /* This is the (re)entry point of the main loop in this function.
1044      * The goal of this loop is to:
1045      * 1) find the "check" substring in the region rx_origin..strend
1046      *    (adjusted by start_shift / end_shift). If not found, reject
1047      *    immediately.
1048      * 2) If it exists, look for the "other" substr too if defined; for
1049      *    example, if the check substr maps to the anchored substr, then
1050      *    check the floating substr, and vice-versa. If not found, go
1051      *    back to (1) with rx_origin suitably incremented.
1052      * 3) If we find an rx_origin position that doesn't contradict
1053      *    either of the substrings, then check the possible additional
1054      *    constraints on rx_origin of /^.../m or a known start class.
1055      *    If these fail, then depending on which constraints fail, jump
1056      *    back to here, or to various other re-entry points further along
1057      *    that skip some of the first steps.
1058      * 4) If we pass all those tests, update the BmUSEFUL() count on the
1059      *    substring. If the start position was determined to be at the
1060      *    beginning of the string  - so, not rejected, but not optimised,
1061      *    since we have to run regmatch from position 0 - decrement the
1062      *    BmUSEFUL() count. Otherwise increment it.
1063      */
1064 
1065 
1066     /* first, look for the 'check' substring */
1067 
1068     {
1069         U8* start_point;
1070         U8* end_point;
1071 
1072         DEBUG_OPTIMISE_MORE_r({
1073             Perl_re_printf( aTHX_
1074                 "  At restart: rx_origin=%" IVdf " Check offset min: %" IVdf
1075                 " Start shift: %" IVdf " End shift %" IVdf
1076                 " Real end Shift: %" IVdf "\n",
1077                 (IV)(rx_origin - strbeg),
1078                 (IV)prog->check_offset_min,
1079                 (IV)start_shift,
1080                 (IV)end_shift,
1081                 (IV)prog->check_end_shift);
1082         });
1083 
1084         end_point = HOPBACK3(strend, end_shift, rx_origin);
1085         if (!end_point)
1086             goto fail_finish;
1087         start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
1088         if (!start_point)
1089             goto fail_finish;
1090 
1091 
1092         /* If the regex is absolutely anchored to either the start of the
1093          * string (SBOL) or to pos() (ANCH_GPOS), then
1094          * check_offset_max represents an upper bound on the string where
1095          * the substr could start. For the ANCH_GPOS case, we assume that
1096          * the caller of intuit will have already set strpos to
1097          * pos()-gofs, so in this case strpos + offset_max will still be
1098          * an upper bound on the substr.
1099          */
1100         if (!ml_anch
1101             && prog->intflags & PREGf_ANCH
1102             && prog->check_offset_max != SSize_t_MAX)
1103         {
1104             SSize_t check_len = SvCUR(check) - !!SvTAIL(check);
1105             const char * const anchor =
1106                         (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
1107             SSize_t targ_len = (char*)end_point - anchor;
1108 
1109             if (check_len > targ_len) {
1110                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1111 			      "Target string too short to match required substring...\n"));
1112                 goto fail_finish;
1113             }
1114 
1115             /* do a bytes rather than chars comparison. It's conservative;
1116              * so it skips doing the HOP if the result can't possibly end
1117              * up earlier than the old value of end_point.
1118              */
1119             assert(anchor + check_len <= (char *)end_point);
1120             if (prog->check_offset_max + check_len < targ_len) {
1121                 end_point = HOP3lim((U8*)anchor,
1122                                 prog->check_offset_max,
1123                                 end_point - check_len
1124                             )
1125                             + check_len;
1126                 if (end_point < start_point)
1127                     goto fail_finish;
1128             }
1129         }
1130 
1131 	check_at = fbm_instr( start_point, end_point,
1132 		      check, multiline ? FBMrf_MULTILINE : 0);
1133 
1134         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1135             "  doing 'check' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
1136             (IV)((char*)start_point - strbeg),
1137             (IV)((char*)end_point   - strbeg),
1138             (IV)(check_at ? check_at - strbeg : -1)
1139         ));
1140 
1141         /* Update the count-of-usability, remove useless subpatterns,
1142             unshift s.  */
1143 
1144         DEBUG_EXECUTE_r({
1145             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1146                 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
1147             Perl_re_printf( aTHX_  "  %s %s substr %s%s%s",
1148                               (check_at ? "Found" : "Did not find"),
1149                 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
1150                     ? "anchored" : "floating"),
1151                 quoted,
1152                 RE_SV_TAIL(check),
1153                 (check_at ? " at offset " : "...\n") );
1154         });
1155 
1156         if (!check_at)
1157             goto fail_finish;
1158         /* set rx_origin to the minimum position where the regex could start
1159          * matching, given the constraint of the just-matched check substring.
1160          * But don't set it lower than previously.
1161          */
1162 
1163         if (check_at - rx_origin > prog->check_offset_max)
1164             rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
1165         /* Finish the diagnostic message */
1166         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1167             "%ld (rx_origin now %" IVdf ")...\n",
1168             (long)(check_at - strbeg),
1169             (IV)(rx_origin - strbeg)
1170         ));
1171     }
1172 
1173 
1174     /* now look for the 'other' substring if defined */
1175 
1176     if (prog->substrs->data[other_ix].utf8_substr
1177         || prog->substrs->data[other_ix].substr)
1178     {
1179 	/* Take into account the "other" substring. */
1180         char *last, *last1;
1181         char *s;
1182         SV* must;
1183         struct reg_substr_datum *other;
1184 
1185       do_other_substr:
1186         other = &prog->substrs->data[other_ix];
1187         if (!utf8_target && !other->substr) {
1188             if (!to_byte_substr(prog)) {
1189                 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
1190             }
1191         }
1192 
1193         /* if "other" is anchored:
1194          * we've previously found a floating substr starting at check_at.
1195          * This means that the regex origin must lie somewhere
1196          * between min (rx_origin): HOP3(check_at, -check_offset_max)
1197          * and max:                 HOP3(check_at, -check_offset_min)
1198          * (except that min will be >= strpos)
1199          * So the fixed  substr must lie somewhere between
1200          *  HOP3(min, anchored_offset)
1201          *  HOP3(max, anchored_offset) + SvCUR(substr)
1202          */
1203 
1204         /* if "other" is floating
1205          * Calculate last1, the absolute latest point where the
1206          * floating substr could start in the string, ignoring any
1207          * constraints from the earlier fixed match. It is calculated
1208          * as follows:
1209          *
1210          * strend - prog->minlen (in chars) is the absolute latest
1211          * position within the string where the origin of the regex
1212          * could appear. The latest start point for the floating
1213          * substr is float_min_offset(*) on from the start of the
1214          * regex.  last1 simply combines thee two offsets.
1215          *
1216          * (*) You might think the latest start point should be
1217          * float_max_offset from the regex origin, and technically
1218          * you'd be correct. However, consider
1219          *    /a\d{2,4}bcd\w/
1220          * Here, float min, max are 3,5 and minlen is 7.
1221          * This can match either
1222          *    /a\d\dbcd\w/
1223          *    /a\d\d\dbcd\w/
1224          *    /a\d\d\d\dbcd\w/
1225          * In the first case, the regex matches minlen chars; in the
1226          * second, minlen+1, in the third, minlen+2.
1227          * In the first case, the floating offset is 3 (which equals
1228          * float_min), in the second, 4, and in the third, 5 (which
1229          * equals float_max). In all cases, the floating string bcd
1230          * can never start more than 4 chars from the end of the
1231          * string, which equals minlen - float_min. As the substring
1232          * starts to match more than float_min from the start of the
1233          * regex, it makes the regex match more than minlen chars,
1234          * and the two cancel each other out. So we can always use
1235          * float_min - minlen, rather than float_max - minlen for the
1236          * latest position in the string.
1237          *
1238          * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1239          * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1240          */
1241 
1242         assert(prog->minlen >= other->min_offset);
1243         last1 = HOP3c(strend,
1244                         other->min_offset - prog->minlen, strbeg);
1245 
1246         if (other_ix) {/* i.e. if (other-is-float) */
1247             /* last is the latest point where the floating substr could
1248              * start, *given* any constraints from the earlier fixed
1249              * match. This constraint is that the floating string starts
1250              * <= float_max_offset chars from the regex origin (rx_origin).
1251              * If this value is less than last1, use it instead.
1252              */
1253             assert(rx_origin <= last1);
1254             last =
1255                 /* this condition handles the offset==infinity case, and
1256                  * is a short-cut otherwise. Although it's comparing a
1257                  * byte offset to a char length, it does so in a safe way,
1258                  * since 1 char always occupies 1 or more bytes,
1259                  * so if a string range is  (last1 - rx_origin) bytes,
1260                  * it will be less than or equal to  (last1 - rx_origin)
1261                  * chars; meaning it errs towards doing the accurate HOP3
1262                  * rather than just using last1 as a short-cut */
1263                 (last1 - rx_origin) < other->max_offset
1264                     ? last1
1265                     : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1266         }
1267         else {
1268             assert(strpos + start_shift <= check_at);
1269             last = HOP4c(check_at, other->min_offset - start_shift,
1270                         strbeg, strend);
1271         }
1272 
1273         s = HOP3c(rx_origin, other->min_offset, strend);
1274         if (s < other_last)	/* These positions already checked */
1275             s = other_last;
1276 
1277         must = utf8_target ? other->utf8_substr : other->substr;
1278         assert(SvPOK(must));
1279         {
1280             char *from = s;
1281             char *to   = last + SvCUR(must) - (SvTAIL(must)!=0);
1282 
1283             if (to > strend)
1284                 to = strend;
1285             if (from > to) {
1286                 s = NULL;
1287                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1288                     "  skipping 'other' fbm scan: %" IVdf " > %" IVdf "\n",
1289                     (IV)(from - strbeg),
1290                     (IV)(to   - strbeg)
1291                 ));
1292             }
1293             else {
1294                 s = fbm_instr(
1295                     (unsigned char*)from,
1296                     (unsigned char*)to,
1297                     must,
1298                     multiline ? FBMrf_MULTILINE : 0
1299                 );
1300                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1301                     "  doing 'other' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
1302                     (IV)(from - strbeg),
1303                     (IV)(to   - strbeg),
1304                     (IV)(s ? s - strbeg : -1)
1305                 ));
1306             }
1307         }
1308 
1309         DEBUG_EXECUTE_r({
1310             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1311                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1312             Perl_re_printf( aTHX_  "  %s %s substr %s%s",
1313                 s ? "Found" : "Contradicts",
1314                 other_ix ? "floating" : "anchored",
1315                 quoted, RE_SV_TAIL(must));
1316         });
1317 
1318 
1319         if (!s) {
1320             /* last1 is latest possible substr location. If we didn't
1321              * find it before there, we never will */
1322             if (last >= last1) {
1323                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1324                                         "; giving up...\n"));
1325                 goto fail_finish;
1326             }
1327 
1328             /* try to find the check substr again at a later
1329              * position. Maybe next time we'll find the "other" substr
1330              * in range too */
1331             other_last = HOP3c(last, 1, strend) /* highest failure */;
1332             rx_origin =
1333                 other_ix /* i.e. if other-is-float */
1334                     ? HOP3c(rx_origin, 1, strend)
1335                     : HOP4c(last, 1 - other->min_offset, strbeg, strend);
1336             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1337                 "; about to retry %s at offset %ld (rx_origin now %" IVdf ")...\n",
1338                 (other_ix ? "floating" : "anchored"),
1339                 (long)(HOP3c(check_at, 1, strend) - strbeg),
1340                 (IV)(rx_origin - strbeg)
1341             ));
1342             goto restart;
1343         }
1344         else {
1345             if (other_ix) { /* if (other-is-float) */
1346                 /* other_last is set to s, not s+1, since its possible for
1347                  * a floating substr to fail first time, then succeed
1348                  * second time at the same floating position; e.g.:
1349                  *     "-AB--AABZ" =~ /\wAB\d*Z/
1350                  * The first time round, anchored and float match at
1351                  * "-(AB)--AAB(Z)" then fail on the initial \w character
1352                  * class. Second time round, they match at "-AB--A(AB)(Z)".
1353                  */
1354                 other_last = s;
1355             }
1356             else {
1357                 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1358                 other_last = HOP3c(s, 1, strend);
1359             }
1360             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1361                 " at offset %ld (rx_origin now %" IVdf ")...\n",
1362                   (long)(s - strbeg),
1363                 (IV)(rx_origin - strbeg)
1364               ));
1365 
1366         }
1367     }
1368     else {
1369         DEBUG_OPTIMISE_MORE_r(
1370             Perl_re_printf( aTHX_
1371                 "  Check-only match: offset min:%" IVdf " max:%" IVdf
1372                 " check_at:%" IVdf " rx_origin:%" IVdf " rx_origin-check_at:%" IVdf
1373                 " strend:%" IVdf "\n",
1374                 (IV)prog->check_offset_min,
1375                 (IV)prog->check_offset_max,
1376                 (IV)(check_at-strbeg),
1377                 (IV)(rx_origin-strbeg),
1378                 (IV)(rx_origin-check_at),
1379                 (IV)(strend-strbeg)
1380             )
1381         );
1382     }
1383 
1384   postprocess_substr_matches:
1385 
1386     /* handle the extra constraint of /^.../m if present */
1387 
1388     if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
1389         char *s;
1390 
1391         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1392                         "  looking for /^/m anchor"));
1393 
1394         /* we have failed the constraint of a \n before rx_origin.
1395          * Find the next \n, if any, even if it's beyond the current
1396          * anchored and/or floating substrings. Whether we should be
1397          * scanning ahead for the next \n or the next substr is debatable.
1398          * On the one hand you'd expect rare substrings to appear less
1399          * often than \n's. On the other hand, searching for \n means
1400          * we're effectively flipping between check_substr and "\n" on each
1401          * iteration as the current "rarest" string candidate, which
1402          * means for example that we'll quickly reject the whole string if
1403          * hasn't got a \n, rather than trying every substr position
1404          * first
1405          */
1406 
1407         s = HOP3c(strend, - prog->minlen, strpos);
1408         if (s <= rx_origin ||
1409             ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1410         {
1411             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1412                             "  Did not find /%s^%s/m...\n",
1413                             PL_colors[0], PL_colors[1]));
1414             goto fail_finish;
1415         }
1416 
1417         /* earliest possible origin is 1 char after the \n.
1418          * (since *rx_origin == '\n', it's safe to ++ here rather than
1419          * HOP(rx_origin, 1)) */
1420         rx_origin++;
1421 
1422         if (prog->substrs->check_ix == 0  /* check is anchored */
1423             || rx_origin >= HOP3c(check_at,  - prog->check_offset_min, strpos))
1424         {
1425             /* Position contradicts check-string; either because
1426              * check was anchored (and thus has no wiggle room),
1427              * or check was float and rx_origin is above the float range */
1428             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1429                 "  Found /%s^%s/m, about to restart lookup for check-string with rx_origin %ld...\n",
1430                 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1431             goto restart;
1432         }
1433 
1434         /* if we get here, the check substr must have been float,
1435          * is in range, and we may or may not have had an anchored
1436          * "other" substr which still contradicts */
1437         assert(prog->substrs->check_ix); /* check is float */
1438 
1439         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1440             /* whoops, the anchored "other" substr exists, so we still
1441              * contradict. On the other hand, the float "check" substr
1442              * didn't contradict, so just retry the anchored "other"
1443              * substr */
1444             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1445                 "  Found /%s^%s/m, rescanning for anchored from offset %" IVdf " (rx_origin now %" IVdf ")...\n",
1446                 PL_colors[0], PL_colors[1],
1447                 (IV)(rx_origin - strbeg + prog->anchored_offset),
1448                 (IV)(rx_origin - strbeg)
1449             ));
1450             goto do_other_substr;
1451         }
1452 
1453         /* success: we don't contradict the found floating substring
1454          * (and there's no anchored substr). */
1455         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1456             "  Found /%s^%s/m with rx_origin %ld...\n",
1457             PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1458     }
1459     else {
1460         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1461             "  (multiline anchor test skipped)\n"));
1462     }
1463 
1464   success_at_start:
1465 
1466 
1467     /* if we have a starting character class, then test that extra constraint.
1468      * (trie stclasses are too expensive to use here, we are better off to
1469      * leave it to regmatch itself) */
1470 
1471     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1472         const U8* const str = (U8*)STRING(progi->regstclass);
1473 
1474         /* XXX this value could be pre-computed */
1475         const SSize_t cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1476 		    ?  (reginfo->is_utf8_pat
1477                         ? (SSize_t)utf8_distance(str + STR_LEN(progi->regstclass), str)
1478                         : (SSize_t)STR_LEN(progi->regstclass))
1479 		    : 1);
1480 	char * endpos;
1481         char *s;
1482         /* latest pos that a matching float substr constrains rx start to */
1483         char *rx_max_float = NULL;
1484 
1485         /* if the current rx_origin is anchored, either by satisfying an
1486          * anchored substring constraint, or a /^.../m constraint, then we
1487          * can reject the current origin if the start class isn't found
1488          * at the current position. If we have a float-only match, then
1489          * rx_origin is constrained to a range; so look for the start class
1490          * in that range. if neither, then look for the start class in the
1491          * whole rest of the string */
1492 
1493         /* XXX DAPM it's not clear what the minlen test is for, and why
1494          * it's not used in the floating case. Nothing in the test suite
1495          * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1496          * Here are some old comments, which may or may not be correct:
1497          *
1498 	 *   minlen == 0 is possible if regstclass is \b or \B,
1499 	 *   and the fixed substr is ''$.
1500          *   Since minlen is already taken into account, rx_origin+1 is
1501          *   before strend; accidentally, minlen >= 1 guaranties no false
1502          *   positives at rx_origin + 1 even for \b or \B.  But (minlen? 1 :
1503          *   0) below assumes that regstclass does not come from lookahead...
1504 	 *   If regstclass takes bytelength more than 1: If charlength==1, OK.
1505          *   This leaves EXACTF-ish only, which are dealt with in
1506          *   find_byclass().
1507          */
1508 
1509 	if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1510             endpos = HOP3clim(rx_origin, (prog->minlen ? cl_l : 0), strend);
1511         else if (prog->float_substr || prog->float_utf8) {
1512 	    rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1513 	    endpos = HOP3clim(rx_max_float, cl_l, strend);
1514         }
1515         else
1516             endpos= strend;
1517 
1518         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1519             "  looking for class: start_shift: %" IVdf " check_at: %" IVdf
1520             " rx_origin: %" IVdf " endpos: %" IVdf "\n",
1521               (IV)start_shift, (IV)(check_at - strbeg),
1522               (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
1523 
1524         s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
1525                             reginfo);
1526 	if (!s) {
1527 	    if (endpos == strend) {
1528                 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1529 				"  Could not match STCLASS...\n") );
1530 		goto fail;
1531 	    }
1532             DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1533                                "  This position contradicts STCLASS...\n") );
1534             if ((prog->intflags & PREGf_ANCH) && !ml_anch
1535                         && !(prog->intflags & PREGf_IMPLICIT))
1536 		goto fail;
1537 
1538 	    /* Contradict one of substrings */
1539 	    if (prog->anchored_substr || prog->anchored_utf8) {
1540                 if (prog->substrs->check_ix == 1) { /* check is float */
1541                     /* Have both, check_string is floating */
1542                     assert(rx_origin + start_shift <= check_at);
1543                     if (rx_origin + start_shift != check_at) {
1544                         /* not at latest position float substr could match:
1545                          * Recheck anchored substring, but not floating.
1546                          * The condition above is in bytes rather than
1547                          * chars for efficiency. It's conservative, in
1548                          * that it errs on the side of doing 'goto
1549                          * do_other_substr'. In this case, at worst,
1550                          * an extra anchored search may get done, but in
1551                          * practice the extra fbm_instr() is likely to
1552                          * get skipped anyway. */
1553                         DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1554                             "  about to retry anchored at offset %ld (rx_origin now %" IVdf ")...\n",
1555                             (long)(other_last - strbeg),
1556                             (IV)(rx_origin - strbeg)
1557                         ));
1558                         goto do_other_substr;
1559                     }
1560                 }
1561             }
1562 	    else {
1563                 /* float-only */
1564 
1565                 if (ml_anch) {
1566                     /* In the presence of ml_anch, we might be able to
1567                      * find another \n without breaking the current float
1568                      * constraint. */
1569 
1570                     /* strictly speaking this should be HOP3c(..., 1, ...),
1571                      * but since we goto a block of code that's going to
1572                      * search for the next \n if any, its safe here */
1573                     rx_origin++;
1574                     DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1575                               "  about to look for /%s^%s/m starting at rx_origin %ld...\n",
1576                               PL_colors[0], PL_colors[1],
1577                               (long)(rx_origin - strbeg)) );
1578                     goto postprocess_substr_matches;
1579                 }
1580 
1581                 /* strictly speaking this can never be true; but might
1582                  * be if we ever allow intuit without substrings */
1583                 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
1584                     goto fail;
1585 
1586                 rx_origin = rx_max_float;
1587             }
1588 
1589             /* at this point, any matching substrings have been
1590              * contradicted. Start again... */
1591 
1592             rx_origin = HOP3c(rx_origin, 1, strend);
1593 
1594             /* uses bytes rather than char calculations for efficiency.
1595              * It's conservative: it errs on the side of doing 'goto restart',
1596              * where there is code that does a proper char-based test */
1597             if (rx_origin + start_shift + end_shift > strend) {
1598                 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1599                                        "  Could not match STCLASS...\n") );
1600                 goto fail;
1601             }
1602             DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1603                 "  about to look for %s substr starting at offset %ld (rx_origin now %" IVdf ")...\n",
1604                 (prog->substrs->check_ix ? "floating" : "anchored"),
1605                 (long)(rx_origin + start_shift - strbeg),
1606                 (IV)(rx_origin - strbeg)
1607             ));
1608             goto restart;
1609 	}
1610 
1611         /* Success !!! */
1612 
1613 	if (rx_origin != s) {
1614             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1615 			"  By STCLASS: moving %ld --> %ld\n",
1616                                   (long)(rx_origin - strbeg), (long)(s - strbeg))
1617                    );
1618         }
1619         else {
1620             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1621                                   "  Does not contradict STCLASS...\n");
1622                    );
1623         }
1624     }
1625 
1626     /* Decide whether using the substrings helped */
1627 
1628     if (rx_origin != strpos) {
1629 	/* Fixed substring is found far enough so that the match
1630 	   cannot start at strpos. */
1631 
1632         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "  try at offset...\n"));
1633 	++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);	/* hooray/5 */
1634     }
1635     else {
1636         /* The found rx_origin position does not prohibit matching at
1637          * strpos, so calling intuit didn't gain us anything. Decrement
1638          * the BmUSEFUL() count on the check substring, and if we reach
1639          * zero, free it.  */
1640 	if (!(prog->intflags & PREGf_NAUGHTY)
1641 	    && (utf8_target ? (
1642 		prog->check_utf8		/* Could be deleted already */
1643 		&& --BmUSEFUL(prog->check_utf8) < 0
1644 		&& (prog->check_utf8 == prog->float_utf8)
1645 	    ) : (
1646 		prog->check_substr		/* Could be deleted already */
1647 		&& --BmUSEFUL(prog->check_substr) < 0
1648 		&& (prog->check_substr == prog->float_substr)
1649 	    )))
1650 	{
1651 	    /* If flags & SOMETHING - do not do it many times on the same match */
1652             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "  ... Disabling check substring...\n"));
1653 	    /* XXX Does the destruction order has to change with utf8_target? */
1654 	    SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1655 	    SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1656 	    prog->check_substr = prog->check_utf8 = NULL;	/* disable */
1657 	    prog->float_substr = prog->float_utf8 = NULL;	/* clear */
1658 	    check = NULL;			/* abort */
1659 	    /* XXXX This is a remnant of the old implementation.  It
1660 	            looks wasteful, since now INTUIT can use many
1661 	            other heuristics. */
1662 	    prog->extflags &= ~RXf_USE_INTUIT;
1663 	}
1664     }
1665 
1666     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1667             "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
1668              PL_colors[4], PL_colors[5], (long)(rx_origin - strbeg)) );
1669 
1670     return rx_origin;
1671 
1672   fail_finish:				/* Substring not found */
1673     if (prog->check_substr || prog->check_utf8)		/* could be removed already */
1674 	BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1675   fail:
1676     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch rejected by optimizer%s\n",
1677 			  PL_colors[4], PL_colors[5]));
1678     return NULL;
1679 }
1680 
1681 
1682 #define DECL_TRIE_TYPE(scan) \
1683     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold,       \
1684                  trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold,              \
1685                  trie_utf8l, trie_flu8, trie_flu8_latin }                           \
1686                     trie_type = ((scan->flags == EXACT)                             \
1687                                  ? (utf8_target ? trie_utf8 : trie_plain)           \
1688                                  : (scan->flags == EXACTL)                          \
1689                                     ? (utf8_target ? trie_utf8l : trie_plain)       \
1690                                     : (scan->flags == EXACTFAA)                     \
1691                                       ? (utf8_target                                \
1692                                          ? trie_utf8_exactfa_fold                   \
1693                                          : trie_latin_utf8_exactfa_fold)            \
1694                                       : (scan->flags == EXACTFLU8                   \
1695                                          ? (utf8_target                             \
1696                                            ? trie_flu8                              \
1697                                            : trie_flu8_latin)                       \
1698                                          : (utf8_target                             \
1699                                            ? trie_utf8_fold                         \
1700                                            : trie_latin_utf8_fold)))
1701 
1702 /* 'uscan' is set to foldbuf, and incremented, so below the end of uscan is
1703  * 'foldbuf+sizeof(foldbuf)' */
1704 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uc_end, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1705 STMT_START {                                                                        \
1706     STRLEN skiplen;                                                                 \
1707     U8 flags = FOLD_FLAGS_FULL;                                                     \
1708     switch (trie_type) {                                                            \
1709     case trie_flu8:                                                                 \
1710         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1711         if (UTF8_IS_ABOVE_LATIN1(*uc)) {                                            \
1712             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end);                     \
1713         }                                                                           \
1714         goto do_trie_utf8_fold;                                                     \
1715     case trie_utf8_exactfa_fold:                                                    \
1716         flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1717         /* FALLTHROUGH */                                                           \
1718     case trie_utf8_fold:                                                            \
1719       do_trie_utf8_fold:                                                            \
1720         if ( foldlen>0 ) {                                                          \
1721             uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags );     \
1722             foldlen -= len;                                                         \
1723             uscan += len;                                                           \
1724             len=0;                                                                  \
1725         } else {                                                                    \
1726             uvc = _toFOLD_utf8_flags( (const U8*) uc, uc_end, foldbuf, &foldlen,    \
1727                                                                             flags); \
1728             len = UTF8_SAFE_SKIP(uc, uc_end);                                       \
1729             skiplen = UVCHR_SKIP( uvc );                                            \
1730             foldlen -= skiplen;                                                     \
1731             uscan = foldbuf + skiplen;                                              \
1732         }                                                                           \
1733         break;                                                                      \
1734     case trie_flu8_latin:                                                           \
1735         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1736         goto do_trie_latin_utf8_fold;                                               \
1737     case trie_latin_utf8_exactfa_fold:                                              \
1738         flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1739         /* FALLTHROUGH */                                                           \
1740     case trie_latin_utf8_fold:                                                      \
1741       do_trie_latin_utf8_fold:                                                      \
1742         if ( foldlen>0 ) {                                                          \
1743             uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags );     \
1744             foldlen -= len;                                                         \
1745             uscan += len;                                                           \
1746             len=0;                                                                  \
1747         } else {                                                                    \
1748             len = 1;                                                                \
1749             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags);             \
1750             skiplen = UVCHR_SKIP( uvc );                                            \
1751             foldlen -= skiplen;                                                     \
1752             uscan = foldbuf + skiplen;                                              \
1753         }                                                                           \
1754         break;                                                                      \
1755     case trie_utf8l:                                                                \
1756         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1757         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) {                             \
1758             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end);                     \
1759         }                                                                           \
1760         /* FALLTHROUGH */                                                           \
1761     case trie_utf8:                                                                 \
1762         uvc = utf8n_to_uvchr( (const U8*) uc, uc_end - uc, &len, uniflags );        \
1763         break;                                                                      \
1764     case trie_plain:                                                                \
1765         uvc = (UV)*uc;                                                              \
1766         len = 1;                                                                    \
1767     }                                                                               \
1768     if (uvc < 256) {                                                                \
1769         charid = trie->charmap[ uvc ];                                              \
1770     }                                                                               \
1771     else {                                                                          \
1772         charid = 0;                                                                 \
1773         if (widecharmap) {                                                          \
1774             SV** const svpp = hv_fetch(widecharmap,                                 \
1775                         (char*)&uvc, sizeof(UV), 0);                                \
1776             if (svpp)                                                               \
1777                 charid = (U16)SvIV(*svpp);                                          \
1778         }                                                                           \
1779     }                                                                               \
1780 } STMT_END
1781 
1782 #define DUMP_EXEC_POS(li,s,doutf8,depth)                    \
1783     dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1784                 startpos, doutf8, depth)
1785 
1786 #define REXEC_FBC_SCAN(UTF8, CODE)                          \
1787     STMT_START {                                            \
1788         while (s < strend) {                                \
1789             CODE                                            \
1790             s += ((UTF8)                                    \
1791                   ? UTF8_SAFE_SKIP(s, reginfo->strend)      \
1792                   : 1);                                     \
1793         }                                                   \
1794     } STMT_END
1795 
1796 #define REXEC_FBC_CLASS_SCAN(UTF8, COND)                    \
1797     STMT_START {                                            \
1798         while (s < strend) {                                \
1799             REXEC_FBC_CLASS_SCAN_GUTS(UTF8, COND)           \
1800         }                                                   \
1801     } STMT_END
1802 
1803 #define REXEC_FBC_CLASS_SCAN_GUTS(UTF8, COND)                  \
1804     if (COND) {                                                \
1805         FBC_CHECK_AND_TRY                                      \
1806         s += ((UTF8) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1);\
1807         previous_occurrence_end = s;                           \
1808     }                                                          \
1809     else {                                                     \
1810         s += ((UTF8) ? UTF8SKIP(s) : 1);                       \
1811     }
1812 
1813 #define REXEC_FBC_CSCAN(CONDUTF8,COND)                         \
1814     if (utf8_target) {                                         \
1815 	REXEC_FBC_CLASS_SCAN(1, CONDUTF8);                     \
1816     }                                                          \
1817     else {                                                     \
1818 	REXEC_FBC_CLASS_SCAN(0, COND);                         \
1819     }
1820 
1821 /* We keep track of where the next character should start after an occurrence
1822  * of the one we're looking for.  Knowing that, we can see right away if the
1823  * next occurrence is adjacent to the previous.  When 'doevery' is FALSE, we
1824  * don't accept the 2nd and succeeding adjacent occurrences */
1825 #define FBC_CHECK_AND_TRY                                           \
1826         if (   (   doevery                                          \
1827                 || s != previous_occurrence_end)                    \
1828             && (   reginfo->intuit                                  \
1829                 || (s <= reginfo->strend && regtry(reginfo, &s))))  \
1830         {                                                           \
1831             goto got_it;                                            \
1832         }
1833 
1834 
1835 /* This differs from the above macros in that it calls a function which returns
1836  * the next occurrence of the thing being looked for in 's'; and 'strend' if
1837  * there is no such occurrence. */
1838 #define REXEC_FBC_FIND_NEXT_SCAN(UTF8, f)                   \
1839     while (s < strend) {                                    \
1840         s = (f);                                            \
1841         if (s >= strend) {                                  \
1842             break;                                          \
1843         }                                                   \
1844                                                             \
1845         FBC_CHECK_AND_TRY                                   \
1846         s += (UTF8) ? UTF8SKIP(s) : 1;                      \
1847         previous_occurrence_end = s;                        \
1848     }
1849 
1850 /* This differs from the above macros in that it is passed a single byte that
1851  * is known to begin the next occurrence of the thing being looked for in 's'.
1852  * It does a memchr to find the next occurrence of 'byte', before trying 'COND'
1853  * at that position. */
1854 #define REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(byte, COND)      \
1855     while (s < strend) {                                    \
1856         s = (char *) memchr(s, byte, strend -s);            \
1857         if (s == NULL) {                                    \
1858             s = (char *) strend;                            \
1859             break;                                          \
1860         }                                                   \
1861                                                             \
1862         if (COND) {                                         \
1863             FBC_CHECK_AND_TRY                               \
1864             s += UTF8_SAFE_SKIP(s, reginfo->strend);        \
1865             previous_occurrence_end = s;                    \
1866         }                                                   \
1867         else {                                              \
1868             s += UTF8SKIP(s);                               \
1869         }                                                   \
1870     }
1871 
1872 /* The three macros below are slightly different versions of the same logic.
1873  *
1874  * The first is for /a and /aa when the target string is UTF-8.  This can only
1875  * match ascii, but it must advance based on UTF-8.   The other two handle the
1876  * non-UTF-8 and the more generic UTF-8 cases.   In all three, we are looking
1877  * for the boundary (or non-boundary) between a word and non-word character.
1878  * The utf8 and non-utf8 cases have the same logic, but the details must be
1879  * different.  Find the "wordness" of the character just prior to this one, and
1880  * compare it with the wordness of this one.  If they differ, we have a
1881  * boundary.  At the beginning of the string, pretend that the previous
1882  * character was a new-line.
1883  *
1884  * All these macros uncleanly have side-effects with each other and outside
1885  * variables.  So far it's been too much trouble to clean-up
1886  *
1887  * TEST_NON_UTF8 is the macro or function to call to test if its byte input is
1888  *               a word character or not.
1889  * IF_SUCCESS    is code to do if it finds that we are at a boundary between
1890  *               word/non-word
1891  * IF_FAIL       is code to do if we aren't at a boundary between word/non-word
1892  *
1893  * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we
1894  * are looking for a boundary or for a non-boundary.  If we are looking for a
1895  * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and
1896  * see if this tentative match actually works, and if so, to quit the loop
1897  * here.  And vice-versa if we are looking for a non-boundary.
1898  *
1899  * 'tmp' below in the next three macros in the REXEC_FBC_SCAN and
1900  * REXEC_FBC_SCAN loops is a loop invariant, a bool giving the return of
1901  * TEST_NON_UTF8(s-1).  To see this, note that that's what it is defined to be
1902  * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
1903  * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
1904  * complement.  But in that branch we complement tmp, meaning that at the
1905  * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s),
1906  * which means at the top of the loop in the next iteration, it is
1907  * TEST_NON_UTF8(s-1) */
1908 #define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)                         \
1909     tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                      \
1910     tmp = TEST_NON_UTF8(tmp);                                                  \
1911     REXEC_FBC_SCAN(1,  /* 1=>is-utf8; advances s while s < strend */           \
1912         if (tmp == ! TEST_NON_UTF8((U8) *s)) {                                 \
1913             tmp = !tmp;                                                        \
1914             IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */     \
1915         }                                                                      \
1916         else {                                                                 \
1917             IF_FAIL;                                                           \
1918         }                                                                      \
1919     );                                                                         \
1920 
1921 /* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
1922  * TEST_UTF8 is a macro that for the same input code points returns identically
1923  * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead */
1924 #define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL)                      \
1925     if (s == reginfo->strbeg) {                                                \
1926         tmp = '\n';                                                            \
1927     }                                                                          \
1928     else { /* Back-up to the start of the previous character */                \
1929         U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg);              \
1930         tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r,                     \
1931                                                        0, UTF8_ALLOW_DEFAULT); \
1932     }                                                                          \
1933     tmp = TEST_UV(tmp);                                                        \
1934     REXEC_FBC_SCAN(1,  /* 1=>is-utf8; advances s while s < strend */           \
1935         if (tmp == ! (TEST_UTF8((U8 *) s, (U8 *) reginfo->strend))) {          \
1936             tmp = !tmp;                                                        \
1937             IF_SUCCESS;                                                        \
1938         }                                                                      \
1939         else {                                                                 \
1940             IF_FAIL;                                                           \
1941         }                                                                      \
1942     );
1943 
1944 /* Like the above two macros.  UTF8_CODE is the complete code for handling
1945  * UTF-8.  Common to the BOUND and NBOUND cases, set-up by the FBC_BOUND, etc
1946  * macros below */
1947 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)        \
1948     if (utf8_target) {                                                         \
1949         UTF8_CODE                                                              \
1950     }                                                                          \
1951     else {  /* Not utf8 */                                                     \
1952 	tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                  \
1953 	tmp = TEST_NON_UTF8(tmp);                                              \
1954 	REXEC_FBC_SCAN(0, /* 0=>not-utf8; advances s while s < strend */       \
1955 	    if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1956 		IF_SUCCESS;                                                    \
1957 		tmp = !tmp;                                                    \
1958 	    }                                                                  \
1959 	    else {                                                             \
1960 		IF_FAIL;                                                       \
1961 	    }                                                                  \
1962 	);                                                                     \
1963     }                                                                          \
1964     /* Here, things have been set up by the previous code so that tmp is the   \
1965      * return of TEST_NON_UTF(s-1) or TEST_UTF8(s-1) (depending on the         \
1966      * utf8ness of the target).  We also have to check if this matches against \
1967      * the EOS, which we treat as a \n (which is the same value in both UTF-8  \
1968      * or non-UTF8, so can use the non-utf8 test condition even for a UTF-8    \
1969      * string */                                                               \
1970     if (tmp == ! TEST_NON_UTF8('\n')) {                                        \
1971         IF_SUCCESS;                                                            \
1972     }                                                                          \
1973     else {                                                                     \
1974         IF_FAIL;                                                               \
1975     }
1976 
1977 /* This is the macro to use when we want to see if something that looks like it
1978  * could match, actually does, and if so exits the loop.  It needs to be used
1979  * only for bounds checking macros, as it allows for matching beyond the end of
1980  * string (which should be zero length without having to look at the string
1981  * contents) */
1982 #define REXEC_FBC_TRYIT                                                     \
1983     if (reginfo->intuit || (s <= reginfo->strend && regtry(reginfo, &s)))   \
1984         goto got_it
1985 
1986 /* The only difference between the BOUND and NBOUND cases is that
1987  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1988  * NBOUND.  This is accomplished by passing it as either the if or else clause,
1989  * with the other one being empty (PLACEHOLDER is defined as empty).
1990  *
1991  * The TEST_FOO parameters are for operating on different forms of input, but
1992  * all should be ones that return identically for the same underlying code
1993  * points */
1994 #define FBC_BOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                           \
1995     FBC_BOUND_COMMON(                                                          \
1996           FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),          \
1997           TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1998 
1999 #define FBC_BOUND_A(TEST_NON_UTF8)                                             \
2000     FBC_BOUND_COMMON(                                                          \
2001             FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),           \
2002             TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2003 
2004 #define FBC_NBOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                          \
2005     FBC_BOUND_COMMON(                                                          \
2006           FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),          \
2007           TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2008 
2009 #define FBC_NBOUND_A(TEST_NON_UTF8)                                            \
2010     FBC_BOUND_COMMON(                                                          \
2011             FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),           \
2012             TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2013 
2014 #ifdef DEBUGGING
2015 static IV
2016 S_get_break_val_cp_checked(SV* const invlist, const UV cp_in) {
2017   IV cp_out = _invlist_search(invlist, cp_in);
2018   assert(cp_out >= 0);
2019   return cp_out;
2020 }
2021 #  define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2022 	invmap[S_get_break_val_cp_checked(invlist, cp)]
2023 #else
2024 #  define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2025 	invmap[_invlist_search(invlist, cp)]
2026 #endif
2027 
2028 /* Takes a pointer to an inversion list, a pointer to its corresponding
2029  * inversion map, and a code point, and returns the code point's value
2030  * according to the two arrays.  It assumes that all code points have a value.
2031  * This is used as the base macro for macros for particular properties */
2032 #define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp)              \
2033 	_generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp)
2034 
2035 /* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead
2036  * of a code point, returning the value for the first code point in the string.
2037  * And it takes the particular macro name that finds the desired value given a
2038  * code point.  Merely convert the UTF-8 to code point and call the cp macro */
2039 #define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend)                     \
2040              (__ASSERT_(pos < strend)                                          \
2041                  /* Note assumes is valid UTF-8 */                             \
2042              (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL))))
2043 
2044 /* Returns the GCB value for the input code point */
2045 #define getGCB_VAL_CP(cp)                                                      \
2046           _generic_GET_BREAK_VAL_CP(                                           \
2047                                     PL_GCB_invlist,                            \
2048                                     _Perl_GCB_invmap,                          \
2049                                     (cp))
2050 
2051 /* Returns the GCB value for the first code point in the UTF-8 encoded string
2052  * bounded by pos and strend */
2053 #define getGCB_VAL_UTF8(pos, strend)                                           \
2054     _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
2055 
2056 /* Returns the LB value for the input code point */
2057 #define getLB_VAL_CP(cp)                                                       \
2058           _generic_GET_BREAK_VAL_CP(                                           \
2059                                     PL_LB_invlist,                             \
2060                                     _Perl_LB_invmap,                           \
2061                                     (cp))
2062 
2063 /* Returns the LB value for the first code point in the UTF-8 encoded string
2064  * bounded by pos and strend */
2065 #define getLB_VAL_UTF8(pos, strend)                                            \
2066     _generic_GET_BREAK_VAL_UTF8(getLB_VAL_CP, pos, strend)
2067 
2068 
2069 /* Returns the SB value for the input code point */
2070 #define getSB_VAL_CP(cp)                                                       \
2071           _generic_GET_BREAK_VAL_CP(                                           \
2072                                     PL_SB_invlist,                             \
2073                                     _Perl_SB_invmap,                     \
2074                                     (cp))
2075 
2076 /* Returns the SB value for the first code point in the UTF-8 encoded string
2077  * bounded by pos and strend */
2078 #define getSB_VAL_UTF8(pos, strend)                                            \
2079     _generic_GET_BREAK_VAL_UTF8(getSB_VAL_CP, pos, strend)
2080 
2081 /* Returns the WB value for the input code point */
2082 #define getWB_VAL_CP(cp)                                                       \
2083           _generic_GET_BREAK_VAL_CP(                                           \
2084                                     PL_WB_invlist,                             \
2085                                     _Perl_WB_invmap,                         \
2086                                     (cp))
2087 
2088 /* Returns the WB value for the first code point in the UTF-8 encoded string
2089  * bounded by pos and strend */
2090 #define getWB_VAL_UTF8(pos, strend)                                            \
2091     _generic_GET_BREAK_VAL_UTF8(getWB_VAL_CP, pos, strend)
2092 
2093 /* We know what class REx starts with.  Try to find this position... */
2094 /* if reginfo->intuit, its a dryrun */
2095 /* annoyingly all the vars in this routine have different names from their counterparts
2096    in regmatch. /grrr */
2097 STATIC char *
2098 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
2099     const char *strend, regmatch_info *reginfo)
2100 {
2101     dVAR;
2102 
2103     /* TRUE if x+ need not match at just the 1st pos of run of x's */
2104     const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
2105 
2106     char *pat_string;   /* The pattern's exactish string */
2107     char *pat_end;	    /* ptr to end char of pat_string */
2108     re_fold_t folder;	/* Function for computing non-utf8 folds */
2109     const U8 *fold_array;   /* array for folding ords < 256 */
2110     STRLEN ln;
2111     STRLEN lnc;
2112     U8 c1;
2113     U8 c2;
2114     char *e = NULL;
2115 
2116     /* In some cases we accept only the first occurence of 'x' in a sequence of
2117      * them.  This variable points to just beyond the end of the previous
2118      * occurrence of 'x', hence we can tell if we are in a sequence.  (Having
2119      * it point to beyond the 'x' allows us to work for UTF-8 without having to
2120      * hop back.) */
2121     char * previous_occurrence_end = 0;
2122 
2123     I32 tmp;            /* Scratch variable */
2124     const bool utf8_target = reginfo->is_utf8_target;
2125     UV utf8_fold_flags = 0;
2126     const bool is_utf8_pat = reginfo->is_utf8_pat;
2127     bool to_complement = FALSE; /* Invert the result?  Taking the xor of this
2128                                    with a result inverts that result, as 0^1 =
2129                                    1 and 1^1 = 0 */
2130     _char_class_number classnum;
2131 
2132     RXi_GET_DECL(prog,progi);
2133 
2134     PERL_ARGS_ASSERT_FIND_BYCLASS;
2135 
2136     /* We know what class it must start with. */
2137     switch (OP(c)) {
2138     case ANYOFPOSIXL:
2139     case ANYOFL:
2140         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2141 
2142         if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(c)) && ! IN_UTF8_CTYPE_LOCALE) {
2143             Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
2144         }
2145 
2146         /* FALLTHROUGH */
2147     case ANYOFD:
2148     case ANYOF:
2149         if (utf8_target) {
2150             REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */
2151                       reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
2152         }
2153         else if (ANYOF_FLAGS(c) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
2154             /* We know that s is in the bitmap range since the target isn't
2155              * UTF-8, so what happens for out-of-range values is not relevant,
2156              * so exclude that from the flags */
2157             REXEC_FBC_CLASS_SCAN(0, reginclass(prog,c, (U8*)s, (U8*)s+1, 0));
2158         }
2159         else {
2160             REXEC_FBC_CLASS_SCAN(0, ANYOF_BITMAP_TEST(c, *((U8*)s)));
2161         }
2162         break;
2163 
2164     case ANYOFM:    /* ARG() is the base byte; FLAGS() the mask byte */
2165         /* UTF-8ness doesn't matter because only matches UTF-8 invariants, so
2166          * use 0 */
2167         REXEC_FBC_FIND_NEXT_SCAN(0,
2168          (char *) find_next_masked((U8 *) s, (U8 *) strend,
2169                                    (U8) ARG(c), FLAGS(c)));
2170         break;
2171 
2172     case NANYOFM:   /* UTF-8ness does matter because can match UTF-8 variants.
2173                      */
2174         REXEC_FBC_FIND_NEXT_SCAN(utf8_target,
2175          (char *) find_span_end_mask((U8 *) s, (U8 *) strend,
2176                                    (U8) ARG(c), FLAGS(c)));
2177         break;
2178 
2179     case ANYOFH:
2180         if (utf8_target) {  /* Can't possibly match a non-UTF-8 target */
2181             REXEC_FBC_CLASS_SCAN(TRUE,
2182                   (   (U8) NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c)
2183                    && reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target)));
2184         }
2185         break;
2186 
2187     case ANYOFHb:
2188         if (utf8_target) {  /* Can't possibly match a non-UTF-8 target */
2189 
2190             /* We know what the first byte of any matched string should be */
2191             U8 first_byte = FLAGS(c);
2192 
2193             REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
2194                       reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
2195         }
2196         break;
2197 
2198     case ANYOFHr:
2199         if (utf8_target) {  /* Can't possibly match a non-UTF-8 target */
2200             REXEC_FBC_CLASS_SCAN(TRUE,
2201                   (   inRANGE(NATIVE_UTF8_TO_I8(*s),
2202                               LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)),
2203                               HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)))
2204                    && reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target)));
2205         }
2206         break;
2207 
2208     case ANYOFHs:
2209         if (utf8_target) {  /* Can't possibly match a non-UTF-8 target */
2210             REXEC_FBC_CLASS_SCAN(TRUE,
2211                   (   strend -s >= FLAGS(c)
2212                    && memEQ(s, ((struct regnode_anyofhs *) c)->string, FLAGS(c))
2213                    && reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target)));
2214         }
2215         break;
2216 
2217     case ANYOFR:
2218         if (utf8_target) {
2219             REXEC_FBC_CLASS_SCAN(TRUE,
2220                   (   NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c)
2221                    && withinCOUNT(utf8_to_uvchr_buf((U8 *) s,
2222                                                     (U8 *) strend,
2223                                                     NULL),
2224                                   ANYOFRbase(c), ANYOFRdelta(c))));
2225         }
2226         else {
2227             REXEC_FBC_CLASS_SCAN(0, withinCOUNT((U8) *s,
2228                                                ANYOFRbase(c), ANYOFRdelta(c)));
2229         }
2230         break;
2231 
2232     case ANYOFRb:
2233         if (utf8_target) {
2234 
2235             /* We know what the first byte of any matched string should be */
2236             U8 first_byte = FLAGS(c);
2237 
2238             REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
2239                       withinCOUNT(utf8_to_uvchr_buf((U8 *) s,
2240                                                     (U8 *) strend,
2241                                                     NULL),
2242                                   ANYOFRbase(c), ANYOFRdelta(c)));
2243         }
2244         else {
2245             REXEC_FBC_CLASS_SCAN(0, withinCOUNT((U8) *s,
2246                                                ANYOFRbase(c), ANYOFRdelta(c)));
2247         }
2248         break;
2249 
2250     case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8 patterns */
2251         assert(! is_utf8_pat);
2252 	/* FALLTHROUGH */
2253     case EXACTFAA:
2254         if (is_utf8_pat) {
2255             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII
2256                              |FOLDEQ_S2_ALREADY_FOLDED|FOLDEQ_S2_FOLDS_SANE;
2257             goto do_exactf_utf8;
2258         }
2259         else if (utf8_target) {
2260 
2261             /* Here, and elsewhere in this file, the reason we can't consider a
2262              * non-UTF-8 pattern already folded in the presence of a UTF-8
2263              * target is because any MICRO SIGN in the pattern won't be folded.
2264              * Since the fold of the MICRO SIGN requires UTF-8 to represent, we
2265              * can consider a non-UTF-8 pattern folded when matching a
2266              * non-UTF-8 target */
2267             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
2268             goto do_exactf_utf8;
2269         }
2270 
2271         /* Latin1 folds are not affected by /a, except it excludes the sharp s,
2272          * which these functions don't handle anyway */
2273         fold_array = PL_fold_latin1;
2274         folder = foldEQ_latin1_s2_folded;
2275         goto do_exactf_non_utf8;
2276 
2277     case EXACTF:   /* This node only generated for non-utf8 patterns */
2278         assert(! is_utf8_pat);
2279         if (utf8_target) {
2280             goto do_exactf_utf8;
2281         }
2282         fold_array = PL_fold;
2283         folder = foldEQ;
2284         goto do_exactf_non_utf8;
2285 
2286     case EXACTFL:
2287         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2288         if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) {
2289             utf8_fold_flags = FOLDEQ_LOCALE;
2290             goto do_exactf_utf8;
2291         }
2292         fold_array = PL_fold_locale;
2293         folder = foldEQ_locale;
2294         goto do_exactf_non_utf8;
2295 
2296     case EXACTFUP:      /* Problematic even though pattern isn't UTF-8.  Use
2297                            full functionality normally not done except for
2298                            UTF-8 */
2299         assert(! is_utf8_pat);
2300         goto do_exactf_utf8;
2301 
2302     case EXACTFLU8:
2303             if (! utf8_target) {    /* All code points in this node require
2304                                        UTF-8 to express.  */
2305                 break;
2306             }
2307             utf8_fold_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
2308                                              | FOLDEQ_S2_FOLDS_SANE;
2309             goto do_exactf_utf8;
2310 
2311     case EXACTFU_REQ8:
2312         if (! utf8_target) {
2313             break;
2314         }
2315         assert(is_utf8_pat);
2316         utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
2317         goto do_exactf_utf8;
2318 
2319     case EXACTFU:
2320         if (is_utf8_pat || utf8_target) {
2321             utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
2322             goto do_exactf_utf8;
2323         }
2324 
2325         /* Any 'ss' in the pattern should have been replaced by regcomp,
2326          * so we don't have to worry here about this single special case
2327          * in the Latin1 range */
2328         fold_array = PL_fold_latin1;
2329         folder = foldEQ_latin1_s2_folded;
2330 
2331         /* FALLTHROUGH */
2332 
2333       do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
2334                            are no glitches with fold-length differences
2335                            between the target string and pattern */
2336 
2337         /* The idea in the non-utf8 EXACTF* cases is to first find the
2338          * first character of the EXACTF* node and then, if necessary,
2339          * case-insensitively compare the full text of the node.  c1 is the
2340          * first character.  c2 is its fold.  This logic will not work for
2341          * Unicode semantics and the german sharp ss, which hence should
2342          * not be compiled into a node that gets here. */
2343         pat_string = STRINGs(c);
2344         ln  = STR_LENs(c);	/* length to match in octets/bytes */
2345 
2346         /* We know that we have to match at least 'ln' bytes (which is the
2347          * same as characters, since not utf8).  If we have to match 3
2348          * characters, and there are only 2 availabe, we know without
2349          * trying that it will fail; so don't start a match past the
2350          * required minimum number from the far end */
2351         e = HOP3c(strend, -((SSize_t)ln), s);
2352         if (e < s)
2353             break;
2354 
2355         c1 = *pat_string;
2356         c2 = fold_array[c1];
2357         if (c1 == c2) { /* If char and fold are the same */
2358             while (s <= e) {
2359                 s = (char *) memchr(s, c1, e + 1 - s);
2360                 if (s == NULL) {
2361                     break;
2362                 }
2363 
2364                 /* Check that the rest of the node matches */
2365                 if (   (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2366                     && (reginfo->intuit || regtry(reginfo, &s)) )
2367                 {
2368                     goto got_it;
2369                 }
2370                 s++;
2371             }
2372         }
2373         else {
2374             U8 bits_differing = c1 ^ c2;
2375 
2376             /* If the folds differ in one bit position only, we can mask to
2377              * match either of them, and can use this faster find method.  Both
2378              * ASCII and EBCDIC tend to have their case folds differ in only
2379              * one position, so this is very likely */
2380             if (LIKELY(PL_bitcount[bits_differing] == 1)) {
2381                 bits_differing = ~ bits_differing;
2382                 while (s <= e) {
2383                     s = (char *) find_next_masked((U8 *) s, (U8 *) e + 1,
2384                                         (c1 & bits_differing), bits_differing);
2385                     if (s > e) {
2386                         break;
2387                     }
2388 
2389                     if (   (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2390                         && (reginfo->intuit || regtry(reginfo, &s)) )
2391                     {
2392                         goto got_it;
2393                     }
2394                     s++;
2395                 }
2396             }
2397             else {  /* Otherwise, stuck with looking byte-at-a-time.  This
2398                        should actually happen only in EXACTFL nodes */
2399                 while (s <= e) {
2400                     if (    (*(U8*)s == c1 || *(U8*)s == c2)
2401                         && (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2402                         && (reginfo->intuit || regtry(reginfo, &s)) )
2403                     {
2404                         goto got_it;
2405                     }
2406                     s++;
2407                 }
2408             }
2409         }
2410         break;
2411 
2412       do_exactf_utf8:
2413       {
2414         unsigned expansion;
2415 
2416         /* If one of the operands is in utf8, we can't use the simpler folding
2417          * above, due to the fact that many different characters can have the
2418          * same fold, or portion of a fold, or different- length fold */
2419         pat_string = STRINGs(c);
2420         ln  = STR_LENs(c);	/* length to match in octets/bytes */
2421         pat_end = pat_string + ln;
2422         lnc = is_utf8_pat       /* length to match in characters */
2423                 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
2424                 : ln;
2425 
2426         /* We have 'lnc' characters to match in the pattern, but because of
2427          * multi-character folding, each character in the target can match
2428          * up to 3 characters (Unicode guarantees it will never exceed
2429          * this) if it is utf8-encoded; and up to 2 if not (based on the
2430          * fact that the Latin 1 folds are already determined, and the
2431          * only multi-char fold in that range is the sharp-s folding to
2432          * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
2433          * string character.  Adjust lnc accordingly, rounding up, so that
2434          * if we need to match at least 4+1/3 chars, that really is 5. */
2435         expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
2436         lnc = (lnc + expansion - 1) / expansion;
2437 
2438         /* As in the non-UTF8 case, if we have to match 3 characters, and
2439          * only 2 are left, it's guaranteed to fail, so don't start a
2440          * match that would require us to go beyond the end of the string
2441          */
2442         e = HOP3c(strend, -((SSize_t)lnc), s);
2443 
2444         /* XXX Note that we could recalculate e to stop the loop earlier,
2445          * as the worst case expansion above will rarely be met, and as we
2446          * go along we would usually find that e moves further to the left.
2447          * This would happen only after we reached the point in the loop
2448          * where if there were no expansion we should fail.  Unclear if
2449          * worth the expense */
2450 
2451         while (s <= e) {
2452             char *my_strend= (char *)strend;
2453             if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
2454                   pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
2455                 && (reginfo->intuit || regtry(reginfo, &s)) )
2456             {
2457                 goto got_it;
2458             }
2459             s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
2460         }
2461         break;
2462     }
2463 
2464     case BOUNDL:
2465         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2466         if (FLAGS(c) != TRADITIONAL_BOUND) {
2467             if (! IN_UTF8_CTYPE_LOCALE) {
2468                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2469                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
2470             }
2471             goto do_boundu;
2472         }
2473 
2474         FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8_safe);
2475         break;
2476 
2477     case NBOUNDL:
2478         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2479         if (FLAGS(c) != TRADITIONAL_BOUND) {
2480             if (! IN_UTF8_CTYPE_LOCALE) {
2481                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2482                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
2483             }
2484             goto do_nboundu;
2485         }
2486 
2487         FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8_safe);
2488         break;
2489 
2490     case BOUND: /* regcomp.c makes sure that this only has the traditional \b
2491                    meaning */
2492         assert(FLAGS(c) == TRADITIONAL_BOUND);
2493 
2494         FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2495         break;
2496 
2497     case BOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2498                    meaning */
2499         assert(FLAGS(c) == TRADITIONAL_BOUND);
2500 
2501         FBC_BOUND_A(isWORDCHAR_A);
2502         break;
2503 
2504     case NBOUND: /* regcomp.c makes sure that this only has the traditional \b
2505                    meaning */
2506         assert(FLAGS(c) == TRADITIONAL_BOUND);
2507 
2508         FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2509         break;
2510 
2511     case NBOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2512                    meaning */
2513         assert(FLAGS(c) == TRADITIONAL_BOUND);
2514 
2515         FBC_NBOUND_A(isWORDCHAR_A);
2516         break;
2517 
2518     case NBOUNDU:
2519         if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2520             FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2521             break;
2522         }
2523 
2524       do_nboundu:
2525 
2526         to_complement = 1;
2527         /* FALLTHROUGH */
2528 
2529     case BOUNDU:
2530       do_boundu:
2531         switch((bound_type) FLAGS(c)) {
2532             case TRADITIONAL_BOUND:
2533                 FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2534                 break;
2535             case GCB_BOUND:
2536                 if (s == reginfo->strbeg) {
2537                     if (reginfo->intuit || regtry(reginfo, &s))
2538                     {
2539                         goto got_it;
2540                     }
2541 
2542                     /* Didn't match.  Try at the next position (if there is one) */
2543                     s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
2544                     if (UNLIKELY(s >= reginfo->strend)) {
2545                         break;
2546                     }
2547                 }
2548 
2549                 if (utf8_target) {
2550                     GCB_enum before = getGCB_VAL_UTF8(
2551                                                reghop3((U8*)s, -1,
2552                                                        (U8*)(reginfo->strbeg)),
2553                                                (U8*) reginfo->strend);
2554                     while (s < strend) {
2555                         GCB_enum after = getGCB_VAL_UTF8((U8*) s,
2556                                                         (U8*) reginfo->strend);
2557                         if (   (to_complement ^ isGCB(before,
2558                                                       after,
2559                                                       (U8*) reginfo->strbeg,
2560                                                       (U8*) s,
2561                                                       utf8_target))
2562                             && (reginfo->intuit || regtry(reginfo, &s)))
2563                         {
2564                             goto got_it;
2565                         }
2566                         before = after;
2567                         s += UTF8_SAFE_SKIP(s, reginfo->strend);
2568                     }
2569                 }
2570                 else {  /* Not utf8.  Everything is a GCB except between CR and
2571                            LF */
2572                     while (s < strend) {
2573                         if ((to_complement ^ (   UCHARAT(s - 1) != '\r'
2574                                               || UCHARAT(s) != '\n'))
2575                             && (reginfo->intuit || regtry(reginfo, &s)))
2576                         {
2577                             goto got_it;
2578                         }
2579                         s++;
2580                     }
2581                 }
2582 
2583                 /* And, since this is a bound, it can match after the final
2584                  * character in the string */
2585                 if (   reginfo->intuit
2586                     || (s <= reginfo->strend && regtry(reginfo, &s)))
2587                 {
2588                     goto got_it;
2589                 }
2590                 break;
2591 
2592             case LB_BOUND:
2593                 if (s == reginfo->strbeg) {
2594                     if (reginfo->intuit || regtry(reginfo, &s)) {
2595                         goto got_it;
2596                     }
2597                     s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
2598                     if (UNLIKELY(s >= reginfo->strend)) {
2599                         break;
2600                     }
2601                 }
2602 
2603                 if (utf8_target) {
2604                     LB_enum before = getLB_VAL_UTF8(reghop3((U8*)s,
2605                                                                -1,
2606                                                                (U8*)(reginfo->strbeg)),
2607                                                        (U8*) reginfo->strend);
2608                     while (s < strend) {
2609                         LB_enum after = getLB_VAL_UTF8((U8*) s, (U8*) reginfo->strend);
2610                         if (to_complement ^ isLB(before,
2611                                                  after,
2612                                                  (U8*) reginfo->strbeg,
2613                                                  (U8*) s,
2614                                                  (U8*) reginfo->strend,
2615                                                  utf8_target)
2616                             && (reginfo->intuit || regtry(reginfo, &s)))
2617                         {
2618                             goto got_it;
2619                         }
2620                         before = after;
2621                         s += UTF8_SAFE_SKIP(s, reginfo->strend);
2622                     }
2623                 }
2624                 else {  /* Not utf8. */
2625                     LB_enum before = getLB_VAL_CP((U8) *(s -1));
2626                     while (s < strend) {
2627                         LB_enum after = getLB_VAL_CP((U8) *s);
2628                         if (to_complement ^ isLB(before,
2629                                                  after,
2630                                                  (U8*) reginfo->strbeg,
2631                                                  (U8*) s,
2632                                                  (U8*) reginfo->strend,
2633                                                  utf8_target)
2634                             && (reginfo->intuit || regtry(reginfo, &s)))
2635                         {
2636                             goto got_it;
2637                         }
2638                         before = after;
2639                         s++;
2640                     }
2641                 }
2642 
2643                 if (   reginfo->intuit
2644                     || (s <= reginfo->strend && regtry(reginfo, &s)))
2645                 {
2646                     goto got_it;
2647                 }
2648 
2649                 break;
2650 
2651             case SB_BOUND:
2652                 if (s == reginfo->strbeg) {
2653                     if (reginfo->intuit || regtry(reginfo, &s)) {
2654                         goto got_it;
2655                     }
2656                     s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
2657                     if (UNLIKELY(s >= reginfo->strend)) {
2658                         break;
2659                     }
2660                 }
2661 
2662                 if (utf8_target) {
2663                     SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s,
2664                                                         -1,
2665                                                         (U8*)(reginfo->strbeg)),
2666                                                       (U8*) reginfo->strend);
2667                     while (s < strend) {
2668                         SB_enum after = getSB_VAL_UTF8((U8*) s,
2669                                                          (U8*) reginfo->strend);
2670                         if ((to_complement ^ isSB(before,
2671                                                   after,
2672                                                   (U8*) reginfo->strbeg,
2673                                                   (U8*) s,
2674                                                   (U8*) reginfo->strend,
2675                                                   utf8_target))
2676                             && (reginfo->intuit || regtry(reginfo, &s)))
2677                         {
2678                             goto got_it;
2679                         }
2680                         before = after;
2681                         s += UTF8_SAFE_SKIP(s, reginfo->strend);
2682                     }
2683                 }
2684                 else {  /* Not utf8. */
2685                     SB_enum before = getSB_VAL_CP((U8) *(s -1));
2686                     while (s < strend) {
2687                         SB_enum after = getSB_VAL_CP((U8) *s);
2688                         if ((to_complement ^ isSB(before,
2689                                                   after,
2690                                                   (U8*) reginfo->strbeg,
2691                                                   (U8*) s,
2692                                                   (U8*) reginfo->strend,
2693                                                   utf8_target))
2694                             && (reginfo->intuit || regtry(reginfo, &s)))
2695                         {
2696                             goto got_it;
2697                         }
2698                         before = after;
2699                         s++;
2700                     }
2701                 }
2702 
2703                 /* Here are at the final position in the target string.  The SB
2704                  * value is always true here, so matches, depending on other
2705                  * constraints */
2706                 if (   reginfo->intuit
2707                     || (s <= reginfo->strend && regtry(reginfo, &s)))
2708                 {
2709                     goto got_it;
2710                 }
2711 
2712                 break;
2713 
2714             case WB_BOUND:
2715                 if (s == reginfo->strbeg) {
2716                     if (reginfo->intuit || regtry(reginfo, &s)) {
2717                         goto got_it;
2718                     }
2719                     s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
2720                     if (UNLIKELY(s >= reginfo->strend)) {
2721                         break;
2722                     }
2723                 }
2724 
2725                 if (utf8_target) {
2726                     /* We are at a boundary between char_sub_0 and char_sub_1.
2727                      * We also keep track of the value for char_sub_-1 as we
2728                      * loop through the line.   Context may be needed to make a
2729                      * determination, and if so, this can save having to
2730                      * recalculate it */
2731                     WB_enum previous = WB_UNKNOWN;
2732                     WB_enum before = getWB_VAL_UTF8(
2733                                               reghop3((U8*)s,
2734                                                       -1,
2735                                                       (U8*)(reginfo->strbeg)),
2736                                               (U8*) reginfo->strend);
2737                     while (s < strend) {
2738                         WB_enum after = getWB_VAL_UTF8((U8*) s,
2739                                                         (U8*) reginfo->strend);
2740                         if ((to_complement ^ isWB(previous,
2741                                                   before,
2742                                                   after,
2743                                                   (U8*) reginfo->strbeg,
2744                                                   (U8*) s,
2745                                                   (U8*) reginfo->strend,
2746                                                   utf8_target))
2747                             && (reginfo->intuit || regtry(reginfo, &s)))
2748                         {
2749                             goto got_it;
2750                         }
2751                         previous = before;
2752                         before = after;
2753                         s += UTF8_SAFE_SKIP(s, reginfo->strend);
2754                     }
2755                 }
2756                 else {  /* Not utf8. */
2757                     WB_enum previous = WB_UNKNOWN;
2758                     WB_enum before = getWB_VAL_CP((U8) *(s -1));
2759                     while (s < strend) {
2760                         WB_enum after = getWB_VAL_CP((U8) *s);
2761                         if ((to_complement ^ isWB(previous,
2762                                                   before,
2763                                                   after,
2764                                                   (U8*) reginfo->strbeg,
2765                                                   (U8*) s,
2766                                                   (U8*) reginfo->strend,
2767                                                   utf8_target))
2768                             && (reginfo->intuit || regtry(reginfo, &s)))
2769                         {
2770                             goto got_it;
2771                         }
2772                         previous = before;
2773                         before = after;
2774                         s++;
2775                     }
2776                 }
2777 
2778                 if (   reginfo->intuit
2779                     || (s <= reginfo->strend && regtry(reginfo, &s)))
2780                 {
2781                     goto got_it;
2782                 }
2783         }
2784         break;
2785 
2786     case LNBREAK:
2787         REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
2788                         is_LNBREAK_latin1_safe(s, strend)
2789         );
2790         break;
2791 
2792     /* The argument to all the POSIX node types is the class number to pass to
2793      * _generic_isCC() to build a mask for searching in PL_charclass[] */
2794 
2795     case NPOSIXL:
2796         to_complement = 1;
2797         /* FALLTHROUGH */
2798 
2799     case POSIXL:
2800         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2801         REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s, (U8 *) strend)),
2802                         to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
2803         break;
2804 
2805     case NPOSIXD:
2806         to_complement = 1;
2807         /* FALLTHROUGH */
2808 
2809     case POSIXD:
2810         if (utf8_target) {
2811             goto posix_utf8;
2812         }
2813         goto posixa;
2814 
2815     case NPOSIXA:
2816         if (utf8_target) {
2817             /* The complement of something that matches only ASCII matches all
2818              * non-ASCII, plus everything in ASCII that isn't in the class. */
2819             REXEC_FBC_CLASS_SCAN(1,   ! isASCII_utf8_safe(s, strend)
2820                                    || ! _generic_isCC_A(*s, FLAGS(c)));
2821             break;
2822         }
2823 
2824         to_complement = 1;
2825         goto posixa;
2826 
2827     case POSIXA:
2828         /* Don't need to worry about utf8, as it can match only a single
2829          * byte invariant character.  But we do anyway for performance reasons,
2830          * as otherwise we would have to examine all the continuation
2831          * characters */
2832         if (utf8_target) {
2833             REXEC_FBC_CLASS_SCAN(1, _generic_isCC_A(*s, FLAGS(c)));
2834             break;
2835         }
2836 
2837       posixa:
2838         REXEC_FBC_CLASS_SCAN(0, /* 0=>not-utf8 */
2839                         to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
2840         break;
2841 
2842     case NPOSIXU:
2843         to_complement = 1;
2844         /* FALLTHROUGH */
2845 
2846     case POSIXU:
2847         if (! utf8_target) {
2848             REXEC_FBC_CLASS_SCAN(0, /* 0=>not-utf8 */
2849                                  to_complement ^ cBOOL(_generic_isCC(*s,
2850                                                                     FLAGS(c))));
2851         }
2852         else {
2853 
2854           posix_utf8:
2855             classnum = (_char_class_number) FLAGS(c);
2856             switch (classnum) {
2857                 default:
2858                     REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */
2859                         to_complement ^ cBOOL(_invlist_contains_cp(
2860                                               PL_XPosix_ptrs[classnum],
2861                                               utf8_to_uvchr_buf((U8 *) s,
2862                                                                 (U8 *) strend,
2863                                                                 NULL))));
2864                     break;
2865                 case _CC_ENUM_SPACE:
2866                     REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */
2867                         to_complement ^ cBOOL(isSPACE_utf8_safe(s, strend)));
2868                     break;
2869 
2870                 case _CC_ENUM_BLANK:
2871                     REXEC_FBC_CLASS_SCAN(1,
2872                         to_complement ^ cBOOL(isBLANK_utf8_safe(s, strend)));
2873                     break;
2874 
2875                 case _CC_ENUM_XDIGIT:
2876                     REXEC_FBC_CLASS_SCAN(1,
2877                        to_complement ^ cBOOL(isXDIGIT_utf8_safe(s, strend)));
2878                     break;
2879 
2880                 case _CC_ENUM_VERTSPACE:
2881                     REXEC_FBC_CLASS_SCAN(1,
2882                        to_complement ^ cBOOL(isVERTWS_utf8_safe(s, strend)));
2883                     break;
2884 
2885                 case _CC_ENUM_CNTRL:
2886                     REXEC_FBC_CLASS_SCAN(1,
2887                         to_complement ^ cBOOL(isCNTRL_utf8_safe(s, strend)));
2888                     break;
2889             }
2890         }
2891         break;
2892 
2893     case AHOCORASICKC:
2894     case AHOCORASICK:
2895         {
2896             DECL_TRIE_TYPE(c);
2897             /* what trie are we using right now */
2898             reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
2899             reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
2900             HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
2901 
2902             const char *last_start = strend - trie->minlen;
2903 #ifdef DEBUGGING
2904             const char *real_start = s;
2905 #endif
2906             STRLEN maxlen = trie->maxlen;
2907             SV *sv_points;
2908             U8 **points; /* map of where we were in the input string
2909                             when reading a given char. For ASCII this
2910                             is unnecessary overhead as the relationship
2911                             is always 1:1, but for Unicode, especially
2912                             case folded Unicode this is not true. */
2913             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2914             U8 *bitmap=NULL;
2915 
2916 
2917             DECLARE_AND_GET_RE_DEBUG_FLAGS;
2918 
2919             /* We can't just allocate points here. We need to wrap it in
2920              * an SV so it gets freed properly if there is a croak while
2921              * running the match */
2922             ENTER;
2923             SAVETMPS;
2924             sv_points=newSV(maxlen * sizeof(U8 *));
2925             SvCUR_set(sv_points,
2926                 maxlen * sizeof(U8 *));
2927             SvPOK_on(sv_points);
2928             sv_2mortal(sv_points);
2929             points=(U8**)SvPV_nolen(sv_points );
2930             if ( trie_type != trie_utf8_fold
2931                  && (trie->bitmap || OP(c)==AHOCORASICKC) )
2932             {
2933                 if (trie->bitmap)
2934                     bitmap=(U8*)trie->bitmap;
2935                 else
2936                     bitmap=(U8*)ANYOF_BITMAP(c);
2937             }
2938             /* this is the Aho-Corasick algorithm modified a touch
2939                to include special handling for long "unknown char" sequences.
2940                The basic idea being that we use AC as long as we are dealing
2941                with a possible matching char, when we encounter an unknown char
2942                (and we have not encountered an accepting state) we scan forward
2943                until we find a legal starting char.
2944                AC matching is basically that of trie matching, except that when
2945                we encounter a failing transition, we fall back to the current
2946                states "fail state", and try the current char again, a process
2947                we repeat until we reach the root state, state 1, or a legal
2948                transition. If we fail on the root state then we can either
2949                terminate if we have reached an accepting state previously, or
2950                restart the entire process from the beginning if we have not.
2951 
2952              */
2953             while (s <= last_start) {
2954                 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2955                 U8 *uc = (U8*)s;
2956                 U16 charid = 0;
2957                 U32 base = 1;
2958                 U32 state = 1;
2959                 UV uvc = 0;
2960                 STRLEN len = 0;
2961                 STRLEN foldlen = 0;
2962                 U8 *uscan = (U8*)NULL;
2963                 U8 *leftmost = NULL;
2964 #ifdef DEBUGGING
2965                 U32 accepted_word= 0;
2966 #endif
2967                 U32 pointpos = 0;
2968 
2969                 while ( state && uc <= (U8*)strend ) {
2970                     int failed=0;
2971                     U32 word = aho->states[ state ].wordnum;
2972 
2973                     if( state==1 ) {
2974                         if ( bitmap ) {
2975                             DEBUG_TRIE_EXECUTE_r(
2976                                 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2977                                     dump_exec_pos( (char *)uc, c, strend, real_start,
2978                                         (char *)uc, utf8_target, 0 );
2979                                     Perl_re_printf( aTHX_
2980                                         " Scanning for legal start char...\n");
2981                                 }
2982                             );
2983                             if (utf8_target) {
2984                                 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2985                                     uc += UTF8SKIP(uc);
2986                                 }
2987                             } else {
2988                                 while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
2989                                     uc++;
2990                                 }
2991                             }
2992                             s= (char *)uc;
2993                         }
2994                         if (uc >(U8*)last_start) break;
2995                     }
2996 
2997                     if ( word ) {
2998                         U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
2999                         if (!leftmost || lpos < leftmost) {
3000                             DEBUG_r(accepted_word=word);
3001                             leftmost= lpos;
3002                         }
3003                         if (base==0) break;
3004 
3005                     }
3006                     points[pointpos++ % maxlen]= uc;
3007                     if (foldlen || uc < (U8*)strend) {
3008                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3009                                              (U8 *) strend, uscan, len, uvc,
3010                                              charid, foldlen, foldbuf,
3011                                              uniflags);
3012                         DEBUG_TRIE_EXECUTE_r({
3013                             dump_exec_pos( (char *)uc, c, strend,
3014                                         real_start, s, utf8_target, 0);
3015                             Perl_re_printf( aTHX_
3016                                 " Charid:%3u CP:%4" UVxf " ",
3017                                  charid, uvc);
3018                         });
3019                     }
3020                     else {
3021                         len = 0;
3022                         charid = 0;
3023                     }
3024 
3025 
3026                     do {
3027 #ifdef DEBUGGING
3028                         word = aho->states[ state ].wordnum;
3029 #endif
3030                         base = aho->states[ state ].trans.base;
3031 
3032                         DEBUG_TRIE_EXECUTE_r({
3033                             if (failed)
3034                                 dump_exec_pos( (char *)uc, c, strend, real_start,
3035                                     s,   utf8_target, 0 );
3036                             Perl_re_printf( aTHX_
3037                                 "%sState: %4" UVxf ", word=%" UVxf,
3038                                 failed ? " Fail transition to " : "",
3039                                 (UV)state, (UV)word);
3040                         });
3041                         if ( base ) {
3042                             U32 tmp;
3043                             I32 offset;
3044                             if (charid &&
3045                                  ( ((offset = base + charid
3046                                     - 1 - trie->uniquecharcount)) >= 0)
3047                                  && ((U32)offset < trie->lasttrans)
3048                                  && trie->trans[offset].check == state
3049                                  && (tmp=trie->trans[offset].next))
3050                             {
3051                                 DEBUG_TRIE_EXECUTE_r(
3052                                     Perl_re_printf( aTHX_ " - legal\n"));
3053                                 state = tmp;
3054                                 break;
3055                             }
3056                             else {
3057                                 DEBUG_TRIE_EXECUTE_r(
3058                                     Perl_re_printf( aTHX_ " - fail\n"));
3059                                 failed = 1;
3060                                 state = aho->fail[state];
3061                             }
3062                         }
3063                         else {
3064                             /* we must be accepting here */
3065                             DEBUG_TRIE_EXECUTE_r(
3066                                     Perl_re_printf( aTHX_ " - accepting\n"));
3067                             failed = 1;
3068                             break;
3069                         }
3070                     } while(state);
3071                     uc += len;
3072                     if (failed) {
3073                         if (leftmost)
3074                             break;
3075                         if (!state) state = 1;
3076                     }
3077                 }
3078                 if ( aho->states[ state ].wordnum ) {
3079                     U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
3080                     if (!leftmost || lpos < leftmost) {
3081                         DEBUG_r(accepted_word=aho->states[ state ].wordnum);
3082                         leftmost = lpos;
3083                     }
3084                 }
3085                 if (leftmost) {
3086                     s = (char*)leftmost;
3087                     DEBUG_TRIE_EXECUTE_r({
3088                         Perl_re_printf( aTHX_  "Matches word #%" UVxf " at position %" IVdf ". Trying full pattern...\n",
3089                             (UV)accepted_word, (IV)(s - real_start)
3090                         );
3091                     });
3092                     if (reginfo->intuit || regtry(reginfo, &s)) {
3093                         FREETMPS;
3094                         LEAVE;
3095                         goto got_it;
3096                     }
3097                     if (s < reginfo->strend) {
3098                         s = HOPc(s,1);
3099                     }
3100                     DEBUG_TRIE_EXECUTE_r({
3101                         Perl_re_printf( aTHX_ "Pattern failed. Looking for new start point...\n");
3102                     });
3103                 } else {
3104                     DEBUG_TRIE_EXECUTE_r(
3105                         Perl_re_printf( aTHX_ "No match.\n"));
3106                     break;
3107                 }
3108             }
3109             FREETMPS;
3110             LEAVE;
3111         }
3112         break;
3113     default:
3114         Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
3115     }
3116     return 0;
3117   got_it:
3118     return s;
3119 }
3120 
3121 /* set RX_SAVED_COPY, RX_SUBBEG etc.
3122  * flags have same meanings as with regexec_flags() */
3123 
3124 static void
3125 S_reg_set_capture_string(pTHX_ REGEXP * const rx,
3126                             char *strbeg,
3127                             char *strend,
3128                             SV *sv,
3129                             U32 flags,
3130                             bool utf8_target)
3131 {
3132     struct regexp *const prog = ReANY(rx);
3133 
3134     if (flags & REXEC_COPY_STR) {
3135 #ifdef PERL_ANY_COW
3136         if (SvCANCOW(sv)) {
3137             DEBUG_C(Perl_re_printf( aTHX_
3138                               "Copy on write: regexp capture, type %d\n",
3139                                     (int) SvTYPE(sv)));
3140             /* Create a new COW SV to share the match string and store
3141              * in saved_copy, unless the current COW SV in saved_copy
3142              * is valid and suitable for our purpose */
3143             if ((   prog->saved_copy
3144                  && SvIsCOW(prog->saved_copy)
3145                  && SvPOKp(prog->saved_copy)
3146                  && SvIsCOW(sv)
3147                  && SvPOKp(sv)
3148                  && SvPVX(sv) == SvPVX(prog->saved_copy)))
3149             {
3150                 /* just reuse saved_copy SV */
3151                 if (RXp_MATCH_COPIED(prog)) {
3152                     Safefree(prog->subbeg);
3153                     RXp_MATCH_COPIED_off(prog);
3154                 }
3155             }
3156             else {
3157                 /* create new COW SV to share string */
3158                 RXp_MATCH_COPY_FREE(prog);
3159                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
3160             }
3161             prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
3162             assert (SvPOKp(prog->saved_copy));
3163             prog->sublen  = strend - strbeg;
3164             prog->suboffset = 0;
3165             prog->subcoffset = 0;
3166         } else
3167 #endif
3168         {
3169             SSize_t min = 0;
3170             SSize_t max = strend - strbeg;
3171             SSize_t sublen;
3172 
3173             if (    (flags & REXEC_COPY_SKIP_POST)
3174                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
3175                 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
3176             ) { /* don't copy $' part of string */
3177                 U32 n = 0;
3178                 max = -1;
3179                 /* calculate the right-most part of the string covered
3180                  * by a capture. Due to lookahead, this may be to
3181                  * the right of $&, so we have to scan all captures */
3182                 while (n <= prog->lastparen) {
3183                     if (prog->offs[n].end > max)
3184                         max = prog->offs[n].end;
3185                     n++;
3186                 }
3187                 if (max == -1)
3188                     max = (PL_sawampersand & SAWAMPERSAND_LEFT)
3189                             ? prog->offs[0].start
3190                             : 0;
3191                 assert(max >= 0 && max <= strend - strbeg);
3192             }
3193 
3194             if (    (flags & REXEC_COPY_SKIP_PRE)
3195                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
3196                 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
3197             ) { /* don't copy $` part of string */
3198                 U32 n = 0;
3199                 min = max;
3200                 /* calculate the left-most part of the string covered
3201                  * by a capture. Due to lookbehind, this may be to
3202                  * the left of $&, so we have to scan all captures */
3203                 while (min && n <= prog->lastparen) {
3204                     if (   prog->offs[n].start != -1
3205                         && prog->offs[n].start < min)
3206                     {
3207                         min = prog->offs[n].start;
3208                     }
3209                     n++;
3210                 }
3211                 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
3212                     && min >  prog->offs[0].end
3213                 )
3214                     min = prog->offs[0].end;
3215 
3216             }
3217 
3218             assert(min >= 0 && min <= max && min <= strend - strbeg);
3219             sublen = max - min;
3220 
3221             if (RXp_MATCH_COPIED(prog)) {
3222                 if (sublen > prog->sublen)
3223                     prog->subbeg =
3224                             (char*)saferealloc(prog->subbeg, sublen+1);
3225             }
3226             else
3227                 prog->subbeg = (char*)safemalloc(sublen+1);
3228             Copy(strbeg + min, prog->subbeg, sublen, char);
3229             prog->subbeg[sublen] = '\0';
3230             prog->suboffset = min;
3231             prog->sublen = sublen;
3232             RXp_MATCH_COPIED_on(prog);
3233         }
3234         prog->subcoffset = prog->suboffset;
3235         if (prog->suboffset && utf8_target) {
3236             /* Convert byte offset to chars.
3237              * XXX ideally should only compute this if @-/@+
3238              * has been seen, a la PL_sawampersand ??? */
3239 
3240             /* If there's a direct correspondence between the
3241              * string which we're matching and the original SV,
3242              * then we can use the utf8 len cache associated with
3243              * the SV. In particular, it means that under //g,
3244              * sv_pos_b2u() will use the previously cached
3245              * position to speed up working out the new length of
3246              * subcoffset, rather than counting from the start of
3247              * the string each time. This stops
3248              *   $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
3249              * from going quadratic */
3250             if (SvPOKp(sv) && SvPVX(sv) == strbeg)
3251                 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
3252                                                 SV_GMAGIC|SV_CONST_RETURN);
3253             else
3254                 prog->subcoffset = utf8_length((U8*)strbeg,
3255                                     (U8*)(strbeg+prog->suboffset));
3256         }
3257     }
3258     else {
3259         RXp_MATCH_COPY_FREE(prog);
3260         prog->subbeg = strbeg;
3261         prog->suboffset = 0;
3262         prog->subcoffset = 0;
3263         prog->sublen = strend - strbeg;
3264     }
3265 }
3266 
3267 
3268 
3269 
3270 /*
3271  - regexec_flags - match a regexp against a string
3272  */
3273 I32
3274 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
3275 	      char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
3276 /* stringarg: the point in the string at which to begin matching */
3277 /* strend:    pointer to null at end of string */
3278 /* strbeg:    real beginning of string */
3279 /* minend:    end of match must be >= minend bytes after stringarg. */
3280 /* sv:        SV being matched: only used for utf8 flag, pos() etc; string
3281  *            itself is accessed via the pointers above */
3282 /* data:      May be used for some additional optimizations.
3283               Currently unused. */
3284 /* flags:     For optimizations. See REXEC_* in regexp.h */
3285 
3286 {
3287     struct regexp *const prog = ReANY(rx);
3288     char *s;
3289     regnode *c;
3290     char *startpos;
3291     SSize_t minlen;		/* must match at least this many chars */
3292     SSize_t dontbother = 0;	/* how many characters not to try at end */
3293     const bool utf8_target = cBOOL(DO_UTF8(sv));
3294     I32 multiline;
3295     RXi_GET_DECL(prog,progi);
3296     regmatch_info reginfo_buf;  /* create some info to pass to regtry etc */
3297     regmatch_info *const reginfo = &reginfo_buf;
3298     regexp_paren_pair *swap = NULL;
3299     I32 oldsave;
3300     DECLARE_AND_GET_RE_DEBUG_FLAGS;
3301 
3302     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
3303     PERL_UNUSED_ARG(data);
3304 
3305     /* Be paranoid... */
3306     if (prog == NULL) {
3307 	Perl_croak(aTHX_ "NULL regexp parameter");
3308     }
3309 
3310     DEBUG_EXECUTE_r(
3311         debug_start_match(rx, utf8_target, stringarg, strend,
3312         "Matching");
3313     );
3314 
3315     startpos = stringarg;
3316 
3317     /* set these early as they may be used by the HOP macros below */
3318     reginfo->strbeg = strbeg;
3319     reginfo->strend = strend;
3320     reginfo->is_utf8_target = cBOOL(utf8_target);
3321 
3322     if (prog->intflags & PREGf_GPOS_SEEN) {
3323         MAGIC *mg;
3324 
3325         /* set reginfo->ganch, the position where \G can match */
3326 
3327         reginfo->ganch =
3328             (flags & REXEC_IGNOREPOS)
3329             ? stringarg /* use start pos rather than pos() */
3330             : ((mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
3331               /* Defined pos(): */
3332             ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
3333             : strbeg; /* pos() not defined; use start of string */
3334 
3335         DEBUG_GPOS_r(Perl_re_printf( aTHX_
3336             "GPOS ganch set to strbeg[%" IVdf "]\n", (IV)(reginfo->ganch - strbeg)));
3337 
3338         /* in the presence of \G, we may need to start looking earlier in
3339          * the string than the suggested start point of stringarg:
3340          * if prog->gofs is set, then that's a known, fixed minimum
3341          * offset, such as
3342          * /..\G/:   gofs = 2
3343          * /ab|c\G/: gofs = 1
3344          * or if the minimum offset isn't known, then we have to go back
3345          * to the start of the string, e.g. /w+\G/
3346          */
3347 
3348         if (prog->intflags & PREGf_ANCH_GPOS) {
3349             if (prog->gofs) {
3350                 startpos = HOPBACKc(reginfo->ganch, prog->gofs);
3351                 if (!startpos ||
3352                     ((flags & REXEC_FAIL_ON_UNDERFLOW) && startpos < stringarg))
3353                 {
3354                     DEBUG_GPOS_r(Perl_re_printf( aTHX_
3355                             "fail: ganch-gofs before earliest possible start\n"));
3356                     return 0;
3357                 }
3358             }
3359             else
3360                 startpos = reginfo->ganch;
3361         }
3362         else if (prog->gofs) {
3363             startpos = HOPBACKc(startpos, prog->gofs);
3364             if (!startpos)
3365                 startpos = strbeg;
3366         }
3367         else if (prog->intflags & PREGf_GPOS_FLOAT)
3368             startpos = strbeg;
3369     }
3370 
3371     minlen = prog->minlen;
3372     if ((startpos + minlen) > strend || startpos < strbeg) {
3373 	DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3374                         "Regex match can't succeed, so not even tried\n"));
3375         return 0;
3376     }
3377 
3378     /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
3379      * which will call destuctors to reset PL_regmatch_state, free higher
3380      * PL_regmatch_slabs, and clean up regmatch_info_aux and
3381      * regmatch_info_aux_eval */
3382 
3383     oldsave = PL_savestack_ix;
3384 
3385     s = startpos;
3386 
3387     if ((prog->extflags & RXf_USE_INTUIT)
3388         && !(flags & REXEC_CHECKED))
3389     {
3390 	s = re_intuit_start(rx, sv, strbeg, startpos, strend,
3391                                     flags, NULL);
3392 	if (!s)
3393 	    return 0;
3394 
3395 	if (prog->extflags & RXf_CHECK_ALL) {
3396             /* we can match based purely on the result of INTUIT.
3397              * Set up captures etc just for $& and $-[0]
3398              * (an intuit-only match wont have $1,$2,..) */
3399             assert(!prog->nparens);
3400 
3401             /* s/// doesn't like it if $& is earlier than where we asked it to
3402              * start searching (which can happen on something like /.\G/) */
3403             if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
3404                     && (s < stringarg))
3405             {
3406                 /* this should only be possible under \G */
3407                 assert(prog->intflags & PREGf_GPOS_SEEN);
3408                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3409                     "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3410                 goto phooey;
3411             }
3412 
3413             /* match via INTUIT shouldn't have any captures.
3414              * Let @-, @+, $^N know */
3415             prog->lastparen = prog->lastcloseparen = 0;
3416             RXp_MATCH_UTF8_set(prog, utf8_target);
3417             prog->offs[0].start = s - strbeg;
3418             prog->offs[0].end = utf8_target
3419                 ? (char*)utf8_hop_forward((U8*)s, prog->minlenret, (U8 *) strend) - strbeg
3420                 : s - strbeg + prog->minlenret;
3421             if ( !(flags & REXEC_NOT_FIRST) )
3422                 S_reg_set_capture_string(aTHX_ rx,
3423                                         strbeg, strend,
3424                                         sv, flags, utf8_target);
3425 
3426 	    return 1;
3427         }
3428     }
3429 
3430     multiline = prog->extflags & RXf_PMf_MULTILINE;
3431 
3432     if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
3433         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3434 			      "String too short [regexec_flags]...\n"));
3435 	goto phooey;
3436     }
3437 
3438     /* Check validity of program. */
3439     if (UCHARAT(progi->program) != REG_MAGIC) {
3440 	Perl_croak(aTHX_ "corrupted regexp program");
3441     }
3442 
3443     RXp_MATCH_TAINTED_off(prog);
3444     RXp_MATCH_UTF8_set(prog, utf8_target);
3445 
3446     reginfo->prog = rx;	 /* Yes, sorry that this is confusing.  */
3447     reginfo->intuit = 0;
3448     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
3449     reginfo->warned = FALSE;
3450     reginfo->sv = sv;
3451     reginfo->poscache_maxiter = 0; /* not yet started a countdown */
3452     /* see how far we have to get to not match where we matched before */
3453     reginfo->till = stringarg + minend;
3454 
3455     if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
3456         /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
3457            S_cleanup_regmatch_info_aux has executed (registered by
3458            SAVEDESTRUCTOR_X below).  S_cleanup_regmatch_info_aux modifies
3459            magic belonging to this SV.
3460            Not newSVsv, either, as it does not COW.
3461         */
3462         reginfo->sv = newSV(0);
3463         SvSetSV_nosteal(reginfo->sv, sv);
3464         SAVEFREESV(reginfo->sv);
3465     }
3466 
3467     /* reserve next 2 or 3 slots in PL_regmatch_state:
3468      * slot N+0: may currently be in use: skip it
3469      * slot N+1: use for regmatch_info_aux struct
3470      * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
3471      * slot N+3: ready for use by regmatch()
3472      */
3473 
3474     {
3475         regmatch_state *old_regmatch_state;
3476         regmatch_slab  *old_regmatch_slab;
3477         int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
3478 
3479         /* on first ever match, allocate first slab */
3480         if (!PL_regmatch_slab) {
3481             Newx(PL_regmatch_slab, 1, regmatch_slab);
3482             PL_regmatch_slab->prev = NULL;
3483             PL_regmatch_slab->next = NULL;
3484             PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3485         }
3486 
3487         old_regmatch_state = PL_regmatch_state;
3488         old_regmatch_slab  = PL_regmatch_slab;
3489 
3490         for (i=0; i <= max; i++) {
3491             if (i == 1)
3492                 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
3493             else if (i ==2)
3494                 reginfo->info_aux_eval =
3495                 reginfo->info_aux->info_aux_eval =
3496                             &(PL_regmatch_state->u.info_aux_eval);
3497 
3498             if (++PL_regmatch_state >  SLAB_LAST(PL_regmatch_slab))
3499                 PL_regmatch_state = S_push_slab(aTHX);
3500         }
3501 
3502         /* note initial PL_regmatch_state position; at end of match we'll
3503          * pop back to there and free any higher slabs */
3504 
3505         reginfo->info_aux->old_regmatch_state = old_regmatch_state;
3506         reginfo->info_aux->old_regmatch_slab  = old_regmatch_slab;
3507         reginfo->info_aux->poscache = NULL;
3508 
3509         SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
3510 
3511         if ((prog->extflags & RXf_EVAL_SEEN))
3512             S_setup_eval_state(aTHX_ reginfo);
3513         else
3514             reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
3515     }
3516 
3517     /* If there is a "must appear" string, look for it. */
3518 
3519     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
3520         /* We have to be careful. If the previous successful match
3521            was from this regex we don't want a subsequent partially
3522            successful match to clobber the old results.
3523            So when we detect this possibility we add a swap buffer
3524            to the re, and switch the buffer each match. If we fail,
3525            we switch it back; otherwise we leave it swapped.
3526         */
3527         swap = prog->offs;
3528         /* avoid leak if we die, or clean up anyway if match completes */
3529         SAVEFREEPV(swap);
3530         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
3531         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
3532 	    "rex=0x%" UVxf " saving  offs: orig=0x%" UVxf " new=0x%" UVxf "\n",
3533 	    0,
3534             PTR2UV(prog),
3535 	    PTR2UV(swap),
3536 	    PTR2UV(prog->offs)
3537 	));
3538     }
3539 
3540     if (prog->recurse_locinput)
3541         Zero(prog->recurse_locinput,prog->nparens + 1, char *);
3542 
3543     /* Simplest case: anchored match need be tried only once, or with
3544      * MBOL, only at the beginning of each line.
3545      *
3546      * Note that /.*.../ sets PREGf_IMPLICIT|MBOL, while /.*.../s sets
3547      * PREGf_IMPLICIT|SBOL. The idea is that with /.*.../s, if it doesn't
3548      * match at the start of the string then it won't match anywhere else
3549      * either; while with /.*.../, if it doesn't match at the beginning,
3550      * the earliest it could match is at the start of the next line */
3551 
3552     if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
3553         char *end;
3554 
3555 	if (regtry(reginfo, &s))
3556 	    goto got_it;
3557 
3558         if (!(prog->intflags & PREGf_ANCH_MBOL))
3559             goto phooey;
3560 
3561         /* didn't match at start, try at other newline positions */
3562 
3563         if (minlen)
3564             dontbother = minlen - 1;
3565         end = HOP3c(strend, -dontbother, strbeg) - 1;
3566 
3567         /* skip to next newline */
3568 
3569         while (s <= end) { /* note it could be possible to match at the end of the string */
3570             /* NB: newlines are the same in unicode as they are in latin */
3571             if (*s++ != '\n')
3572                 continue;
3573             if (prog->check_substr || prog->check_utf8) {
3574             /* note that with PREGf_IMPLICIT, intuit can only fail
3575              * or return the start position, so it's of limited utility.
3576              * Nevertheless, I made the decision that the potential for
3577              * quick fail was still worth it - DAPM */
3578                 s = re_intuit_start(rx, sv, strbeg, s, strend, flags, NULL);
3579                 if (!s)
3580                     goto phooey;
3581             }
3582             if (regtry(reginfo, &s))
3583                 goto got_it;
3584         }
3585         goto phooey;
3586     } /* end anchored search */
3587 
3588     if (prog->intflags & PREGf_ANCH_GPOS)
3589     {
3590         /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
3591         assert(prog->intflags & PREGf_GPOS_SEEN);
3592         /* For anchored \G, the only position it can match from is
3593          * (ganch-gofs); we already set startpos to this above; if intuit
3594          * moved us on from there, we can't possibly succeed */
3595         assert(startpos == HOPBACKc(reginfo->ganch, prog->gofs));
3596 	if (s == startpos && regtry(reginfo, &s))
3597 	    goto got_it;
3598 	goto phooey;
3599     }
3600 
3601     /* Messy cases:  unanchored match. */
3602     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
3603 	/* we have /x+whatever/ */
3604 	/* it must be a one character string (XXXX Except is_utf8_pat?) */
3605 	char ch;
3606 #ifdef DEBUGGING
3607 	int did_match = 0;
3608 #endif
3609 	if (utf8_target) {
3610             if (! prog->anchored_utf8) {
3611                 to_utf8_substr(prog);
3612             }
3613             ch = SvPVX_const(prog->anchored_utf8)[0];
3614 	    REXEC_FBC_SCAN(0,   /* 0=>not-utf8 */
3615 		if (*s == ch) {
3616 		    DEBUG_EXECUTE_r( did_match = 1 );
3617 		    if (regtry(reginfo, &s)) goto got_it;
3618 		    s += UTF8_SAFE_SKIP(s, strend);
3619 		    while (s < strend && *s == ch)
3620 			s += UTF8SKIP(s);
3621 		}
3622 	    );
3623 
3624 	}
3625 	else {
3626             if (! prog->anchored_substr) {
3627                 if (! to_byte_substr(prog)) {
3628                     NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3629                 }
3630             }
3631             ch = SvPVX_const(prog->anchored_substr)[0];
3632 	    REXEC_FBC_SCAN(0,   /* 0=>not-utf8 */
3633 		if (*s == ch) {
3634 		    DEBUG_EXECUTE_r( did_match = 1 );
3635 		    if (regtry(reginfo, &s)) goto got_it;
3636 		    s++;
3637 		    while (s < strend && *s == ch)
3638 			s++;
3639 		}
3640 	    );
3641 	}
3642 	DEBUG_EXECUTE_r(if (!did_match)
3643                 Perl_re_printf( aTHX_
3644                                   "Did not find anchored character...\n")
3645                );
3646     }
3647     else if (prog->anchored_substr != NULL
3648 	      || prog->anchored_utf8 != NULL
3649 	      || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
3650 		  && prog->float_max_offset < strend - s)) {
3651 	SV *must;
3652 	SSize_t back_max;
3653 	SSize_t back_min;
3654 	char *last;
3655 	char *last1;		/* Last position checked before */
3656 #ifdef DEBUGGING
3657 	int did_match = 0;
3658 #endif
3659 	if (prog->anchored_substr || prog->anchored_utf8) {
3660 	    if (utf8_target) {
3661                 if (! prog->anchored_utf8) {
3662                     to_utf8_substr(prog);
3663                 }
3664                 must = prog->anchored_utf8;
3665             }
3666             else {
3667                 if (! prog->anchored_substr) {
3668                     if (! to_byte_substr(prog)) {
3669                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3670                     }
3671                 }
3672                 must = prog->anchored_substr;
3673             }
3674 	    back_max = back_min = prog->anchored_offset;
3675 	} else {
3676 	    if (utf8_target) {
3677                 if (! prog->float_utf8) {
3678                     to_utf8_substr(prog);
3679                 }
3680                 must = prog->float_utf8;
3681             }
3682             else {
3683                 if (! prog->float_substr) {
3684                     if (! to_byte_substr(prog)) {
3685                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3686                     }
3687                 }
3688                 must = prog->float_substr;
3689             }
3690 	    back_max = prog->float_max_offset;
3691 	    back_min = prog->float_min_offset;
3692 	}
3693 
3694         if (back_min<0) {
3695 	    last = strend;
3696 	} else {
3697             last = HOP3c(strend,	/* Cannot start after this */
3698         	  -(SSize_t)(CHR_SVLEN(must)
3699         		 - (SvTAIL(must) != 0) + back_min), strbeg);
3700         }
3701 	if (s > reginfo->strbeg)
3702 	    last1 = HOPc(s, -1);
3703 	else
3704 	    last1 = s - 1;	/* bogus */
3705 
3706 	/* XXXX check_substr already used to find "s", can optimize if
3707 	   check_substr==must. */
3708 	dontbother = 0;
3709 	strend = HOPc(strend, -dontbother);
3710 	while ( (s <= last) &&
3711 		(s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg,  strend),
3712 				  (unsigned char*)strend, must,
3713 				  multiline ? FBMrf_MULTILINE : 0)) ) {
3714 	    DEBUG_EXECUTE_r( did_match = 1 );
3715 	    if (HOPc(s, -back_max) > last1) {
3716 		last1 = HOPc(s, -back_min);
3717 		s = HOPc(s, -back_max);
3718 	    }
3719 	    else {
3720 		char * const t = (last1 >= reginfo->strbeg)
3721                                     ? HOPc(last1, 1) : last1 + 1;
3722 
3723 		last1 = HOPc(s, -back_min);
3724 		s = t;
3725 	    }
3726 	    if (utf8_target) {
3727 		while (s <= last1) {
3728 		    if (regtry(reginfo, &s))
3729 			goto got_it;
3730                     if (s >= last1) {
3731                         s++; /* to break out of outer loop */
3732                         break;
3733                     }
3734                     s += UTF8SKIP(s);
3735 		}
3736 	    }
3737 	    else {
3738 		while (s <= last1) {
3739 		    if (regtry(reginfo, &s))
3740 			goto got_it;
3741 		    s++;
3742 		}
3743 	    }
3744 	}
3745 	DEBUG_EXECUTE_r(if (!did_match) {
3746             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
3747                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
3748             Perl_re_printf( aTHX_  "Did not find %s substr %s%s...\n",
3749 			      ((must == prog->anchored_substr || must == prog->anchored_utf8)
3750 			       ? "anchored" : "floating"),
3751                 quoted, RE_SV_TAIL(must));
3752         });
3753 	goto phooey;
3754     }
3755     else if ( (c = progi->regstclass) ) {
3756 	if (minlen) {
3757 	    const OPCODE op = OP(progi->regstclass);
3758 	    /* don't bother with what can't match */
3759 	    if (PL_regkind[op] != EXACT && PL_regkind[op] != TRIE)
3760 	        strend = HOPc(strend, -(minlen - 1));
3761 	}
3762 	DEBUG_EXECUTE_r({
3763 	    SV * const prop = sv_newmortal();
3764             regprop(prog, prop, c, reginfo, NULL);
3765 	    {
3766 		RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
3767 		    s,strend-s,PL_dump_re_max_len);
3768                 Perl_re_printf( aTHX_
3769 		    "Matching stclass %.*s against %s (%d bytes)\n",
3770 		    (int)SvCUR(prop), SvPVX_const(prop),
3771 		     quoted, (int)(strend - s));
3772 	    }
3773 	});
3774         if (find_byclass(prog, c, s, strend, reginfo))
3775 	    goto got_it;
3776         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "Contradicts stclass... [regexec_flags]\n"));
3777     }
3778     else {
3779 	dontbother = 0;
3780 	if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
3781 	    /* Trim the end. */
3782 	    char *last= NULL;
3783 	    SV* float_real;
3784 	    STRLEN len;
3785 	    const char *little;
3786 
3787 	    if (utf8_target) {
3788                 if (! prog->float_utf8) {
3789                     to_utf8_substr(prog);
3790                 }
3791                 float_real = prog->float_utf8;
3792             }
3793             else {
3794                 if (! prog->float_substr) {
3795                     if (! to_byte_substr(prog)) {
3796                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3797                     }
3798                 }
3799                 float_real = prog->float_substr;
3800             }
3801 
3802             little = SvPV_const(float_real, len);
3803 	    if (SvTAIL(float_real)) {
3804                     /* This means that float_real contains an artificial \n on
3805                      * the end due to the presence of something like this:
3806                      * /foo$/ where we can match both "foo" and "foo\n" at the
3807                      * end of the string.  So we have to compare the end of the
3808                      * string first against the float_real without the \n and
3809                      * then against the full float_real with the string.  We
3810                      * have to watch out for cases where the string might be
3811                      * smaller than the float_real or the float_real without
3812                      * the \n. */
3813 		    char *checkpos= strend - len;
3814 		    DEBUG_OPTIMISE_r(
3815                         Perl_re_printf( aTHX_
3816 			    "%sChecking for float_real.%s\n",
3817 			    PL_colors[4], PL_colors[5]));
3818 		    if (checkpos + 1 < strbeg) {
3819                         /* can't match, even if we remove the trailing \n
3820                          * string is too short to match */
3821 			DEBUG_EXECUTE_r(
3822                             Perl_re_printf( aTHX_
3823 				"%sString shorter than required trailing substring, cannot match.%s\n",
3824 				PL_colors[4], PL_colors[5]));
3825 			goto phooey;
3826 		    } else if (memEQ(checkpos + 1, little, len - 1)) {
3827                         /* can match, the end of the string matches without the
3828                          * "\n" */
3829 			last = checkpos + 1;
3830 		    } else if (checkpos < strbeg) {
3831                         /* cant match, string is too short when the "\n" is
3832                          * included */
3833 			DEBUG_EXECUTE_r(
3834                             Perl_re_printf( aTHX_
3835 				"%sString does not contain required trailing substring, cannot match.%s\n",
3836 				PL_colors[4], PL_colors[5]));
3837 			goto phooey;
3838 		    } else if (!multiline) {
3839                         /* non multiline match, so compare with the "\n" at the
3840                          * end of the string */
3841 			if (memEQ(checkpos, little, len)) {
3842 			    last= checkpos;
3843 			} else {
3844 			    DEBUG_EXECUTE_r(
3845                                 Perl_re_printf( aTHX_
3846 				    "%sString does not contain required trailing substring, cannot match.%s\n",
3847 				    PL_colors[4], PL_colors[5]));
3848 			    goto phooey;
3849 			}
3850 		    } else {
3851                         /* multiline match, so we have to search for a place
3852                          * where the full string is located */
3853 			goto find_last;
3854 		    }
3855 	    } else {
3856 		  find_last:
3857 		    if (len)
3858 			last = rninstr(s, strend, little, little + len);
3859 		    else
3860 			last = strend;	/* matching "$" */
3861 	    }
3862 	    if (!last) {
3863                 /* at one point this block contained a comment which was
3864                  * probably incorrect, which said that this was a "should not
3865                  * happen" case.  Even if it was true when it was written I am
3866                  * pretty sure it is not anymore, so I have removed the comment
3867                  * and replaced it with this one. Yves */
3868 		DEBUG_EXECUTE_r(
3869                     Perl_re_printf( aTHX_
3870 			"%sString does not contain required substring, cannot match.%s\n",
3871                         PL_colors[4], PL_colors[5]
3872 	            ));
3873 		goto phooey;
3874 	    }
3875 	    dontbother = strend - last + prog->float_min_offset;
3876 	}
3877 	if (minlen && (dontbother < minlen))
3878 	    dontbother = minlen - 1;
3879 	strend -= dontbother; 		   /* this one's always in bytes! */
3880 	/* We don't know much -- general case. */
3881 	if (utf8_target) {
3882 	    for (;;) {
3883 		if (regtry(reginfo, &s))
3884 		    goto got_it;
3885 		if (s >= strend)
3886 		    break;
3887 		s += UTF8SKIP(s);
3888 	    };
3889 	}
3890 	else {
3891 	    do {
3892 		if (regtry(reginfo, &s))
3893 		    goto got_it;
3894 	    } while (s++ < strend);
3895 	}
3896     }
3897 
3898     /* Failure. */
3899     goto phooey;
3900 
3901   got_it:
3902     /* s/// doesn't like it if $& is earlier than where we asked it to
3903      * start searching (which can happen on something like /.\G/) */
3904     if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
3905             && (prog->offs[0].start < stringarg - strbeg))
3906     {
3907         /* this should only be possible under \G */
3908         assert(prog->intflags & PREGf_GPOS_SEEN);
3909         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3910             "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3911         goto phooey;
3912     }
3913 
3914     /* clean up; this will trigger destructors that will free all slabs
3915      * above the current one, and cleanup the regmatch_info_aux
3916      * and regmatch_info_aux_eval sructs */
3917 
3918     LEAVE_SCOPE(oldsave);
3919 
3920     if (RXp_PAREN_NAMES(prog))
3921         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
3922 
3923     /* make sure $`, $&, $', and $digit will work later */
3924     if ( !(flags & REXEC_NOT_FIRST) )
3925         S_reg_set_capture_string(aTHX_ rx,
3926                                     strbeg, reginfo->strend,
3927                                     sv, flags, utf8_target);
3928 
3929     return 1;
3930 
3931   phooey:
3932     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch failed%s\n",
3933 			  PL_colors[4], PL_colors[5]));
3934 
3935     if (swap) {
3936         /* we failed :-( roll it back.
3937          * Since the swap buffer will be freed on scope exit which follows
3938          * shortly, restore the old captures by copying 'swap's original
3939          * data to the new offs buffer
3940          */
3941         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
3942 	    "rex=0x%" UVxf " rolling back offs: 0x%" UVxf " will be freed; restoring data to =0x%" UVxf "\n",
3943 	    0,
3944             PTR2UV(prog),
3945 	    PTR2UV(prog->offs),
3946 	    PTR2UV(swap)
3947 	));
3948 
3949         Copy(swap, prog->offs, prog->nparens + 1, regexp_paren_pair);
3950     }
3951 
3952     /* clean up; this will trigger destructors that will free all slabs
3953      * above the current one, and cleanup the regmatch_info_aux
3954      * and regmatch_info_aux_eval sructs */
3955 
3956     LEAVE_SCOPE(oldsave);
3957 
3958     return 0;
3959 }
3960 
3961 
3962 /* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
3963  * Do inc before dec, in case old and new rex are the same */
3964 #define SET_reg_curpm(Re2)                          \
3965     if (reginfo->info_aux_eval) {                   \
3966 	(void)ReREFCNT_inc(Re2);		    \
3967 	ReREFCNT_dec(PM_GETRE(PL_reg_curpm));	    \
3968 	PM_SETRE((PL_reg_curpm), (Re2));	    \
3969     }
3970 
3971 
3972 /*
3973  - regtry - try match at specific point
3974  */
3975 STATIC bool			/* 0 failure, 1 success */
3976 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
3977 {
3978     CHECKPOINT lastcp;
3979     REGEXP *const rx = reginfo->prog;
3980     regexp *const prog = ReANY(rx);
3981     SSize_t result;
3982 #ifdef DEBUGGING
3983     U32 depth = 0; /* used by REGCP_SET */
3984 #endif
3985     RXi_GET_DECL(prog,progi);
3986     DECLARE_AND_GET_RE_DEBUG_FLAGS;
3987 
3988     PERL_ARGS_ASSERT_REGTRY;
3989 
3990     reginfo->cutpoint=NULL;
3991 
3992     prog->offs[0].start = *startposp - reginfo->strbeg;
3993     prog->lastparen = 0;
3994     prog->lastcloseparen = 0;
3995 
3996     /* XXXX What this code is doing here?!!!  There should be no need
3997        to do this again and again, prog->lastparen should take care of
3998        this!  --ilya*/
3999 
4000     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
4001      * Actually, the code in regcppop() (which Ilya may be meaning by
4002      * prog->lastparen), is not needed at all by the test suite
4003      * (op/regexp, op/pat, op/split), but that code is needed otherwise
4004      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
4005      * Meanwhile, this code *is* needed for the
4006      * above-mentioned test suite tests to succeed.  The common theme
4007      * on those tests seems to be returning null fields from matches.
4008      * --jhi updated by dapm */
4009 
4010     /* After encountering a variant of the issue mentioned above I think
4011      * the point Ilya was making is that if we properly unwind whenever
4012      * we set lastparen to a smaller value then we should not need to do
4013      * this every time, only when needed. So if we have tests that fail if
4014      * we remove this, then it suggests somewhere else we are improperly
4015      * unwinding the lastparen/paren buffers. See UNWIND_PARENS() and
4016      * places it is called, and related regcp() routines. - Yves */
4017 #if 1
4018     if (prog->nparens) {
4019 	regexp_paren_pair *pp = prog->offs;
4020 	I32 i;
4021 	for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
4022 	    ++pp;
4023 	    pp->start = -1;
4024 	    pp->end = -1;
4025 	}
4026     }
4027 #endif
4028     REGCP_SET(lastcp);
4029     result = regmatch(reginfo, *startposp, progi->program + 1);
4030     if (result != -1) {
4031 	prog->offs[0].end = result;
4032 	return 1;
4033     }
4034     if (reginfo->cutpoint)
4035         *startposp= reginfo->cutpoint;
4036     REGCP_UNWIND(lastcp);
4037     return 0;
4038 }
4039 
4040 /* this is used to determine how far from the left messages like
4041    'failed...' are printed in regexec.c. It should be set such that
4042    messages are inline with the regop output that created them.
4043 */
4044 #define REPORT_CODE_OFF 29
4045 #define INDENT_CHARS(depth) ((int)(depth) % 20)
4046 #ifdef DEBUGGING
4047 int
4048 Perl_re_exec_indentf(pTHX_ const char *fmt, U32 depth, ...)
4049 {
4050     va_list ap;
4051     int result;
4052     PerlIO *f= Perl_debug_log;
4053     PERL_ARGS_ASSERT_RE_EXEC_INDENTF;
4054     va_start(ap, depth);
4055     PerlIO_printf(f, "%*s|%4" UVuf "| %*s", REPORT_CODE_OFF, "", (UV)depth, INDENT_CHARS(depth), "" );
4056     result = PerlIO_vprintf(f, fmt, ap);
4057     va_end(ap);
4058     return result;
4059 }
4060 #endif /* DEBUGGING */
4061 
4062 /* grab a new slab and return the first slot in it */
4063 
4064 STATIC regmatch_state *
4065 S_push_slab(pTHX)
4066 {
4067     regmatch_slab *s = PL_regmatch_slab->next;
4068     if (!s) {
4069 	Newx(s, 1, regmatch_slab);
4070 	s->prev = PL_regmatch_slab;
4071 	s->next = NULL;
4072 	PL_regmatch_slab->next = s;
4073     }
4074     PL_regmatch_slab = s;
4075     return SLAB_FIRST(s);
4076 }
4077 
4078 #ifdef DEBUGGING
4079 
4080 STATIC void
4081 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
4082     const char *start, const char *end, const char *blurb)
4083 {
4084     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
4085 
4086     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
4087 
4088     if (!PL_colorset)
4089             reginitcolors();
4090     {
4091         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
4092             RX_PRECOMP_const(prog), RX_PRELEN(prog), PL_dump_re_max_len);
4093 
4094         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
4095             start, end - start, PL_dump_re_max_len);
4096 
4097         Perl_re_printf( aTHX_
4098             "%s%s REx%s %s against %s\n",
4099 		       PL_colors[4], blurb, PL_colors[5], s0, s1);
4100 
4101         if (utf8_target||utf8_pat)
4102             Perl_re_printf( aTHX_  "UTF-8 %s%s%s...\n",
4103                 utf8_pat ? "pattern" : "",
4104                 utf8_pat && utf8_target ? " and " : "",
4105                 utf8_target ? "string" : ""
4106             );
4107     }
4108 }
4109 
4110 STATIC void
4111 S_dump_exec_pos(pTHX_ const char *locinput,
4112                       const regnode *scan,
4113                       const char *loc_regeol,
4114                       const char *loc_bostr,
4115                       const char *loc_reg_starttry,
4116                       const bool utf8_target,
4117                       const U32 depth
4118                 )
4119 {
4120     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
4121     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
4122     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
4123     /* The part of the string before starttry has one color
4124        (pref0_len chars), between starttry and current
4125        position another one (pref_len - pref0_len chars),
4126        after the current position the third one.
4127        We assume that pref0_len <= pref_len, otherwise we
4128        decrease pref0_len.  */
4129     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
4130 	? (5 + taill) - l : locinput - loc_bostr;
4131     int pref0_len;
4132 
4133     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
4134 
4135     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
4136 	pref_len++;
4137     pref0_len = pref_len  - (locinput - loc_reg_starttry);
4138     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
4139 	l = ( loc_regeol - locinput > (5 + taill) - pref_len
4140 	      ? (5 + taill) - pref_len : loc_regeol - locinput);
4141     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
4142 	l--;
4143     if (pref0_len < 0)
4144 	pref0_len = 0;
4145     if (pref0_len > pref_len)
4146 	pref0_len = pref_len;
4147     {
4148 	const int is_uni = utf8_target ? 1 : 0;
4149 
4150 	RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
4151 	    (locinput - pref_len),pref0_len, PL_dump_re_max_len, 4, 5);
4152 
4153 	RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
4154 		    (locinput - pref_len + pref0_len),
4155 		    pref_len - pref0_len, PL_dump_re_max_len, 2, 3);
4156 
4157 	RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
4158 		    locinput, loc_regeol - locinput, 10, 0, 1);
4159 
4160 	const STRLEN tlen=len0+len1+len2;
4161         Perl_re_printf( aTHX_
4162                     "%4" IVdf " <%.*s%.*s%s%.*s>%*s|%4u| ",
4163 		    (IV)(locinput - loc_bostr),
4164 		    len0, s0,
4165 		    len1, s1,
4166 		    (docolor ? "" : "> <"),
4167 		    len2, s2,
4168 		    (int)(tlen > 19 ? 0 :  19 - tlen),
4169                     "",
4170                     depth);
4171     }
4172 }
4173 
4174 #endif
4175 
4176 /* reg_check_named_buff_matched()
4177  * Checks to see if a named buffer has matched. The data array of
4178  * buffer numbers corresponding to the buffer is expected to reside
4179  * in the regexp->data->data array in the slot stored in the ARG() of
4180  * node involved. Note that this routine doesn't actually care about the
4181  * name, that information is not preserved from compilation to execution.
4182  * Returns the index of the leftmost defined buffer with the given name
4183  * or 0 if non of the buffers matched.
4184  */
4185 STATIC I32
4186 S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan)
4187 {
4188     I32 n;
4189     RXi_GET_DECL(rex,rexi);
4190     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4191     I32 *nums=(I32*)SvPVX(sv_dat);
4192 
4193     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
4194 
4195     for ( n=0; n<SvIVX(sv_dat); n++ ) {
4196         if ((I32)rex->lastparen >= nums[n] &&
4197             rex->offs[nums[n]].end != -1)
4198         {
4199             return nums[n];
4200         }
4201     }
4202     return 0;
4203 }
4204 
4205 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
4206 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
4207 #define CHRTEST_NOT_A_CP_1 -999
4208 #define CHRTEST_NOT_A_CP_2 -998
4209 
4210 static bool
4211 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
4212         U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
4213 {
4214     /* This function determines if there are zero, one, two, or more characters
4215      * that match the first character of the passed-in EXACTish node
4216      * <text_node>, and if there are one or two, it returns them in the
4217      * passed-in pointers.
4218      *
4219      * If it determines that no possible character in the target string can
4220      * match, it returns FALSE; otherwise TRUE.  (The FALSE situation occurs if
4221      * the first character in <text_node> requires UTF-8 to represent, and the
4222      * target string isn't in UTF-8.)
4223      *
4224      * If there are more than two characters that could match the beginning of
4225      * <text_node>, or if more context is required to determine a match or not,
4226      * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
4227      *
4228      * The motiviation behind this function is to allow the caller to set up
4229      * tight loops for matching.  If <text_node> is of type EXACT, there is
4230      * only one possible character that can match its first character, and so
4231      * the situation is quite simple.  But things get much more complicated if
4232      * folding is involved.  It may be that the first character of an EXACTFish
4233      * node doesn't participate in any possible fold, e.g., punctuation, so it
4234      * can be matched only by itself.  The vast majority of characters that are
4235      * in folds match just two things, their lower and upper-case equivalents.
4236      * But not all are like that; some have multiple possible matches, or match
4237      * sequences of more than one character.  This function sorts all that out.
4238      *
4239      * Consider the patterns A*B or A*?B where A and B are arbitrary.  In a
4240      * loop of trying to match A*, we know we can't exit where the thing
4241      * following it isn't a B.  And something can't be a B unless it is the
4242      * beginning of B.  By putting a quick test for that beginning in a tight
4243      * loop, we can rule out things that can't possibly be B without having to
4244      * break out of the loop, thus avoiding work.  Similarly, if A is a single
4245      * character, we can make a tight loop matching A*, using the outputs of
4246      * this function.
4247      *
4248      * If the target string to match isn't in UTF-8, and there aren't
4249      * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
4250      * the one or two possible octets (which are characters in this situation)
4251      * that can match.  In all cases, if there is only one character that can
4252      * match, *<c1p> and *<c2p> will be identical.
4253      *
4254      * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
4255      * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
4256      * can match the beginning of <text_node>.  They should be declared with at
4257      * least length UTF8_MAXBYTES+1.  (If the target string isn't in UTF-8, it is
4258      * undefined what these contain.)  If one or both of the buffers are
4259      * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
4260      * corresponding invariant.  If variant, the corresponding *<c1p> and/or
4261      * *<c2p> will be set to a negative number(s) that shouldn't match any code
4262      * point (unless inappropriately coerced to unsigned).   *<c1p> will equal
4263      * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
4264 
4265     const bool utf8_target = reginfo->is_utf8_target;
4266 
4267     UV c1 = (UV)CHRTEST_NOT_A_CP_1;
4268     UV c2 = (UV)CHRTEST_NOT_A_CP_2;
4269     bool use_chrtest_void = FALSE;
4270     const bool is_utf8_pat = reginfo->is_utf8_pat;
4271 
4272     /* Used when we have both utf8 input and utf8 output, to avoid converting
4273      * to/from code points */
4274     bool utf8_has_been_setup = FALSE;
4275 
4276     dVAR;
4277 
4278     U8 *pat = (U8*)STRING(text_node);
4279     U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
4280 
4281     if (   OP(text_node) == EXACT
4282         || OP(text_node) == LEXACT
4283         || OP(text_node) == EXACT_REQ8
4284         || OP(text_node) == LEXACT_REQ8
4285         || OP(text_node) == EXACTL)
4286     {
4287 
4288         /* In an exact node, only one thing can be matched, that first
4289          * character.  If both the pat and the target are UTF-8, we can just
4290          * copy the input to the output, avoiding finding the code point of
4291          * that character */
4292         if (!is_utf8_pat) {
4293             assert(   OP(text_node) != EXACT_REQ8
4294                    && OP(text_node) != LEXACT_REQ8);
4295             c2 = c1 = *pat;
4296         }
4297         else if (utf8_target) {
4298             Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
4299             Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
4300             utf8_has_been_setup = TRUE;
4301         }
4302         else if (   OP(text_node) == EXACT_REQ8
4303                  || OP(text_node) == LEXACT_REQ8)
4304         {
4305             return FALSE;   /* Can only match UTF-8 target */
4306         }
4307         else {
4308             c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
4309         }
4310     }
4311     else { /* an EXACTFish node */
4312         U8 *pat_end = pat + STR_LENs(text_node);
4313 
4314         /* An EXACTFL node has at least some characters unfolded, because what
4315          * they match is not known until now.  So, now is the time to fold
4316          * the first few of them, as many as are needed to determine 'c1' and
4317          * 'c2' later in the routine.  If the pattern isn't UTF-8, we only need
4318          * to fold if in a UTF-8 locale, and then only the Sharp S; everything
4319          * else is 1-1 and isn't assumed to be folded.  In a UTF-8 pattern, we
4320          * need to fold as many characters as a single character can fold to,
4321          * so that later we can check if the first ones are such a multi-char
4322          * fold.  But, in such a pattern only locale-problematic characters
4323          * aren't folded, so we can skip this completely if the first character
4324          * in the node isn't one of the tricky ones */
4325         if (OP(text_node) == EXACTFL) {
4326 
4327             if (! is_utf8_pat) {
4328                 if (IN_UTF8_CTYPE_LOCALE && *pat == LATIN_SMALL_LETTER_SHARP_S)
4329                 {
4330                     folded[0] = folded[1] = 's';
4331                     pat = folded;
4332                     pat_end = folded + 2;
4333                 }
4334             }
4335             else if (is_PROBLEMATIC_LOCALE_FOLDEDS_START_utf8(pat)) {
4336                 U8 *s = pat;
4337                 U8 *d = folded;
4338                 int i;
4339 
4340                 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < pat_end; i++) {
4341                     if (isASCII(*s) && LIKELY(! PL_in_utf8_turkic_locale)) {
4342                         *(d++) = (U8) toFOLD_LC(*s);
4343                         s++;
4344                     }
4345                     else {
4346                         STRLEN len;
4347                         _toFOLD_utf8_flags(s,
4348                                            pat_end,
4349                                            d,
4350                                            &len,
4351                                            FOLD_FLAGS_FULL | FOLD_FLAGS_LOCALE);
4352                         d += len;
4353                         s += UTF8SKIP(s);
4354                     }
4355                 }
4356 
4357                 pat = folded;
4358                 pat_end = d;
4359             }
4360         }
4361 
4362         if (    ( is_utf8_pat && is_MULTI_CHAR_FOLD_utf8_safe(pat, pat_end))
4363              || (!is_utf8_pat && is_MULTI_CHAR_FOLD_latin1_safe(pat, pat_end)))
4364         {
4365             /* Multi-character folds require more context to sort out.  Also
4366              * PL_utf8_foldclosures used below doesn't handle them, so have to
4367              * be handled outside this routine */
4368             use_chrtest_void = TRUE;
4369         }
4370         else { /* an EXACTFish node which doesn't begin with a multi-char fold */
4371             c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
4372 
4373             if (   UNLIKELY(PL_in_utf8_turkic_locale)
4374                 && OP(text_node) == EXACTFL
4375                 && UNLIKELY(   c1 == 'i' || c1 == 'I'
4376                             || c1 == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE
4377                             || c1 == LATIN_SMALL_LETTER_DOTLESS_I))
4378             {   /* Hard-coded Turkish locale rules for these 4 characters
4379                    override normal rules */
4380                 if (c1 == 'i') {
4381                     c2 = LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE;
4382                 }
4383                 else if (c1 == 'I') {
4384                     c2 = LATIN_SMALL_LETTER_DOTLESS_I;
4385                 }
4386                 else if (c1 == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
4387                     c2 = 'i';
4388                 }
4389                 else if (c1 == LATIN_SMALL_LETTER_DOTLESS_I) {
4390                     c2 = 'I';
4391                 }
4392             }
4393             else if (c1 > 255) {
4394                 const U32 * remaining_folds;
4395                 U32 first_fold;
4396 
4397                 /* Look up what code points (besides c1) fold to c1;  e.g.,
4398                  * [ 'K', KELVIN_SIGN ] both fold to 'k'. */
4399                 Size_t folds_count = _inverse_folds(c1, &first_fold,
4400                                                        &remaining_folds);
4401                 if (folds_count == 0) {
4402                     c2 = c1;    /* there is only a single character that could
4403                                    match */
4404                 }
4405                 else if (folds_count != 1) {
4406                     /* If there aren't exactly two folds to this (itself and
4407                      * another), it is outside the scope of this function */
4408                     use_chrtest_void = TRUE;
4409                 }
4410                 else {  /* There are two.  We already have one, get the other */
4411                     c2 = first_fold;
4412 
4413                     /* Folds that cross the 255/256 boundary are forbidden if
4414                      * EXACTFL (and isnt a UTF8 locale), or EXACTFAA and one is
4415                      * ASCIII.  The only other match to c1 is c2, and since c1
4416                      * is above 255, c2 better be as well under these
4417                      * circumstances.  If it isn't, it means the only legal
4418                      * match of c1 is itself. */
4419                     if (    c2 < 256
4420                         && (   (   OP(text_node) == EXACTFL
4421                                 && ! IN_UTF8_CTYPE_LOCALE)
4422                             || ((     OP(text_node) == EXACTFAA
4423                                    || OP(text_node) == EXACTFAA_NO_TRIE)
4424                                 && (isASCII(c1) || isASCII(c2)))))
4425                     {
4426                         c2 = c1;
4427                     }
4428                 }
4429             }
4430             else /* Here, c1 is <= 255 */
4431                 if (   utf8_target
4432                     && HAS_NONLATIN1_FOLD_CLOSURE(c1)
4433                     && ( ! (OP(text_node) == EXACTFL && ! IN_UTF8_CTYPE_LOCALE))
4434                     && (   (   OP(text_node) != EXACTFAA
4435                             && OP(text_node) != EXACTFAA_NO_TRIE)
4436                         ||   ! isASCII(c1)))
4437             {
4438                 /* Here, there could be something above Latin1 in the target
4439                  * which folds to this character in the pattern.  All such
4440                  * cases except LATIN SMALL LETTER Y WITH DIAERESIS have more
4441                  * than two characters involved in their folds, so are outside
4442                  * the scope of this function */
4443                 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
4444                     c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
4445                 }
4446                 else {
4447                     use_chrtest_void = TRUE;
4448                 }
4449             }
4450             else { /* Here nothing above Latin1 can fold to the pattern
4451                       character */
4452                 switch (OP(text_node)) {
4453 
4454                     case EXACTFL:   /* /l rules */
4455                         c2 = PL_fold_locale[c1];
4456                         break;
4457 
4458                     case EXACTF:   /* This node only generated for non-utf8
4459                                     patterns */
4460                         assert(! is_utf8_pat);
4461                         if (! utf8_target) {    /* /d rules */
4462                             c2 = PL_fold[c1];
4463                             break;
4464                         }
4465                         /* FALLTHROUGH */
4466                         /* /u rules for all these.  This happens to work for
4467                         * EXACTFAA as nothing in Latin1 folds to ASCII */
4468                     case EXACTFAA_NO_TRIE:   /* This node only generated for
4469                                                 non-utf8 patterns */
4470                         assert(! is_utf8_pat);
4471                         /* FALLTHROUGH */
4472                     case EXACTFAA:
4473                     case EXACTFUP:
4474                     case EXACTFU:
4475                         c2 = PL_fold_latin1[c1];
4476                         break;
4477                     case EXACTFU_REQ8:
4478                         return FALSE;
4479                         NOT_REACHED; /* NOTREACHED */
4480 
4481                     default:
4482                         Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
4483                         NOT_REACHED; /* NOTREACHED */
4484                 }
4485             }
4486         }
4487     }
4488 
4489     /* Here have figured things out.  Set up the returns */
4490     if (use_chrtest_void) {
4491         *c2p = *c1p = CHRTEST_VOID;
4492     }
4493     else if (utf8_target) {
4494         if (! utf8_has_been_setup) {    /* Don't have the utf8; must get it */
4495             uvchr_to_utf8(c1_utf8, c1);
4496             uvchr_to_utf8(c2_utf8, c2);
4497         }
4498 
4499         /* Invariants are stored in both the utf8 and byte outputs; Use
4500          * negative numbers otherwise for the byte ones.  Make sure that the
4501          * byte ones are the same iff the utf8 ones are the same */
4502         *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
4503         *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
4504                 ? *c2_utf8
4505                 : (c1 == c2)
4506                   ? CHRTEST_NOT_A_CP_1
4507                   : CHRTEST_NOT_A_CP_2;
4508     }
4509     else if (c1 > 255) {
4510        if (c2 > 255) {  /* both possibilities are above what a non-utf8 string
4511                            can represent */
4512            return FALSE;
4513        }
4514 
4515        *c1p = *c2p = c2;    /* c2 is the only representable value */
4516     }
4517     else {  /* c1 is representable; see about c2 */
4518        *c1p = c1;
4519        *c2p = (c2 < 256) ? c2 : c1;
4520     }
4521 
4522     return TRUE;
4523 }
4524 
4525 STATIC bool
4526 S_isGCB(pTHX_ const GCB_enum before, const GCB_enum after, const U8 * const strbeg, const U8 * const curpos, const bool utf8_target)
4527 {
4528     /* returns a boolean indicating if there is a Grapheme Cluster Boundary
4529      * between the inputs.  See https://www.unicode.org/reports/tr29/. */
4530 
4531     PERL_ARGS_ASSERT_ISGCB;
4532 
4533     switch (GCB_table[before][after]) {
4534         case GCB_BREAKABLE:
4535             return TRUE;
4536 
4537         case GCB_NOBREAK:
4538             return FALSE;
4539 
4540         case GCB_RI_then_RI:
4541             {
4542                 int RI_count = 1;
4543                 U8 * temp_pos = (U8 *) curpos;
4544 
4545                 /* Do not break within emoji flag sequences. That is, do not
4546                  * break between regional indicator (RI) symbols if there is an
4547                  * odd number of RI characters before the break point.
4548                  *  GB12   sot (RI RI)* RI × RI
4549                  *  GB13 [^RI] (RI RI)* RI × RI */
4550 
4551                 while (backup_one_GCB(strbeg,
4552                                     &temp_pos,
4553                                     utf8_target) == GCB_Regional_Indicator)
4554                 {
4555                     RI_count++;
4556                 }
4557 
4558                 return RI_count % 2 != 1;
4559             }
4560 
4561         case GCB_EX_then_EM:
4562 
4563             /* GB10  ( E_Base | E_Base_GAZ ) Extend* ×  E_Modifier */
4564             {
4565                 U8 * temp_pos = (U8 *) curpos;
4566                 GCB_enum prev;
4567 
4568                 do {
4569                     prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
4570                 }
4571                 while (prev == GCB_Extend);
4572 
4573                 return prev != GCB_E_Base && prev != GCB_E_Base_GAZ;
4574             }
4575 
4576         case GCB_Maybe_Emoji_NonBreak:
4577 
4578             {
4579 
4580             /* Do not break within emoji modifier sequences or emoji zwj sequences.
4581               GB11 \p{Extended_Pictographic} Extend* ZWJ × \p{Extended_Pictographic}
4582               */
4583                 U8 * temp_pos = (U8 *) curpos;
4584                 GCB_enum prev;
4585 
4586                 do {
4587                     prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
4588                 }
4589                 while (prev == GCB_Extend);
4590 
4591                 return prev != GCB_ExtPict_XX;
4592             }
4593 
4594         default:
4595             break;
4596     }
4597 
4598 #ifdef DEBUGGING
4599     Perl_re_printf( aTHX_  "Unhandled GCB pair: GCB_table[%d, %d] = %d\n",
4600                                   before, after, GCB_table[before][after]);
4601     assert(0);
4602 #endif
4603     return TRUE;
4604 }
4605 
4606 STATIC GCB_enum
4607 S_backup_one_GCB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
4608 {
4609     dVAR;
4610     GCB_enum gcb;
4611 
4612     PERL_ARGS_ASSERT_BACKUP_ONE_GCB;
4613 
4614     if (*curpos < strbeg) {
4615         return GCB_EDGE;
4616     }
4617 
4618     if (utf8_target) {
4619         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
4620         U8 * prev_prev_char_pos;
4621 
4622         if (! prev_char_pos) {
4623             return GCB_EDGE;
4624         }
4625 
4626         if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
4627             gcb = getGCB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
4628             *curpos = prev_char_pos;
4629             prev_char_pos = prev_prev_char_pos;
4630         }
4631         else {
4632             *curpos = (U8 *) strbeg;
4633             return GCB_EDGE;
4634         }
4635     }
4636     else {
4637         if (*curpos - 2 < strbeg) {
4638             *curpos = (U8 *) strbeg;
4639             return GCB_EDGE;
4640         }
4641         (*curpos)--;
4642         gcb = getGCB_VAL_CP(*(*curpos - 1));
4643     }
4644 
4645     return gcb;
4646 }
4647 
4648 /* Combining marks attach to most classes that precede them, but this defines
4649  * the exceptions (from TR14) */
4650 #define LB_CM_ATTACHES_TO(prev) ( ! (   prev == LB_EDGE                 \
4651                                      || prev == LB_Mandatory_Break      \
4652                                      || prev == LB_Carriage_Return      \
4653                                      || prev == LB_Line_Feed            \
4654                                      || prev == LB_Next_Line            \
4655                                      || prev == LB_Space                \
4656                                      || prev == LB_ZWSpace))
4657 
4658 STATIC bool
4659 S_isLB(pTHX_ LB_enum before,
4660              LB_enum after,
4661              const U8 * const strbeg,
4662              const U8 * const curpos,
4663              const U8 * const strend,
4664              const bool utf8_target)
4665 {
4666     U8 * temp_pos = (U8 *) curpos;
4667     LB_enum prev = before;
4668 
4669     /* Is the boundary between 'before' and 'after' line-breakable?
4670      * Most of this is just a table lookup of a generated table from Unicode
4671      * rules.  But some rules require context to decide, and so have to be
4672      * implemented in code */
4673 
4674     PERL_ARGS_ASSERT_ISLB;
4675 
4676     /* Rule numbers in the comments below are as of Unicode 9.0 */
4677 
4678   redo:
4679     before = prev;
4680     switch (LB_table[before][after]) {
4681         case LB_BREAKABLE:
4682             return TRUE;
4683 
4684         case LB_NOBREAK:
4685         case LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
4686             return FALSE;
4687 
4688         case LB_SP_foo + LB_BREAKABLE:
4689         case LB_SP_foo + LB_NOBREAK:
4690         case LB_SP_foo + LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
4691 
4692             /* When we have something following a SP, we have to look at the
4693              * context in order to know what to do.
4694              *
4695              * SP SP should not reach here because LB7: Do not break before
4696              * spaces.  (For two spaces in a row there is nothing that
4697              * overrides that) */
4698             assert(after != LB_Space);
4699 
4700             /* Here we have a space followed by a non-space.  Mostly this is a
4701              * case of LB18: "Break after spaces".  But there are complications
4702              * as the handling of spaces is somewhat tricky.  They are in a
4703              * number of rules, which have to be applied in priority order, but
4704              * something earlier in the string can cause a rule to be skipped
4705              * and a lower priority rule invoked.  A prime example is LB7 which
4706              * says don't break before a space.  But rule LB8 (lower priority)
4707              * says that the first break opportunity after a ZW is after any
4708              * span of spaces immediately after it.  If a ZW comes before a SP
4709              * in the input, rule LB8 applies, and not LB7.  Other such rules
4710              * involve combining marks which are rules 9 and 10, but they may
4711              * override higher priority rules if they come earlier in the
4712              * string.  Since we're doing random access into the middle of the
4713              * string, we have to look for rules that should get applied based
4714              * on both string position and priority.  Combining marks do not
4715              * attach to either ZW nor SP, so we don't have to consider them
4716              * until later.
4717              *
4718              * To check for LB8, we have to find the first non-space character
4719              * before this span of spaces */
4720             do {
4721                 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4722             }
4723             while (prev == LB_Space);
4724 
4725             /* LB8 Break before any character following a zero-width space,
4726              * even if one or more spaces intervene.
4727              *      ZW SP* ÷
4728              * So if we have a ZW just before this span, and to get here this
4729              * is the final space in the span. */
4730             if (prev == LB_ZWSpace) {
4731                 return TRUE;
4732             }
4733 
4734             /* Here, not ZW SP+.  There are several rules that have higher
4735              * priority than LB18 and can be resolved now, as they don't depend
4736              * on anything earlier in the string (except ZW, which we have
4737              * already handled).  One of these rules is LB11 Do not break
4738              * before Word joiner, but we have specially encoded that in the
4739              * lookup table so it is caught by the single test below which
4740              * catches the other ones. */
4741             if (LB_table[LB_Space][after] - LB_SP_foo
4742                                             == LB_NOBREAK_EVEN_WITH_SP_BETWEEN)
4743             {
4744                 return FALSE;
4745             }
4746 
4747             /* If we get here, we have to XXX consider combining marks. */
4748             if (prev == LB_Combining_Mark) {
4749 
4750                 /* What happens with these depends on the character they
4751                  * follow.  */
4752                 do {
4753                     prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4754                 }
4755                 while (prev == LB_Combining_Mark);
4756 
4757                 /* Most times these attach to and inherit the characteristics
4758                  * of that character, but not always, and when not, they are to
4759                  * be treated as AL by rule LB10. */
4760                 if (! LB_CM_ATTACHES_TO(prev)) {
4761                     prev = LB_Alphabetic;
4762                 }
4763             }
4764 
4765             /* Here, we have the character preceding the span of spaces all set
4766              * up.  We follow LB18: "Break after spaces" unless the table shows
4767              * that is overriden */
4768             return LB_table[prev][after] != LB_NOBREAK_EVEN_WITH_SP_BETWEEN;
4769 
4770         case LB_CM_ZWJ_foo:
4771 
4772             /* We don't know how to treat the CM except by looking at the first
4773              * non-CM character preceding it.  ZWJ is treated as CM */
4774             do {
4775                 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4776             }
4777             while (prev == LB_Combining_Mark || prev == LB_ZWJ);
4778 
4779             /* Here, 'prev' is that first earlier non-CM character.  If the CM
4780              * attatches to it, then it inherits the behavior of 'prev'.  If it
4781              * doesn't attach, it is to be treated as an AL */
4782             if (! LB_CM_ATTACHES_TO(prev)) {
4783                 prev = LB_Alphabetic;
4784             }
4785 
4786             goto redo;
4787 
4788         case LB_HY_or_BA_then_foo + LB_BREAKABLE:
4789         case LB_HY_or_BA_then_foo + LB_NOBREAK:
4790 
4791             /* LB21a Don't break after Hebrew + Hyphen.
4792              * HL (HY | BA) × */
4793 
4794             if (backup_one_LB(strbeg, &temp_pos, utf8_target)
4795                                                           == LB_Hebrew_Letter)
4796             {
4797                 return FALSE;
4798             }
4799 
4800             return LB_table[prev][after] - LB_HY_or_BA_then_foo == LB_BREAKABLE;
4801 
4802         case LB_PR_or_PO_then_OP_or_HY + LB_BREAKABLE:
4803         case LB_PR_or_PO_then_OP_or_HY + LB_NOBREAK:
4804 
4805             /* LB25a (PR | PO) × ( OP | HY )? NU */
4806             if (advance_one_LB(&temp_pos, strend, utf8_target) == LB_Numeric) {
4807                 return FALSE;
4808             }
4809 
4810             return LB_table[prev][after] - LB_PR_or_PO_then_OP_or_HY
4811                                                                 == LB_BREAKABLE;
4812 
4813         case LB_SY_or_IS_then_various + LB_BREAKABLE:
4814         case LB_SY_or_IS_then_various + LB_NOBREAK:
4815         {
4816             /* LB25d NU (SY | IS)* × (NU | SY | IS | CL | CP ) */
4817 
4818             LB_enum temp = prev;
4819             do {
4820                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4821             }
4822             while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric);
4823             if (temp == LB_Numeric) {
4824                 return FALSE;
4825             }
4826 
4827             return LB_table[prev][after] - LB_SY_or_IS_then_various
4828                                                                == LB_BREAKABLE;
4829         }
4830 
4831         case LB_various_then_PO_or_PR + LB_BREAKABLE:
4832         case LB_various_then_PO_or_PR + LB_NOBREAK:
4833         {
4834             /* LB25e NU (SY | IS)* (CL | CP)? × (PO | PR) */
4835 
4836             LB_enum temp = prev;
4837             if (temp == LB_Close_Punctuation || temp == LB_Close_Parenthesis)
4838             {
4839                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4840             }
4841             while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric) {
4842                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4843             }
4844             if (temp == LB_Numeric) {
4845                 return FALSE;
4846             }
4847             return LB_various_then_PO_or_PR;
4848         }
4849 
4850         case LB_RI_then_RI + LB_NOBREAK:
4851         case LB_RI_then_RI + LB_BREAKABLE:
4852             {
4853                 int RI_count = 1;
4854 
4855                 /* LB30a Break between two regional indicator symbols if and
4856                  * only if there are an even number of regional indicators
4857                  * preceding the position of the break.
4858                  *
4859                  *    sot (RI RI)* RI × RI
4860                  *  [^RI] (RI RI)* RI × RI */
4861 
4862                 while (backup_one_LB(strbeg,
4863                                      &temp_pos,
4864                                      utf8_target) == LB_Regional_Indicator)
4865                 {
4866                     RI_count++;
4867                 }
4868 
4869                 return RI_count % 2 == 0;
4870             }
4871 
4872         default:
4873             break;
4874     }
4875 
4876 #ifdef DEBUGGING
4877     Perl_re_printf( aTHX_  "Unhandled LB pair: LB_table[%d, %d] = %d\n",
4878                                   before, after, LB_table[before][after]);
4879     assert(0);
4880 #endif
4881     return TRUE;
4882 }
4883 
4884 STATIC LB_enum
4885 S_advance_one_LB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
4886 {
4887     dVAR;
4888 
4889     LB_enum lb;
4890 
4891     PERL_ARGS_ASSERT_ADVANCE_ONE_LB;
4892 
4893     if (*curpos >= strend) {
4894         return LB_EDGE;
4895     }
4896 
4897     if (utf8_target) {
4898         *curpos += UTF8SKIP(*curpos);
4899         if (*curpos >= strend) {
4900             return LB_EDGE;
4901         }
4902         lb = getLB_VAL_UTF8(*curpos, strend);
4903     }
4904     else {
4905         (*curpos)++;
4906         if (*curpos >= strend) {
4907             return LB_EDGE;
4908         }
4909         lb = getLB_VAL_CP(**curpos);
4910     }
4911 
4912     return lb;
4913 }
4914 
4915 STATIC LB_enum
4916 S_backup_one_LB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
4917 {
4918     dVAR;
4919     LB_enum lb;
4920 
4921     PERL_ARGS_ASSERT_BACKUP_ONE_LB;
4922 
4923     if (*curpos < strbeg) {
4924         return LB_EDGE;
4925     }
4926 
4927     if (utf8_target) {
4928         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
4929         U8 * prev_prev_char_pos;
4930 
4931         if (! prev_char_pos) {
4932             return LB_EDGE;
4933         }
4934 
4935         if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
4936             lb = getLB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
4937             *curpos = prev_char_pos;
4938             prev_char_pos = prev_prev_char_pos;
4939         }
4940         else {
4941             *curpos = (U8 *) strbeg;
4942             return LB_EDGE;
4943         }
4944     }
4945     else {
4946         if (*curpos - 2 < strbeg) {
4947             *curpos = (U8 *) strbeg;
4948             return LB_EDGE;
4949         }
4950         (*curpos)--;
4951         lb = getLB_VAL_CP(*(*curpos - 1));
4952     }
4953 
4954     return lb;
4955 }
4956 
4957 STATIC bool
4958 S_isSB(pTHX_ SB_enum before,
4959              SB_enum after,
4960              const U8 * const strbeg,
4961              const U8 * const curpos,
4962              const U8 * const strend,
4963              const bool utf8_target)
4964 {
4965     /* returns a boolean indicating if there is a Sentence Boundary Break
4966      * between the inputs.  See https://www.unicode.org/reports/tr29/ */
4967 
4968     U8 * lpos = (U8 *) curpos;
4969     bool has_para_sep = FALSE;
4970     bool has_sp = FALSE;
4971 
4972     PERL_ARGS_ASSERT_ISSB;
4973 
4974     /* Break at the start and end of text.
4975         SB1.  sot  ÷
4976         SB2.  ÷  eot
4977       But unstated in Unicode is don't break if the text is empty */
4978     if (before == SB_EDGE || after == SB_EDGE) {
4979         return before != after;
4980     }
4981 
4982     /* SB 3: Do not break within CRLF. */
4983     if (before == SB_CR && after == SB_LF) {
4984         return FALSE;
4985     }
4986 
4987     /* Break after paragraph separators.  CR and LF are considered
4988      * so because Unicode views text as like word processing text where there
4989      * are no newlines except between paragraphs, and the word processor takes
4990      * care of wrapping without there being hard line-breaks in the text *./
4991        SB4.  Sep | CR | LF  ÷ */
4992     if (before == SB_Sep || before == SB_CR || before == SB_LF) {
4993         return TRUE;
4994     }
4995 
4996     /* Ignore Format and Extend characters, except after sot, Sep, CR, or LF.
4997      * (See Section 6.2, Replacing Ignore Rules.)
4998         SB5.  X (Extend | Format)*  →  X */
4999     if (after == SB_Extend || after == SB_Format) {
5000 
5001         /* Implied is that the these characters attach to everything
5002          * immediately prior to them except for those separator-type
5003          * characters.  And the rules earlier have already handled the case
5004          * when one of those immediately precedes the extend char */
5005         return FALSE;
5006     }
5007 
5008     if (before == SB_Extend || before == SB_Format) {
5009         U8 * temp_pos = lpos;
5010         const SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
5011         if (   backup != SB_EDGE
5012             && backup != SB_Sep
5013             && backup != SB_CR
5014             && backup != SB_LF)
5015         {
5016             before = backup;
5017             lpos = temp_pos;
5018         }
5019 
5020         /* Here, both 'before' and 'backup' are these types; implied is that we
5021          * don't break between them */
5022         if (backup == SB_Extend || backup == SB_Format) {
5023             return FALSE;
5024         }
5025     }
5026 
5027     /* Do not break after ambiguous terminators like period, if they are
5028      * immediately followed by a number or lowercase letter, if they are
5029      * between uppercase letters, if the first following letter (optionally
5030      * after certain punctuation) is lowercase, or if they are followed by
5031      * "continuation" punctuation such as comma, colon, or semicolon. For
5032      * example, a period may be an abbreviation or numeric period, and thus may
5033      * not mark the end of a sentence.
5034 
5035      * SB6. ATerm  ×  Numeric */
5036     if (before == SB_ATerm && after == SB_Numeric) {
5037         return FALSE;
5038     }
5039 
5040     /* SB7.  (Upper | Lower) ATerm  ×  Upper */
5041     if (before == SB_ATerm && after == SB_Upper) {
5042         U8 * temp_pos = lpos;
5043         SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
5044         if (backup == SB_Upper || backup == SB_Lower) {
5045             return FALSE;
5046         }
5047     }
5048 
5049     /* The remaining rules that aren't the final one, all require an STerm or
5050      * an ATerm after having backed up over some Close* Sp*, and in one case an
5051      * optional Paragraph separator, although one rule doesn't have any Sp's in it.
5052      * So do that backup now, setting flags if either Sp or a paragraph
5053      * separator are found */
5054 
5055     if (before == SB_Sep || before == SB_CR || before == SB_LF) {
5056         has_para_sep = TRUE;
5057         before = backup_one_SB(strbeg, &lpos, utf8_target);
5058     }
5059 
5060     if (before == SB_Sp) {
5061         has_sp = TRUE;
5062         do {
5063             before = backup_one_SB(strbeg, &lpos, utf8_target);
5064         }
5065         while (before == SB_Sp);
5066     }
5067 
5068     while (before == SB_Close) {
5069         before = backup_one_SB(strbeg, &lpos, utf8_target);
5070     }
5071 
5072     /* The next few rules apply only when the backed-up-to is an ATerm, and in
5073      * most cases an STerm */
5074     if (before == SB_STerm || before == SB_ATerm) {
5075 
5076         /* So, here the lhs matches
5077          *      (STerm | ATerm) Close* Sp* (Sep | CR | LF)?
5078          * and we have set flags if we found an Sp, or the optional Sep,CR,LF.
5079          * The rules that apply here are:
5080          *
5081          * SB8    ATerm Close* Sp*  ×  ( ¬(OLetter | Upper | Lower | Sep | CR
5082                                            | LF | STerm | ATerm) )* Lower
5083            SB8a  (STerm | ATerm) Close* Sp*  ×  (SContinue | STerm | ATerm)
5084            SB9   (STerm | ATerm) Close*  ×  (Close | Sp | Sep | CR | LF)
5085            SB10  (STerm | ATerm) Close* Sp*  ×  (Sp | Sep | CR | LF)
5086            SB11  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  ÷
5087          */
5088 
5089         /* And all but SB11 forbid having seen a paragraph separator */
5090         if (! has_para_sep) {
5091             if (before == SB_ATerm) {          /* SB8 */
5092                 U8 * rpos = (U8 *) curpos;
5093                 SB_enum later = after;
5094 
5095                 while (    later != SB_OLetter
5096                         && later != SB_Upper
5097                         && later != SB_Lower
5098                         && later != SB_Sep
5099                         && later != SB_CR
5100                         && later != SB_LF
5101                         && later != SB_STerm
5102                         && later != SB_ATerm
5103                         && later != SB_EDGE)
5104                 {
5105                     later = advance_one_SB(&rpos, strend, utf8_target);
5106                 }
5107                 if (later == SB_Lower) {
5108                     return FALSE;
5109                 }
5110             }
5111 
5112             if (   after == SB_SContinue    /* SB8a */
5113                 || after == SB_STerm
5114                 || after == SB_ATerm)
5115             {
5116                 return FALSE;
5117             }
5118 
5119             if (! has_sp) {     /* SB9 applies only if there was no Sp* */
5120                 if (   after == SB_Close
5121                     || after == SB_Sp
5122                     || after == SB_Sep
5123                     || after == SB_CR
5124                     || after == SB_LF)
5125                 {
5126                     return FALSE;
5127                 }
5128             }
5129 
5130             /* SB10.  This and SB9 could probably be combined some way, but khw
5131              * has decided to follow the Unicode rule book precisely for
5132              * simplified maintenance */
5133             if (   after == SB_Sp
5134                 || after == SB_Sep
5135                 || after == SB_CR
5136                 || after == SB_LF)
5137             {
5138                 return FALSE;
5139             }
5140         }
5141 
5142         /* SB11.  */
5143         return TRUE;
5144     }
5145 
5146     /* Otherwise, do not break.
5147     SB12.  Any  ×  Any */
5148 
5149     return FALSE;
5150 }
5151 
5152 STATIC SB_enum
5153 S_advance_one_SB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
5154 {
5155     dVAR;
5156     SB_enum sb;
5157 
5158     PERL_ARGS_ASSERT_ADVANCE_ONE_SB;
5159 
5160     if (*curpos >= strend) {
5161         return SB_EDGE;
5162     }
5163 
5164     if (utf8_target) {
5165         do {
5166             *curpos += UTF8SKIP(*curpos);
5167             if (*curpos >= strend) {
5168                 return SB_EDGE;
5169             }
5170             sb = getSB_VAL_UTF8(*curpos, strend);
5171         } while (sb == SB_Extend || sb == SB_Format);
5172     }
5173     else {
5174         do {
5175             (*curpos)++;
5176             if (*curpos >= strend) {
5177                 return SB_EDGE;
5178             }
5179             sb = getSB_VAL_CP(**curpos);
5180         } while (sb == SB_Extend || sb == SB_Format);
5181     }
5182 
5183     return sb;
5184 }
5185 
5186 STATIC SB_enum
5187 S_backup_one_SB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5188 {
5189     dVAR;
5190     SB_enum sb;
5191 
5192     PERL_ARGS_ASSERT_BACKUP_ONE_SB;
5193 
5194     if (*curpos < strbeg) {
5195         return SB_EDGE;
5196     }
5197 
5198     if (utf8_target) {
5199         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5200         if (! prev_char_pos) {
5201             return SB_EDGE;
5202         }
5203 
5204         /* Back up over Extend and Format.  curpos is always just to the right
5205          * of the characater whose value we are getting */
5206         do {
5207             U8 * prev_prev_char_pos;
5208             if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1,
5209                                                                       strbeg)))
5210             {
5211                 sb = getSB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5212                 *curpos = prev_char_pos;
5213                 prev_char_pos = prev_prev_char_pos;
5214             }
5215             else {
5216                 *curpos = (U8 *) strbeg;
5217                 return SB_EDGE;
5218             }
5219         } while (sb == SB_Extend || sb == SB_Format);
5220     }
5221     else {
5222         do {
5223             if (*curpos - 2 < strbeg) {
5224                 *curpos = (U8 *) strbeg;
5225                 return SB_EDGE;
5226             }
5227             (*curpos)--;
5228             sb = getSB_VAL_CP(*(*curpos - 1));
5229         } while (sb == SB_Extend || sb == SB_Format);
5230     }
5231 
5232     return sb;
5233 }
5234 
5235 STATIC bool
5236 S_isWB(pTHX_ WB_enum previous,
5237              WB_enum before,
5238              WB_enum after,
5239              const U8 * const strbeg,
5240              const U8 * const curpos,
5241              const U8 * const strend,
5242              const bool utf8_target)
5243 {
5244     /*  Return a boolean as to if the boundary between 'before' and 'after' is
5245      *  a Unicode word break, using their published algorithm, but tailored for
5246      *  Perl by treating spans of white space as one unit.  Context may be
5247      *  needed to make this determination.  If the value for the character
5248      *  before 'before' is known, it is passed as 'previous'; otherwise that
5249      *  should be set to WB_UNKNOWN.  The other input parameters give the
5250      *  boundaries and current position in the matching of the string.  That
5251      *  is, 'curpos' marks the position where the character whose wb value is
5252      *  'after' begins.  See http://www.unicode.org/reports/tr29/ */
5253 
5254     U8 * before_pos = (U8 *) curpos;
5255     U8 * after_pos = (U8 *) curpos;
5256     WB_enum prev = before;
5257     WB_enum next;
5258 
5259     PERL_ARGS_ASSERT_ISWB;
5260 
5261     /* Rule numbers in the comments below are as of Unicode 9.0 */
5262 
5263   redo:
5264     before = prev;
5265     switch (WB_table[before][after]) {
5266         case WB_BREAKABLE:
5267             return TRUE;
5268 
5269         case WB_NOBREAK:
5270             return FALSE;
5271 
5272         case WB_hs_then_hs:     /* 2 horizontal spaces in a row */
5273             next = advance_one_WB(&after_pos, strend, utf8_target,
5274                                  FALSE /* Don't skip Extend nor Format */ );
5275             /* A space immediately preceeding an Extend or Format is attached
5276              * to by them, and hence gets separated from previous spaces.
5277              * Otherwise don't break between horizontal white space */
5278             return next == WB_Extend || next == WB_Format;
5279 
5280         /* WB4 Ignore Format and Extend characters, except when they appear at
5281          * the beginning of a region of text.  This code currently isn't
5282          * general purpose, but it works as the rules are currently and likely
5283          * to be laid out.  The reason it works is that when 'they appear at
5284          * the beginning of a region of text', the rule is to break before
5285          * them, just like any other character.  Therefore, the default rule
5286          * applies and we don't have to look in more depth.  Should this ever
5287          * change, we would have to have 2 'case' statements, like in the rules
5288          * below, and backup a single character (not spacing over the extend
5289          * ones) and then see if that is one of the region-end characters and
5290          * go from there */
5291         case WB_Ex_or_FO_or_ZWJ_then_foo:
5292             prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5293             goto redo;
5294 
5295         case WB_DQ_then_HL + WB_BREAKABLE:
5296         case WB_DQ_then_HL + WB_NOBREAK:
5297 
5298             /* WB7c  Hebrew_Letter Double_Quote  ×  Hebrew_Letter */
5299 
5300             if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5301                                                             == WB_Hebrew_Letter)
5302             {
5303                 return FALSE;
5304             }
5305 
5306              return WB_table[before][after] - WB_DQ_then_HL == WB_BREAKABLE;
5307 
5308         case WB_HL_then_DQ + WB_BREAKABLE:
5309         case WB_HL_then_DQ + WB_NOBREAK:
5310 
5311             /* WB7b  Hebrew_Letter  ×  Double_Quote Hebrew_Letter */
5312 
5313             if (advance_one_WB(&after_pos, strend, utf8_target,
5314                                        TRUE /* Do skip Extend and Format */ )
5315                                                             == WB_Hebrew_Letter)
5316             {
5317                 return FALSE;
5318             }
5319 
5320             return WB_table[before][after] - WB_HL_then_DQ == WB_BREAKABLE;
5321 
5322         case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_NOBREAK:
5323         case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_BREAKABLE:
5324 
5325             /* WB6  (ALetter | Hebrew_Letter)  ×  (MidLetter | MidNumLet
5326              *       | Single_Quote) (ALetter | Hebrew_Letter) */
5327 
5328             next = advance_one_WB(&after_pos, strend, utf8_target,
5329                                        TRUE /* Do skip Extend and Format */ );
5330 
5331             if (next == WB_ALetter || next == WB_Hebrew_Letter)
5332             {
5333                 return FALSE;
5334             }
5335 
5336             return WB_table[before][after]
5337                             - WB_LE_or_HL_then_MB_or_ML_or_SQ == WB_BREAKABLE;
5338 
5339         case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_NOBREAK:
5340         case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_BREAKABLE:
5341 
5342             /* WB7  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet
5343              *       | Single_Quote)  ×  (ALetter | Hebrew_Letter) */
5344 
5345             prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5346             if (prev == WB_ALetter || prev == WB_Hebrew_Letter)
5347             {
5348                 return FALSE;
5349             }
5350 
5351             return WB_table[before][after]
5352                             - WB_MB_or_ML_or_SQ_then_LE_or_HL == WB_BREAKABLE;
5353 
5354         case WB_MB_or_MN_or_SQ_then_NU + WB_NOBREAK:
5355         case WB_MB_or_MN_or_SQ_then_NU + WB_BREAKABLE:
5356 
5357             /* WB11  Numeric (MidNum | (MidNumLet | Single_Quote))  ×  Numeric
5358              * */
5359 
5360             if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5361                                                             == WB_Numeric)
5362             {
5363                 return FALSE;
5364             }
5365 
5366             return WB_table[before][after]
5367                                 - WB_MB_or_MN_or_SQ_then_NU == WB_BREAKABLE;
5368 
5369         case WB_NU_then_MB_or_MN_or_SQ + WB_NOBREAK:
5370         case WB_NU_then_MB_or_MN_or_SQ + WB_BREAKABLE:
5371 
5372             /* WB12  Numeric  ×  (MidNum | MidNumLet | Single_Quote) Numeric */
5373 
5374             if (advance_one_WB(&after_pos, strend, utf8_target,
5375                                        TRUE /* Do skip Extend and Format */ )
5376                                                             == WB_Numeric)
5377             {
5378                 return FALSE;
5379             }
5380 
5381             return WB_table[before][after]
5382                                 - WB_NU_then_MB_or_MN_or_SQ == WB_BREAKABLE;
5383 
5384         case WB_RI_then_RI + WB_NOBREAK:
5385         case WB_RI_then_RI + WB_BREAKABLE:
5386             {
5387                 int RI_count = 1;
5388 
5389                 /* Do not break within emoji flag sequences. That is, do not
5390                  * break between regional indicator (RI) symbols if there is an
5391                  * odd number of RI characters before the potential break
5392                  * point.
5393                  *
5394                  * WB15   sot (RI RI)* RI × RI
5395                  * WB16 [^RI] (RI RI)* RI × RI */
5396 
5397                 while (backup_one_WB(&previous,
5398                                      strbeg,
5399                                      &before_pos,
5400                                      utf8_target) == WB_Regional_Indicator)
5401                 {
5402                     RI_count++;
5403                 }
5404 
5405                 return RI_count % 2 != 1;
5406             }
5407 
5408         default:
5409             break;
5410     }
5411 
5412 #ifdef DEBUGGING
5413     Perl_re_printf( aTHX_  "Unhandled WB pair: WB_table[%d, %d] = %d\n",
5414                                   before, after, WB_table[before][after]);
5415     assert(0);
5416 #endif
5417     return TRUE;
5418 }
5419 
5420 STATIC WB_enum
5421 S_advance_one_WB(pTHX_ U8 ** curpos,
5422                        const U8 * const strend,
5423                        const bool utf8_target,
5424                        const bool skip_Extend_Format)
5425 {
5426     dVAR;
5427     WB_enum wb;
5428 
5429     PERL_ARGS_ASSERT_ADVANCE_ONE_WB;
5430 
5431     if (*curpos >= strend) {
5432         return WB_EDGE;
5433     }
5434 
5435     if (utf8_target) {
5436 
5437         /* Advance over Extend and Format */
5438         do {
5439             *curpos += UTF8SKIP(*curpos);
5440             if (*curpos >= strend) {
5441                 return WB_EDGE;
5442             }
5443             wb = getWB_VAL_UTF8(*curpos, strend);
5444         } while (    skip_Extend_Format
5445                  && (wb == WB_Extend || wb == WB_Format));
5446     }
5447     else {
5448         do {
5449             (*curpos)++;
5450             if (*curpos >= strend) {
5451                 return WB_EDGE;
5452             }
5453             wb = getWB_VAL_CP(**curpos);
5454         } while (    skip_Extend_Format
5455                  && (wb == WB_Extend || wb == WB_Format));
5456     }
5457 
5458     return wb;
5459 }
5460 
5461 STATIC WB_enum
5462 S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5463 {
5464     dVAR;
5465     WB_enum wb;
5466 
5467     PERL_ARGS_ASSERT_BACKUP_ONE_WB;
5468 
5469     /* If we know what the previous character's break value is, don't have
5470         * to look it up */
5471     if (*previous != WB_UNKNOWN) {
5472         wb = *previous;
5473 
5474         /* But we need to move backwards by one */
5475         if (utf8_target) {
5476             *curpos = reghopmaybe3(*curpos, -1, strbeg);
5477             if (! *curpos) {
5478                 *previous = WB_EDGE;
5479                 *curpos = (U8 *) strbeg;
5480             }
5481             else {
5482                 *previous = WB_UNKNOWN;
5483             }
5484         }
5485         else {
5486             (*curpos)--;
5487             *previous = (*curpos <= strbeg) ? WB_EDGE : WB_UNKNOWN;
5488         }
5489 
5490         /* And we always back up over these three types */
5491         if (wb != WB_Extend && wb != WB_Format && wb != WB_ZWJ) {
5492             return wb;
5493         }
5494     }
5495 
5496     if (*curpos < strbeg) {
5497         return WB_EDGE;
5498     }
5499 
5500     if (utf8_target) {
5501         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5502         if (! prev_char_pos) {
5503             return WB_EDGE;
5504         }
5505 
5506         /* Back up over Extend and Format.  curpos is always just to the right
5507          * of the characater whose value we are getting */
5508         do {
5509             U8 * prev_prev_char_pos;
5510             if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos,
5511                                                    -1,
5512                                                    strbeg)))
5513             {
5514                 wb = getWB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5515                 *curpos = prev_char_pos;
5516                 prev_char_pos = prev_prev_char_pos;
5517             }
5518             else {
5519                 *curpos = (U8 *) strbeg;
5520                 return WB_EDGE;
5521             }
5522         } while (wb == WB_Extend || wb == WB_Format || wb == WB_ZWJ);
5523     }
5524     else {
5525         do {
5526             if (*curpos - 2 < strbeg) {
5527                 *curpos = (U8 *) strbeg;
5528                 return WB_EDGE;
5529             }
5530             (*curpos)--;
5531             wb = getWB_VAL_CP(*(*curpos - 1));
5532         } while (wb == WB_Extend || wb == WB_Format);
5533     }
5534 
5535     return wb;
5536 }
5537 
5538 /* Macros for regmatch(), using its internal variables */
5539 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
5540 #define NEXTCHR_IS_EOS (nextchr < 0)
5541 
5542 #define SET_nextchr \
5543     nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
5544 
5545 #define SET_locinput(p) \
5546     locinput = (p);  \
5547     SET_nextchr
5548 
5549 #define sayYES goto yes
5550 #define sayNO goto no
5551 #define sayNO_SILENT goto no_silent
5552 
5553 /* we dont use STMT_START/END here because it leads to
5554    "unreachable code" warnings, which are bogus, but distracting. */
5555 #define CACHEsayNO \
5556     if (ST.cache_mask) \
5557        reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
5558     sayNO
5559 
5560 #define EVAL_CLOSE_PAREN_IS(st,expr)                        \
5561 (                                                           \
5562     (   ( st )                                         ) && \
5563     (   ( st )->u.eval.close_paren                     ) && \
5564     ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) )    \
5565 )
5566 
5567 #define EVAL_CLOSE_PAREN_IS_TRUE(st,expr)                   \
5568 (                                                           \
5569     (   ( st )                                         ) && \
5570     (   ( st )->u.eval.close_paren                     ) && \
5571     (   ( expr )                                       ) && \
5572     ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) )    \
5573 )
5574 
5575 
5576 #define EVAL_CLOSE_PAREN_SET(st,expr) \
5577     (st)->u.eval.close_paren = ( (expr) + 1 )
5578 
5579 #define EVAL_CLOSE_PAREN_CLEAR(st) \
5580     (st)->u.eval.close_paren = 0
5581 
5582 /* push a new state then goto it */
5583 
5584 #define PUSH_STATE_GOTO(state, node, input, eol, sr0)       \
5585     pushinput = input; \
5586     pusheol = eol; \
5587     pushsr0 = sr0; \
5588     scan = node; \
5589     st->resume_state = state; \
5590     goto push_state;
5591 
5592 /* push a new state with success backtracking, then goto it */
5593 
5594 #define PUSH_YES_STATE_GOTO(state, node, input, eol, sr0)   \
5595     pushinput = input; \
5596     pusheol = eol;     \
5597     pushsr0 = sr0; \
5598     scan = node; \
5599     st->resume_state = state; \
5600     goto push_yes_state;
5601 
5602 #define DEBUG_STATE_pp(pp)                                  \
5603     DEBUG_STATE_r({                                         \
5604         DUMP_EXEC_POS(locinput, scan, utf8_target,depth);   \
5605         Perl_re_printf( aTHX_                               \
5606             "%*s" pp " %s%s%s%s%s\n",                       \
5607             INDENT_CHARS(depth), "",                        \
5608             PL_reg_name[st->resume_state],                  \
5609             ((st==yes_state||st==mark_state) ? "[" : ""),   \
5610             ((st==yes_state) ? "Y" : ""),                   \
5611             ((st==mark_state) ? "M" : ""),                  \
5612             ((st==yes_state||st==mark_state) ? "]" : "")    \
5613         );                                                  \
5614     });
5615 
5616 /*
5617 
5618 regmatch() - main matching routine
5619 
5620 This is basically one big switch statement in a loop. We execute an op,
5621 set 'next' to point the next op, and continue. If we come to a point which
5622 we may need to backtrack to on failure such as (A|B|C), we push a
5623 backtrack state onto the backtrack stack. On failure, we pop the top
5624 state, and re-enter the loop at the state indicated. If there are no more
5625 states to pop, we return failure.
5626 
5627 Sometimes we also need to backtrack on success; for example /A+/, where
5628 after successfully matching one A, we need to go back and try to
5629 match another one; similarly for lookahead assertions: if the assertion
5630 completes successfully, we backtrack to the state just before the assertion
5631 and then carry on.  In these cases, the pushed state is marked as
5632 'backtrack on success too'. This marking is in fact done by a chain of
5633 pointers, each pointing to the previous 'yes' state. On success, we pop to
5634 the nearest yes state, discarding any intermediate failure-only states.
5635 Sometimes a yes state is pushed just to force some cleanup code to be
5636 called at the end of a successful match or submatch; e.g. (??{$re}) uses
5637 it to free the inner regex.
5638 
5639 Note that failure backtracking rewinds the cursor position, while
5640 success backtracking leaves it alone.
5641 
5642 A pattern is complete when the END op is executed, while a subpattern
5643 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
5644 ops trigger the "pop to last yes state if any, otherwise return true"
5645 behaviour.
5646 
5647 A common convention in this function is to use A and B to refer to the two
5648 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
5649 the subpattern to be matched possibly multiple times, while B is the entire
5650 rest of the pattern. Variable and state names reflect this convention.
5651 
5652 The states in the main switch are the union of ops and failure/success of
5653 substates associated with that op.  For example, IFMATCH is the op
5654 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
5655 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
5656 successfully matched A and IFMATCH_A_fail is a state saying that we have
5657 just failed to match A. Resume states always come in pairs. The backtrack
5658 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
5659 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
5660 on success or failure.
5661 
5662 The struct that holds a backtracking state is actually a big union, with
5663 one variant for each major type of op. The variable st points to the
5664 top-most backtrack struct. To make the code clearer, within each
5665 block of code we #define ST to alias the relevant union.
5666 
5667 Here's a concrete example of a (vastly oversimplified) IFMATCH
5668 implementation:
5669 
5670     switch (state) {
5671     ....
5672 
5673 #define ST st->u.ifmatch
5674 
5675     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
5676 	ST.foo = ...; // some state we wish to save
5677 	...
5678 	// push a yes backtrack state with a resume value of
5679 	// IFMATCH_A/IFMATCH_A_fail, then continue execution at the
5680 	// first node of A:
5681 	PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
5682 	// NOTREACHED
5683 
5684     case IFMATCH_A: // we have successfully executed A; now continue with B
5685 	next = B;
5686 	bar = ST.foo; // do something with the preserved value
5687 	break;
5688 
5689     case IFMATCH_A_fail: // A failed, so the assertion failed
5690 	...;   // do some housekeeping, then ...
5691 	sayNO; // propagate the failure
5692 
5693 #undef ST
5694 
5695     ...
5696     }
5697 
5698 For any old-timers reading this who are familiar with the old recursive
5699 approach, the code above is equivalent to:
5700 
5701     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
5702     {
5703 	int foo = ...
5704 	...
5705 	if (regmatch(A)) {
5706 	    next = B;
5707 	    bar = foo;
5708 	    break;
5709 	}
5710 	...;   // do some housekeeping, then ...
5711 	sayNO; // propagate the failure
5712     }
5713 
5714 The topmost backtrack state, pointed to by st, is usually free. If you
5715 want to claim it, populate any ST.foo fields in it with values you wish to
5716 save, then do one of
5717 
5718 	PUSH_STATE_GOTO(resume_state, node, newinput, new_eol);
5719 	PUSH_YES_STATE_GOTO(resume_state, node, newinput, new_eol);
5720 
5721 which sets that backtrack state's resume value to 'resume_state', pushes a
5722 new free entry to the top of the backtrack stack, then goes to 'node'.
5723 On backtracking, the free slot is popped, and the saved state becomes the
5724 new free state. An ST.foo field in this new top state can be temporarily
5725 accessed to retrieve values, but once the main loop is re-entered, it
5726 becomes available for reuse.
5727 
5728 Note that the depth of the backtrack stack constantly increases during the
5729 left-to-right execution of the pattern, rather than going up and down with
5730 the pattern nesting. For example the stack is at its maximum at Z at the
5731 end of the pattern, rather than at X in the following:
5732 
5733     /(((X)+)+)+....(Y)+....Z/
5734 
5735 The only exceptions to this are lookahead/behind assertions and the cut,
5736 (?>A), which pop all the backtrack states associated with A before
5737 continuing.
5738 
5739 Backtrack state structs are allocated in slabs of about 4K in size.
5740 PL_regmatch_state and st always point to the currently active state,
5741 and PL_regmatch_slab points to the slab currently containing
5742 PL_regmatch_state.  The first time regmatch() is called, the first slab is
5743 allocated, and is never freed until interpreter destruction. When the slab
5744 is full, a new one is allocated and chained to the end. At exit from
5745 regmatch(), slabs allocated since entry are freed.
5746 
5747 In order to work with variable length lookbehinds, an upper limit is placed on
5748 lookbehinds which is set to where the match position is at the end of where the
5749 lookbehind would get to.  Nothing in the lookbehind should match above that,
5750 except we should be able to look beyond if for things like \b, which need the
5751 next character in the string to be able to determine if this is a boundary or
5752 not.  We also can't match the end of string/line unless we are also at the end
5753 of the entire string, so NEXTCHR_IS_EOS remains the same, and for those OPs
5754 that match a width, we have to add a condition that they are within the legal
5755 bounds of our window into the string.
5756 
5757 */
5758 
5759 /* returns -1 on failure, $+[0] on success */
5760 STATIC SSize_t
5761 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
5762 {
5763     dVAR;
5764     const bool utf8_target = reginfo->is_utf8_target;
5765     const U32 uniflags = UTF8_ALLOW_DEFAULT;
5766     REGEXP *rex_sv = reginfo->prog;
5767     regexp *rex = ReANY(rex_sv);
5768     RXi_GET_DECL(rex,rexi);
5769     /* the current state. This is a cached copy of PL_regmatch_state */
5770     regmatch_state *st;
5771     /* cache heavy used fields of st in registers */
5772     regnode *scan;
5773     regnode *next;
5774     U32 n = 0;	/* general value; init to avoid compiler warning */
5775     SSize_t ln = 0; /* len or last;  init to avoid compiler warning */
5776     SSize_t endref = 0; /* offset of end of backref when ln is start */
5777     char *locinput = startpos;
5778     char *loceol = reginfo->strend;
5779     char *pushinput; /* where to continue after a PUSH */
5780     char *pusheol;   /* where to stop matching (loceol) after a PUSH */
5781     U8   *pushsr0;   /* save starting pos of script run */
5782     I32 nextchr;   /* is always set to UCHARAT(locinput), or -1 at EOS */
5783 
5784     bool result = 0;	    /* return value of S_regmatch */
5785     U32 depth = 0;            /* depth of backtrack stack */
5786     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
5787     const U32 max_nochange_depth =
5788         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
5789         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
5790     regmatch_state *yes_state = NULL; /* state to pop to on success of
5791 							    subpattern */
5792     /* mark_state piggy backs on the yes_state logic so that when we unwind
5793        the stack on success we can update the mark_state as we go */
5794     regmatch_state *mark_state = NULL; /* last mark state we have seen */
5795     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
5796     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
5797     U32 state_num;
5798     bool no_final = 0;      /* prevent failure from backtracking? */
5799     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
5800     char *startpoint = locinput;
5801     SV *popmark = NULL;     /* are we looking for a mark? */
5802     SV *sv_commit = NULL;   /* last mark name seen in failure */
5803     SV *sv_yes_mark = NULL; /* last mark name we have seen
5804                                during a successful match */
5805     U32 lastopen = 0;       /* last open we saw */
5806     bool has_cutgroup = RXp_HAS_CUTGROUP(rex) ? 1 : 0;
5807     SV* const oreplsv = GvSVn(PL_replgv);
5808     /* these three flags are set by various ops to signal information to
5809      * the very next op. They have a useful lifetime of exactly one loop
5810      * iteration, and are not preserved or restored by state pushes/pops
5811      */
5812     bool sw = 0;	    /* the condition value in (?(cond)a|b) */
5813     bool minmod = 0;	    /* the next "{n,m}" is a "{n,m}?" */
5814     int logical = 0;	    /* the following EVAL is:
5815 				0: (?{...})
5816 				1: (?(?{...})X|Y)
5817 				2: (??{...})
5818 			       or the following IFMATCH/UNLESSM is:
5819 			        false: plain (?=foo)
5820 				true:  used as a condition: (?(?=foo))
5821 			    */
5822     PAD* last_pad = NULL;
5823     dMULTICALL;
5824     U8 gimme = G_SCALAR;
5825     CV *caller_cv = NULL;	/* who called us */
5826     CV *last_pushed_cv = NULL;	/* most recently called (?{}) CV */
5827     U32 maxopenparen = 0;       /* max '(' index seen so far */
5828     int to_complement;  /* Invert the result? */
5829     _char_class_number classnum;
5830     bool is_utf8_pat = reginfo->is_utf8_pat;
5831     bool match = FALSE;
5832     I32 orig_savestack_ix = PL_savestack_ix;
5833     U8 * script_run_begin = NULL;
5834 
5835 /* Solaris Studio 12.3 messes up fetching PL_charclass['\n'] */
5836 #if (defined(__SUNPRO_C) && (__SUNPRO_C == 0x5120) && defined(__x86_64) && defined(USE_64_BIT_ALL))
5837 #  define SOLARIS_BAD_OPTIMIZER
5838     const U32 *pl_charclass_dup = PL_charclass;
5839 #  define PL_charclass pl_charclass_dup
5840 #endif
5841 
5842 #ifdef DEBUGGING
5843     DECLARE_AND_GET_RE_DEBUG_FLAGS;
5844 #endif
5845 
5846     /* protect against undef(*^R) */
5847     SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv));
5848 
5849     /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
5850     multicall_oldcatch = 0;
5851     PERL_UNUSED_VAR(multicall_cop);
5852 
5853     PERL_ARGS_ASSERT_REGMATCH;
5854 
5855     st = PL_regmatch_state;
5856 
5857     /* Note that nextchr is a byte even in UTF */
5858     SET_nextchr;
5859     scan = prog;
5860 
5861     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
5862             DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
5863             Perl_re_printf( aTHX_ "regmatch start\n" );
5864     }));
5865 
5866     while (scan != NULL) {
5867 	next = scan + NEXT_OFF(scan);
5868 	if (next == scan)
5869 	    next = NULL;
5870 	state_num = OP(scan);
5871 
5872       reenter_switch:
5873         DEBUG_EXECUTE_r(
5874             if (state_num <= REGNODE_MAX) {
5875                 SV * const prop = sv_newmortal();
5876                 regnode *rnext = regnext(scan);
5877 
5878                 DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
5879                 regprop(rex, prop, scan, reginfo, NULL);
5880                 Perl_re_printf( aTHX_
5881                     "%*s%" IVdf ":%s(%" IVdf ")\n",
5882                     INDENT_CHARS(depth), "",
5883                     (IV)(scan - rexi->program),
5884                     SvPVX_const(prop),
5885                     (PL_regkind[OP(scan)] == END || !rnext) ?
5886                         0 : (IV)(rnext - rexi->program));
5887             }
5888         );
5889 
5890         to_complement = 0;
5891 
5892         SET_nextchr;
5893         assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
5894 
5895 	switch (state_num) {
5896 	case SBOL: /*  /^../ and /\A../  */
5897 	    if (locinput == reginfo->strbeg)
5898 		break;
5899 	    sayNO;
5900 
5901 	case MBOL: /*  /^../m  */
5902 	    if (locinput == reginfo->strbeg ||
5903 		(!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
5904 	    {
5905 		break;
5906 	    }
5907 	    sayNO;
5908 
5909 	case GPOS: /*  \G  */
5910 	    if (locinput == reginfo->ganch)
5911 		break;
5912 	    sayNO;
5913 
5914 	case KEEPS: /*   \K  */
5915 	    /* update the startpoint */
5916 	    st->u.keeper.val = rex->offs[0].start;
5917 	    rex->offs[0].start = locinput - reginfo->strbeg;
5918 	    PUSH_STATE_GOTO(KEEPS_next, next, locinput, loceol,
5919                             script_run_begin);
5920 	    NOT_REACHED; /* NOTREACHED */
5921 
5922 	case KEEPS_next_fail:
5923 	    /* rollback the start point change */
5924 	    rex->offs[0].start = st->u.keeper.val;
5925 	    sayNO_SILENT;
5926 	    NOT_REACHED; /* NOTREACHED */
5927 
5928 	case MEOL: /* /..$/m  */
5929 	    if (!NEXTCHR_IS_EOS && nextchr != '\n')
5930 		sayNO;
5931 	    break;
5932 
5933 	case SEOL: /* /..$/  */
5934 	    if (!NEXTCHR_IS_EOS && nextchr != '\n')
5935 		sayNO;
5936 	    if (reginfo->strend - locinput > 1)
5937 		sayNO;
5938 	    break;
5939 
5940 	case EOS: /*  \z  */
5941 	    if (!NEXTCHR_IS_EOS)
5942 		sayNO;
5943 	    break;
5944 
5945 	case SANY: /*  /./s  */
5946 	    if (NEXTCHR_IS_EOS || locinput >= loceol)
5947 		sayNO;
5948             goto increment_locinput;
5949 
5950 	case REG_ANY: /*  /./  */
5951 	    if (   NEXTCHR_IS_EOS
5952                 || locinput >= loceol
5953                 || nextchr == '\n')
5954             {
5955 		sayNO;
5956             }
5957             goto increment_locinput;
5958 
5959 
5960 #undef  ST
5961 #define ST st->u.trie
5962         case TRIEC: /* (ab|cd) with known charclass */
5963             /* In this case the charclass data is available inline so
5964                we can fail fast without a lot of extra overhead.
5965              */
5966             if ( !   NEXTCHR_IS_EOS
5967                 &&   locinput < loceol
5968                 && ! ANYOF_BITMAP_TEST(scan, nextchr))
5969             {
5970                 DEBUG_EXECUTE_r(
5971                     Perl_re_exec_indentf( aTHX_  "%sTRIE: failed to match trie start class...%s\n",
5972                               depth, PL_colors[4], PL_colors[5])
5973                 );
5974                 sayNO_SILENT;
5975                 NOT_REACHED; /* NOTREACHED */
5976             }
5977             /* FALLTHROUGH */
5978 	case TRIE:  /* (ab|cd)  */
5979 	    /* the basic plan of execution of the trie is:
5980 	     * At the beginning, run though all the states, and
5981 	     * find the longest-matching word. Also remember the position
5982 	     * of the shortest matching word. For example, this pattern:
5983 	     *    1  2 3 4    5
5984 	     *    ab|a|x|abcd|abc
5985 	     * when matched against the string "abcde", will generate
5986 	     * accept states for all words except 3, with the longest
5987 	     * matching word being 4, and the shortest being 2 (with
5988 	     * the position being after char 1 of the string).
5989 	     *
5990 	     * Then for each matching word, in word order (i.e. 1,2,4,5),
5991 	     * we run the remainder of the pattern; on each try setting
5992 	     * the current position to the character following the word,
5993 	     * returning to try the next word on failure.
5994 	     *
5995 	     * We avoid having to build a list of words at runtime by
5996 	     * using a compile-time structure, wordinfo[].prev, which
5997 	     * gives, for each word, the previous accepting word (if any).
5998 	     * In the case above it would contain the mappings 1->2, 2->0,
5999 	     * 3->0, 4->5, 5->1.  We can use this table to generate, from
6000 	     * the longest word (4 above), a list of all words, by
6001 	     * following the list of prev pointers; this gives us the
6002 	     * unordered list 4,5,1,2. Then given the current word we have
6003 	     * just tried, we can go through the list and find the
6004 	     * next-biggest word to try (so if we just failed on word 2,
6005 	     * the next in the list is 4).
6006 	     *
6007 	     * Since at runtime we don't record the matching position in
6008 	     * the string for each word, we have to work that out for
6009 	     * each word we're about to process. The wordinfo table holds
6010 	     * the character length of each word; given that we recorded
6011 	     * at the start: the position of the shortest word and its
6012 	     * length in chars, we just need to move the pointer the
6013 	     * difference between the two char lengths. Depending on
6014 	     * Unicode status and folding, that's cheap or expensive.
6015 	     *
6016 	     * This algorithm is optimised for the case where are only a
6017 	     * small number of accept states, i.e. 0,1, or maybe 2.
6018 	     * With lots of accepts states, and having to try all of them,
6019 	     * it becomes quadratic on number of accept states to find all
6020 	     * the next words.
6021 	     */
6022 
6023 	    {
6024                 /* what type of TRIE am I? (utf8 makes this contextual) */
6025                 DECL_TRIE_TYPE(scan);
6026 
6027                 /* what trie are we using right now */
6028 		reg_trie_data * const trie
6029         	    = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
6030 		HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
6031                 U32 state = trie->startstate;
6032 
6033                 if (scan->flags == EXACTL || scan->flags == EXACTFLU8) {
6034                     _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6035                     if (utf8_target
6036                         && ! NEXTCHR_IS_EOS
6037                         && UTF8_IS_ABOVE_LATIN1(nextchr)
6038                         && scan->flags == EXACTL)
6039                     {
6040                         /* We only output for EXACTL, as we let the folder
6041                          * output this message for EXACTFLU8 to avoid
6042                          * duplication */
6043                         _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
6044                                                                reginfo->strend);
6045                     }
6046                 }
6047                 if (   trie->bitmap
6048                     && (     NEXTCHR_IS_EOS
6049                         ||   locinput >= loceol
6050                         || ! TRIE_BITMAP_TEST(trie, nextchr)))
6051                 {
6052         	    if (trie->states[ state ].wordnum) {
6053         	         DEBUG_EXECUTE_r(
6054                             Perl_re_exec_indentf( aTHX_  "%sTRIE: matched empty string...%s\n",
6055                                           depth, PL_colors[4], PL_colors[5])
6056                         );
6057 			if (!trie->jump)
6058 			    break;
6059         	    } else {
6060         	        DEBUG_EXECUTE_r(
6061                             Perl_re_exec_indentf( aTHX_  "%sTRIE: failed to match trie start class...%s\n",
6062                                           depth, PL_colors[4], PL_colors[5])
6063                         );
6064         	        sayNO_SILENT;
6065         	   }
6066                 }
6067 
6068             {
6069 		U8 *uc = ( U8* )locinput;
6070 
6071 		STRLEN len = 0;
6072 		STRLEN foldlen = 0;
6073 		U8 *uscan = (U8*)NULL;
6074 		U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
6075 		U32 charcount = 0; /* how many input chars we have matched */
6076 		U32 accepted = 0; /* have we seen any accepting states? */
6077 
6078 		ST.jump = trie->jump;
6079 		ST.me = scan;
6080 		ST.firstpos = NULL;
6081 		ST.longfold = FALSE; /* char longer if folded => it's harder */
6082 		ST.nextword = 0;
6083 
6084 		/* fully traverse the TRIE; note the position of the
6085 		   shortest accept state and the wordnum of the longest
6086 		   accept state */
6087 
6088 		while ( state && uc <= (U8*)(loceol) ) {
6089                     U32 base = trie->states[ state ].trans.base;
6090                     UV uvc = 0;
6091                     U16 charid = 0;
6092 		    U16 wordnum;
6093                     wordnum = trie->states[ state ].wordnum;
6094 
6095 		    if (wordnum) { /* it's an accept state */
6096 			if (!accepted) {
6097 			    accepted = 1;
6098 			    /* record first match position */
6099 			    if (ST.longfold) {
6100 				ST.firstpos = (U8*)locinput;
6101 				ST.firstchars = 0;
6102 			    }
6103 			    else {
6104 				ST.firstpos = uc;
6105 				ST.firstchars = charcount;
6106 			    }
6107 			}
6108 			if (!ST.nextword || wordnum < ST.nextword)
6109 			    ST.nextword = wordnum;
6110 			ST.topword = wordnum;
6111 		    }
6112 
6113 		    DEBUG_TRIE_EXECUTE_r({
6114                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target, depth );
6115                                 /* HERE */
6116                                 PerlIO_printf( Perl_debug_log,
6117                                     "%*s%sTRIE: State: %4" UVxf " Accepted: %c ",
6118                                     INDENT_CHARS(depth), "", PL_colors[4],
6119 			            (UV)state, (accepted ? 'Y' : 'N'));
6120 		    });
6121 
6122 		    /* read a char and goto next state */
6123 		    if ( base && (foldlen || uc < (U8*)(loceol))) {
6124 			I32 offset;
6125 			REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
6126                                              (U8 *) loceol, uscan,
6127                                              len, uvc, charid, foldlen,
6128                                              foldbuf, uniflags);
6129 			charcount++;
6130 			if (foldlen>0)
6131 			    ST.longfold = TRUE;
6132 			if (charid &&
6133 			     ( ((offset =
6134 			      base + charid - 1 - trie->uniquecharcount)) >= 0)
6135 
6136 			     && ((U32)offset < trie->lasttrans)
6137 			     && trie->trans[offset].check == state)
6138 			{
6139 			    state = trie->trans[offset].next;
6140 			}
6141 			else {
6142 			    state = 0;
6143 			}
6144 			uc += len;
6145 
6146 		    }
6147 		    else {
6148 			state = 0;
6149 		    }
6150 		    DEBUG_TRIE_EXECUTE_r(
6151                         Perl_re_printf( aTHX_
6152 		            "TRIE: Charid:%3x CP:%4" UVxf " After State: %4" UVxf "%s\n",
6153 		            charid, uvc, (UV)state, PL_colors[5] );
6154 		    );
6155 		}
6156 		if (!accepted)
6157 		   sayNO;
6158 
6159 		/* calculate total number of accept states */
6160 		{
6161 		    U16 w = ST.topword;
6162 		    accepted = 0;
6163 		    while (w) {
6164 			w = trie->wordinfo[w].prev;
6165 			accepted++;
6166 		    }
6167 		    ST.accepted = accepted;
6168 		}
6169 
6170 		DEBUG_EXECUTE_r(
6171                     Perl_re_exec_indentf( aTHX_  "%sTRIE: got %" IVdf " possible matches%s\n",
6172                         depth,
6173 			PL_colors[4], (IV)ST.accepted, PL_colors[5] );
6174 		);
6175 		goto trie_first_try; /* jump into the fail handler */
6176 	    }}
6177 	    NOT_REACHED; /* NOTREACHED */
6178 
6179 	case TRIE_next_fail: /* we failed - try next alternative */
6180         {
6181             U8 *uc;
6182             if ( ST.jump ) {
6183                 /* undo any captures done in the tail part of a branch,
6184                  * e.g.
6185                  *    /(?:X(.)(.)|Y(.)).../
6186                  * where the trie just matches X then calls out to do the
6187                  * rest of the branch */
6188                 REGCP_UNWIND(ST.cp);
6189                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6190 	    }
6191 	    if (!--ST.accepted) {
6192 	        DEBUG_EXECUTE_r({
6193                     Perl_re_exec_indentf( aTHX_  "%sTRIE failed...%s\n",
6194                         depth,
6195 			PL_colors[4],
6196 			PL_colors[5] );
6197 		});
6198 		sayNO_SILENT;
6199 	    }
6200 	    {
6201 		/* Find next-highest word to process.  Note that this code
6202 		 * is O(N^2) per trie run (O(N) per branch), so keep tight */
6203 		U16 min = 0;
6204 		U16 word;
6205 		U16 const nextword = ST.nextword;
6206 		reg_trie_wordinfo * const wordinfo
6207 		    = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
6208 		for (word=ST.topword; word; word=wordinfo[word].prev) {
6209 		    if (word > nextword && (!min || word < min))
6210 			min = word;
6211 		}
6212 		ST.nextword = min;
6213 	    }
6214 
6215           trie_first_try:
6216             if (do_cutgroup) {
6217                 do_cutgroup = 0;
6218                 no_final = 0;
6219             }
6220 
6221             if ( ST.jump ) {
6222                 ST.lastparen = rex->lastparen;
6223                 ST.lastcloseparen = rex->lastcloseparen;
6224 	        REGCP_SET(ST.cp);
6225             }
6226 
6227 	    /* find start char of end of current word */
6228 	    {
6229 		U32 chars; /* how many chars to skip */
6230 		reg_trie_data * const trie
6231 		    = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
6232 
6233 		assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
6234 			    >=  ST.firstchars);
6235 		chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
6236 			    - ST.firstchars;
6237 		uc = ST.firstpos;
6238 
6239 		if (ST.longfold) {
6240 		    /* the hard option - fold each char in turn and find
6241 		     * its folded length (which may be different */
6242 		    U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
6243 		    STRLEN foldlen;
6244 		    STRLEN len;
6245 		    UV uvc;
6246 		    U8 *uscan;
6247 
6248 		    while (chars) {
6249 			if (utf8_target) {
6250                             /* XXX This assumes the length is well-formed, as
6251                              * does the UTF8SKIP below */
6252 			    uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
6253 						    uniflags);
6254 			    uc += len;
6255 			}
6256 			else {
6257 			    uvc = *uc;
6258 			    uc++;
6259 			}
6260 			uvc = to_uni_fold(uvc, foldbuf, &foldlen);
6261 			uscan = foldbuf;
6262 			while (foldlen) {
6263 			    if (!--chars)
6264 				break;
6265 			    uvc = utf8n_to_uvchr(uscan, foldlen, &len,
6266                                                  uniflags);
6267 			    uscan += len;
6268 			    foldlen -= len;
6269 			}
6270 		    }
6271 		}
6272 		else {
6273 		    if (utf8_target)
6274 			while (chars--)
6275 			    uc += UTF8SKIP(uc);
6276 		    else
6277 			uc += chars;
6278 		}
6279 	    }
6280 
6281 	    scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
6282 			    ? ST.jump[ST.nextword]
6283 			    : NEXT_OFF(ST.me));
6284 
6285 	    DEBUG_EXECUTE_r({
6286                 Perl_re_exec_indentf( aTHX_  "%sTRIE matched word #%d, continuing%s\n",
6287                     depth,
6288 		    PL_colors[4],
6289 		    ST.nextword,
6290 		    PL_colors[5]
6291 		    );
6292 	    });
6293 
6294 	    if ( ST.accepted > 1 || has_cutgroup || ST.jump ) {
6295 		PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc, loceol,
6296                                 script_run_begin);
6297 		NOT_REACHED; /* NOTREACHED */
6298 	    }
6299 	    /* only one choice left - just continue */
6300 	    DEBUG_EXECUTE_r({
6301 		AV *const trie_words
6302 		    = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
6303 		SV ** const tmp = trie_words
6304                         ? av_fetch(trie_words, ST.nextword - 1, 0) : NULL;
6305 		SV *sv= tmp ? sv_newmortal() : NULL;
6306 
6307                 Perl_re_exec_indentf( aTHX_  "%sTRIE: only one match left, short-circuiting: #%d <%s>%s\n",
6308                     depth, PL_colors[4],
6309 		    ST.nextword,
6310 		    tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
6311 			    PL_colors[0], PL_colors[1],
6312 			    (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
6313 			)
6314 		    : "not compiled under -Dr",
6315 		    PL_colors[5] );
6316 	    });
6317 
6318 	    locinput = (char*)uc;
6319 	    continue; /* execute rest of RE */
6320             /* NOTREACHED */
6321         }
6322 #undef  ST
6323 
6324 	case LEXACT_REQ8:
6325             if (! utf8_target) {
6326                 sayNO;
6327             }
6328             /* FALLTHROUGH */
6329 
6330 	case LEXACT:
6331         {
6332 	    char *s;
6333 
6334 	    s = STRINGl(scan);
6335 	    ln = STR_LENl(scan);
6336             goto join_short_long_exact;
6337 
6338 	case EXACTL:             /*  /abc/l       */
6339             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6340 
6341             /* Complete checking would involve going through every character
6342              * matched by the string to see if any is above latin1.  But the
6343              * comparision otherwise might very well be a fast assembly
6344              * language routine, and I (khw) don't think slowing things down
6345              * just to check for this warning is worth it.  So this just checks
6346              * the first character */
6347             if (utf8_target && UTF8_IS_ABOVE_LATIN1(*locinput)) {
6348                 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
6349             }
6350             goto do_exact;
6351 	case EXACT_REQ8:
6352             if (! utf8_target) {
6353                 sayNO;
6354             }
6355             /* FALLTHROUGH */
6356 
6357 	case EXACT:             /*  /abc/        */
6358           do_exact:
6359 	    s = STRINGs(scan);
6360 	    ln = STR_LENs(scan);
6361 
6362           join_short_long_exact:
6363 	    if (utf8_target != is_utf8_pat) {
6364 		/* The target and the pattern have differing utf8ness. */
6365 		char *l = locinput;
6366 		const char * const e = s + ln;
6367 
6368 		if (utf8_target) {
6369                     /* The target is utf8, the pattern is not utf8.
6370                      * Above-Latin1 code points can't match the pattern;
6371                      * invariants match exactly, and the other Latin1 ones need
6372                      * to be downgraded to a single byte in order to do the
6373                      * comparison.  (If we could be confident that the target
6374                      * is not malformed, this could be refactored to have fewer
6375                      * tests by just assuming that if the first bytes match, it
6376                      * is an invariant, but there are tests in the test suite
6377                      * dealing with (??{...}) which violate this) */
6378 		    while (s < e) {
6379 			if (   l >= loceol
6380                             || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
6381                         {
6382                             sayNO;
6383                         }
6384                         if (UTF8_IS_INVARIANT(*(U8*)l)) {
6385 			    if (*l != *s) {
6386                                 sayNO;
6387                             }
6388                             l++;
6389                         }
6390                         else {
6391                             if (EIGHT_BIT_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
6392                             {
6393                                 sayNO;
6394                             }
6395                             l += 2;
6396                         }
6397 			s++;
6398 		    }
6399 		}
6400 		else {
6401 		    /* The target is not utf8, the pattern is utf8. */
6402 		    while (s < e) {
6403                         if (   l >= loceol
6404                             || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
6405                         {
6406                             sayNO;
6407                         }
6408                         if (UTF8_IS_INVARIANT(*(U8*)s)) {
6409 			    if (*s != *l) {
6410                                 sayNO;
6411                             }
6412                             s++;
6413                         }
6414                         else {
6415                             if (EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
6416                             {
6417                                 sayNO;
6418                             }
6419                             s += 2;
6420                         }
6421 			l++;
6422 		    }
6423 		}
6424 		locinput = l;
6425 	    }
6426             else {
6427                 /* The target and the pattern have the same utf8ness. */
6428                 /* Inline the first character, for speed. */
6429                 if (   loceol - locinput < ln
6430                     || UCHARAT(s) != nextchr
6431                     || (ln > 1 && memNE(s, locinput, ln)))
6432                 {
6433                     sayNO;
6434                 }
6435                 locinput += ln;
6436             }
6437 	    break;
6438 	    }
6439 
6440 	case EXACTFL:            /*  /abc/il      */
6441           {
6442 	    re_fold_t folder;
6443 	    const U8 * fold_array;
6444 	    const char * s;
6445 	    U32 fold_utf8_flags;
6446 
6447             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6448             folder = foldEQ_locale;
6449             fold_array = PL_fold_locale;
6450 	    fold_utf8_flags = FOLDEQ_LOCALE;
6451 	    goto do_exactf;
6452 
6453         case EXACTFLU8:           /*  /abc/il; but all 'abc' are above 255, so
6454                                       is effectively /u; hence to match, target
6455                                       must be UTF-8. */
6456             if (! utf8_target) {
6457                 sayNO;
6458             }
6459             fold_utf8_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
6460                                              | FOLDEQ_S2_FOLDS_SANE;
6461 	    folder = foldEQ_latin1_s2_folded;
6462 	    fold_array = PL_fold_latin1;
6463 	    goto do_exactf;
6464 
6465         case EXACTFU_REQ8:      /* /abc/iu with something in /abc/ > 255 */
6466             if (! utf8_target) {
6467                 sayNO;
6468             }
6469 	    assert(is_utf8_pat);
6470 	    fold_utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
6471 	    goto do_exactf;
6472 
6473         case EXACTFUP:          /*  /foo/iu, and something is problematic in
6474                                     'foo' so can't take shortcuts. */
6475             assert(! is_utf8_pat);
6476             folder = foldEQ_latin1;
6477 	    fold_array = PL_fold_latin1;
6478 	    fold_utf8_flags = 0;
6479 	    goto do_exactf;
6480 
6481 	case EXACTFU:            /*  /abc/iu      */
6482             folder = foldEQ_latin1_s2_folded;
6483 	    fold_array = PL_fold_latin1;
6484 	    fold_utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
6485 	    goto do_exactf;
6486 
6487         case EXACTFAA_NO_TRIE:   /* This node only generated for non-utf8
6488                                    patterns */
6489             assert(! is_utf8_pat);
6490             /* FALLTHROUGH */
6491 	case EXACTFAA:            /*  /abc/iaa     */
6492             folder = foldEQ_latin1_s2_folded;
6493 	    fold_array = PL_fold_latin1;
6494 	    fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6495             if (is_utf8_pat || ! utf8_target) {
6496 
6497                 /* The possible presence of a MICRO SIGN in the pattern forbids
6498                  * us to view a non-UTF-8 pattern as folded when there is a
6499                  * UTF-8 target */
6500                 fold_utf8_flags |= FOLDEQ_S2_ALREADY_FOLDED
6501                                   |FOLDEQ_S2_FOLDS_SANE;
6502             }
6503 	    goto do_exactf;
6504 
6505 
6506         case EXACTF:             /*  /abc/i    This node only generated for
6507                                                non-utf8 patterns */
6508             assert(! is_utf8_pat);
6509 	    folder = foldEQ;
6510 	    fold_array = PL_fold;
6511 	    fold_utf8_flags = 0;
6512 
6513 	  do_exactf:
6514 	    s = STRINGs(scan);
6515 	    ln = STR_LENs(scan);
6516 
6517 	    if (   utf8_target
6518                 || is_utf8_pat
6519                 || state_num == EXACTFUP
6520                 || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE))
6521             {
6522 	      /* Either target or the pattern are utf8, or has the issue where
6523 	       * the fold lengths may differ. */
6524 		const char * const l = locinput;
6525 		char *e = loceol;
6526 
6527 		if (! foldEQ_utf8_flags(l, &e, 0,  utf8_target,
6528                                         s, 0,  ln, is_utf8_pat,fold_utf8_flags))
6529 		{
6530 		    sayNO;
6531 		}
6532 		locinput = e;
6533 		break;
6534 	    }
6535 
6536 	    /* Neither the target nor the pattern are utf8 */
6537 	    if (UCHARAT(s) != nextchr
6538                 && !NEXTCHR_IS_EOS
6539 		&& UCHARAT(s) != fold_array[nextchr])
6540 	    {
6541 		sayNO;
6542 	    }
6543 	    if (loceol - locinput < ln)
6544 		sayNO;
6545 	    if (ln > 1 && ! folder(locinput, s, ln))
6546 		sayNO;
6547 	    locinput += ln;
6548 	    break;
6549 	}
6550 
6551 	case NBOUNDL: /*  /\B/l  */
6552             to_complement = 1;
6553             /* FALLTHROUGH */
6554 
6555 	case BOUNDL:  /*  /\b/l  */
6556         {
6557             bool b1, b2;
6558             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6559 
6560             if (FLAGS(scan) != TRADITIONAL_BOUND) {
6561                 if (! IN_UTF8_CTYPE_LOCALE) {
6562                     Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
6563                                                 B_ON_NON_UTF8_LOCALE_IS_WRONG);
6564                 }
6565                 goto boundu;
6566             }
6567 
6568 	    if (utf8_target) {
6569 		if (locinput == reginfo->strbeg)
6570 		    b1 = isWORDCHAR_LC('\n');
6571 		else {
6572                     U8 *p = reghop3((U8*)locinput, -1,
6573                                     (U8*)(reginfo->strbeg));
6574                     b1 = isWORDCHAR_LC_utf8_safe(p, (U8*)(reginfo->strend));
6575 		}
6576                 b2 = (NEXTCHR_IS_EOS)
6577                     ? isWORDCHAR_LC('\n')
6578                     : isWORDCHAR_LC_utf8_safe((U8*) locinput,
6579                                               (U8*) reginfo->strend);
6580 	    }
6581 	    else { /* Here the string isn't utf8 */
6582 		b1 = (locinput == reginfo->strbeg)
6583                      ? isWORDCHAR_LC('\n')
6584                      : isWORDCHAR_LC(UCHARAT(locinput - 1));
6585                 b2 = (NEXTCHR_IS_EOS)
6586                     ? isWORDCHAR_LC('\n')
6587                     : isWORDCHAR_LC(nextchr);
6588 	    }
6589             if (to_complement ^ (b1 == b2)) {
6590                 sayNO;
6591             }
6592 	    break;
6593         }
6594 
6595 	case NBOUND:  /*  /\B/   */
6596             to_complement = 1;
6597             /* FALLTHROUGH */
6598 
6599 	case BOUND:   /*  /\b/   */
6600 	    if (utf8_target) {
6601                 goto bound_utf8;
6602             }
6603             goto bound_ascii_match_only;
6604 
6605 	case NBOUNDA: /*  /\B/a  */
6606             to_complement = 1;
6607             /* FALLTHROUGH */
6608 
6609 	case BOUNDA:  /*  /\b/a  */
6610         {
6611             bool b1, b2;
6612 
6613           bound_ascii_match_only:
6614             /* Here the string isn't utf8, or is utf8 and only ascii characters
6615              * are to match \w.  In the latter case looking at the byte just
6616              * prior to the current one may be just the final byte of a
6617              * multi-byte character.  This is ok.  There are two cases:
6618              * 1) it is a single byte character, and then the test is doing
6619              *    just what it's supposed to.
6620              * 2) it is a multi-byte character, in which case the final byte is
6621              *    never mistakable for ASCII, and so the test will say it is
6622              *    not a word character, which is the correct answer. */
6623             b1 = (locinput == reginfo->strbeg)
6624                  ? isWORDCHAR_A('\n')
6625                  : isWORDCHAR_A(UCHARAT(locinput - 1));
6626             b2 = (NEXTCHR_IS_EOS)
6627                 ? isWORDCHAR_A('\n')
6628                 : isWORDCHAR_A(nextchr);
6629             if (to_complement ^ (b1 == b2)) {
6630                 sayNO;
6631             }
6632 	    break;
6633         }
6634 
6635 	case NBOUNDU: /*  /\B/u  */
6636             to_complement = 1;
6637             /* FALLTHROUGH */
6638 
6639 	case BOUNDU:  /*  /\b/u  */
6640 
6641           boundu:
6642             if (UNLIKELY(reginfo->strbeg >= reginfo->strend)) {
6643                 match = FALSE;
6644             }
6645             else if (utf8_target) {
6646               bound_utf8:
6647                 switch((bound_type) FLAGS(scan)) {
6648                     case TRADITIONAL_BOUND:
6649                     {
6650                         bool b1, b2;
6651                         if (locinput == reginfo->strbeg) {
6652                             b1 = 0 /* isWORDCHAR_L1('\n') */;
6653                         }
6654                         else {
6655                             U8 *p = reghop3((U8*)locinput, -1,
6656                                             (U8*)(reginfo->strbeg));
6657 
6658                             b1 = isWORDCHAR_utf8_safe(p, (U8*) reginfo->strend);
6659                         }
6660                         b2 = (NEXTCHR_IS_EOS)
6661                             ? 0 /* isWORDCHAR_L1('\n') */
6662                             : isWORDCHAR_utf8_safe((U8*)locinput,
6663                                                    (U8*) reginfo->strend);
6664                         match = cBOOL(b1 != b2);
6665                         break;
6666                     }
6667                     case GCB_BOUND:
6668                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6669                             match = TRUE; /* GCB always matches at begin and
6670                                              end */
6671                         }
6672                         else {
6673                             /* Find the gcb values of previous and current
6674                              * chars, then see if is a break point */
6675                             match = isGCB(getGCB_VAL_UTF8(
6676                                                 reghop3((U8*)locinput,
6677                                                         -1,
6678                                                         (U8*)(reginfo->strbeg)),
6679                                                 (U8*) reginfo->strend),
6680                                           getGCB_VAL_UTF8((U8*) locinput,
6681                                                         (U8*) reginfo->strend),
6682                                           (U8*) reginfo->strbeg,
6683                                           (U8*) locinput,
6684                                           utf8_target);
6685                         }
6686                         break;
6687 
6688                     case LB_BOUND:
6689                         if (locinput == reginfo->strbeg) {
6690                             match = FALSE;
6691                         }
6692                         else if (NEXTCHR_IS_EOS) {
6693                             match = TRUE;
6694                         }
6695                         else {
6696                             match = isLB(getLB_VAL_UTF8(
6697                                                 reghop3((U8*)locinput,
6698                                                         -1,
6699                                                         (U8*)(reginfo->strbeg)),
6700                                                 (U8*) reginfo->strend),
6701                                           getLB_VAL_UTF8((U8*) locinput,
6702                                                         (U8*) reginfo->strend),
6703                                           (U8*) reginfo->strbeg,
6704                                           (U8*) locinput,
6705                                           (U8*) reginfo->strend,
6706                                           utf8_target);
6707                         }
6708                         break;
6709 
6710                     case SB_BOUND: /* Always matches at begin and end */
6711                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6712                             match = TRUE;
6713                         }
6714                         else {
6715                             match = isSB(getSB_VAL_UTF8(
6716                                                 reghop3((U8*)locinput,
6717                                                         -1,
6718                                                         (U8*)(reginfo->strbeg)),
6719                                                 (U8*) reginfo->strend),
6720                                           getSB_VAL_UTF8((U8*) locinput,
6721                                                         (U8*) reginfo->strend),
6722                                           (U8*) reginfo->strbeg,
6723                                           (U8*) locinput,
6724                                           (U8*) reginfo->strend,
6725                                           utf8_target);
6726                         }
6727                         break;
6728 
6729                     case WB_BOUND:
6730                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6731                             match = TRUE;
6732                         }
6733                         else {
6734                             match = isWB(WB_UNKNOWN,
6735                                          getWB_VAL_UTF8(
6736                                                 reghop3((U8*)locinput,
6737                                                         -1,
6738                                                         (U8*)(reginfo->strbeg)),
6739                                                 (U8*) reginfo->strend),
6740                                           getWB_VAL_UTF8((U8*) locinput,
6741                                                         (U8*) reginfo->strend),
6742                                           (U8*) reginfo->strbeg,
6743                                           (U8*) locinput,
6744                                           (U8*) reginfo->strend,
6745                                           utf8_target);
6746                         }
6747                         break;
6748                 }
6749 	    }
6750 	    else {  /* Not utf8 target */
6751                 switch((bound_type) FLAGS(scan)) {
6752                     case TRADITIONAL_BOUND:
6753                     {
6754                         bool b1, b2;
6755                         b1 = (locinput == reginfo->strbeg)
6756                             ? 0 /* isWORDCHAR_L1('\n') */
6757                             : isWORDCHAR_L1(UCHARAT(locinput - 1));
6758                         b2 = (NEXTCHR_IS_EOS)
6759                             ? 0 /* isWORDCHAR_L1('\n') */
6760                             : isWORDCHAR_L1(nextchr);
6761                         match = cBOOL(b1 != b2);
6762                         break;
6763                     }
6764 
6765                     case GCB_BOUND:
6766                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6767                             match = TRUE; /* GCB always matches at begin and
6768                                              end */
6769                         }
6770                         else {  /* Only CR-LF combo isn't a GCB in 0-255
6771                                    range */
6772                             match =    UCHARAT(locinput - 1) != '\r'
6773                                     || UCHARAT(locinput) != '\n';
6774                         }
6775                         break;
6776 
6777                     case LB_BOUND:
6778                         if (locinput == reginfo->strbeg) {
6779                             match = FALSE;
6780                         }
6781                         else if (NEXTCHR_IS_EOS) {
6782                             match = TRUE;
6783                         }
6784                         else {
6785                             match = isLB(getLB_VAL_CP(UCHARAT(locinput -1)),
6786                                          getLB_VAL_CP(UCHARAT(locinput)),
6787                                          (U8*) reginfo->strbeg,
6788                                          (U8*) locinput,
6789                                          (U8*) reginfo->strend,
6790                                          utf8_target);
6791                         }
6792                         break;
6793 
6794                     case SB_BOUND: /* Always matches at begin and end */
6795                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6796                             match = TRUE;
6797                         }
6798                         else {
6799                             match = isSB(getSB_VAL_CP(UCHARAT(locinput -1)),
6800                                          getSB_VAL_CP(UCHARAT(locinput)),
6801                                          (U8*) reginfo->strbeg,
6802                                          (U8*) locinput,
6803                                          (U8*) reginfo->strend,
6804                                          utf8_target);
6805                         }
6806                         break;
6807 
6808                     case WB_BOUND:
6809                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6810                             match = TRUE;
6811                         }
6812                         else {
6813                             match = isWB(WB_UNKNOWN,
6814                                          getWB_VAL_CP(UCHARAT(locinput -1)),
6815                                          getWB_VAL_CP(UCHARAT(locinput)),
6816                                          (U8*) reginfo->strbeg,
6817                                          (U8*) locinput,
6818                                          (U8*) reginfo->strend,
6819                                          utf8_target);
6820                         }
6821                         break;
6822                 }
6823 	    }
6824 
6825             if (to_complement ^ ! match) {
6826                 sayNO;
6827             }
6828 	    break;
6829 
6830         case ANYOFPOSIXL:
6831 	case ANYOFL:  /*  /[abc]/l      */
6832             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6833 
6834             if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(scan)) && ! IN_UTF8_CTYPE_LOCALE)
6835             {
6836               Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
6837             }
6838             /* FALLTHROUGH */
6839 	case ANYOFD:  /*   /[abc]/d       */
6840 	case ANYOF:  /*   /[abc]/       */
6841             if (NEXTCHR_IS_EOS || locinput >= loceol)
6842                 sayNO;
6843 	    if (  (! utf8_target || UTF8_IS_INVARIANT(*locinput))
6844 	        && ! (ANYOF_FLAGS(scan) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP))
6845             {
6846                 if (! ANYOF_BITMAP_TEST(scan, * (U8 *) (locinput))) {
6847 		    sayNO;
6848                 }
6849 		locinput++;
6850             }
6851             else {
6852 	        if (!reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
6853                                                                    utf8_target))
6854                 {
6855 		    sayNO;
6856                 }
6857                 goto increment_locinput;
6858             }
6859 	    break;
6860 
6861         case ANYOFM:
6862             if (   NEXTCHR_IS_EOS
6863                 || (UCHARAT(locinput) & FLAGS(scan)) != ARG(scan)
6864                 || locinput >= loceol)
6865             {
6866                 sayNO;
6867             }
6868             locinput++; /* ANYOFM is always single byte */
6869             break;
6870 
6871         case NANYOFM:
6872             if (   NEXTCHR_IS_EOS
6873                 || (UCHARAT(locinput) & FLAGS(scan)) == ARG(scan)
6874                 || locinput >= loceol)
6875             {
6876                 sayNO;
6877             }
6878             goto increment_locinput;
6879             break;
6880 
6881         case ANYOFH:
6882             if (   ! utf8_target
6883                 ||   NEXTCHR_IS_EOS
6884                 ||   ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8(*locinput)
6885 	        || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
6886                                                                    utf8_target))
6887             {
6888                 sayNO;
6889             }
6890             goto increment_locinput;
6891             break;
6892 
6893         case ANYOFHb:
6894             if (   ! utf8_target
6895                 ||   NEXTCHR_IS_EOS
6896                 ||   ANYOF_FLAGS(scan) != (U8) *locinput
6897 	        || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
6898                                                                   utf8_target))
6899             {
6900                 sayNO;
6901             }
6902             goto increment_locinput;
6903             break;
6904 
6905         case ANYOFHr:
6906             if (   ! utf8_target
6907                 ||   NEXTCHR_IS_EOS
6908                 || ! inRANGE((U8) NATIVE_UTF8_TO_I8(*locinput),
6909                              LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan)),
6910                              HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan)))
6911 	        || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
6912                                                                    utf8_target))
6913             {
6914                 sayNO;
6915             }
6916             goto increment_locinput;
6917             break;
6918 
6919         case ANYOFHs:
6920             if (   ! utf8_target
6921                 ||   NEXTCHR_IS_EOS
6922                 ||   loceol - locinput < FLAGS(scan)
6923                 ||   memNE(locinput, ((struct regnode_anyofhs *) scan)->string, FLAGS(scan))
6924 	        || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
6925                                                                    utf8_target))
6926             {
6927                 sayNO;
6928             }
6929             goto increment_locinput;
6930             break;
6931 
6932         case ANYOFR:
6933             if (NEXTCHR_IS_EOS) {
6934                 sayNO;
6935             }
6936 
6937             if (utf8_target) {
6938                 if (    ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8(*locinput)
6939                    || ! withinCOUNT(utf8_to_uvchr_buf((U8 *) locinput,
6940                                                 (U8 *) reginfo->strend,
6941                                                 NULL),
6942                                     ANYOFRbase(scan), ANYOFRdelta(scan)))
6943                 {
6944                     sayNO;
6945                 }
6946             }
6947             else {
6948                 if (! withinCOUNT((U8) *locinput,
6949                                   ANYOFRbase(scan), ANYOFRdelta(scan)))
6950                 {
6951                     sayNO;
6952                 }
6953             }
6954             goto increment_locinput;
6955             break;
6956 
6957         case ANYOFRb:
6958             if (NEXTCHR_IS_EOS) {
6959                 sayNO;
6960             }
6961 
6962             if (utf8_target) {
6963                 if (     ANYOF_FLAGS(scan) != (U8) *locinput
6964                     || ! withinCOUNT(utf8_to_uvchr_buf((U8 *) locinput,
6965                                                 (U8 *) reginfo->strend,
6966                                                 NULL),
6967                                      ANYOFRbase(scan), ANYOFRdelta(scan)))
6968                 {
6969                     sayNO;
6970                 }
6971             }
6972             else {
6973                 if (! withinCOUNT((U8) *locinput,
6974                                   ANYOFRbase(scan), ANYOFRdelta(scan)))
6975                 {
6976                     sayNO;
6977                 }
6978             }
6979             goto increment_locinput;
6980             break;
6981 
6982         /* The argument (FLAGS) to all the POSIX node types is the class number
6983          * */
6984 
6985         case NPOSIXL:   /* \W or [:^punct:] etc. under /l */
6986             to_complement = 1;
6987             /* FALLTHROUGH */
6988 
6989         case POSIXL:    /* \w or [:punct:] etc. under /l */
6990             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6991             if (NEXTCHR_IS_EOS || locinput >= loceol)
6992                 sayNO;
6993 
6994             /* Use isFOO_lc() for characters within Latin1.  (Note that
6995              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
6996              * wouldn't be invariant) */
6997             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
6998                 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
6999                     sayNO;
7000                 }
7001 
7002                 locinput++;
7003                 break;
7004             }
7005 
7006             if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
7007                 /* An above Latin-1 code point, or malformed */
7008                 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
7009                                                        reginfo->strend);
7010                 goto utf8_posix_above_latin1;
7011             }
7012 
7013             /* Here is a UTF-8 variant code point below 256 and the target is
7014              * UTF-8 */
7015             if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
7016                                             EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
7017                                             *(locinput + 1))))))
7018             {
7019                 sayNO;
7020             }
7021 
7022             goto increment_locinput;
7023 
7024         case NPOSIXD:   /* \W or [:^punct:] etc. under /d */
7025             to_complement = 1;
7026             /* FALLTHROUGH */
7027 
7028         case POSIXD:    /* \w or [:punct:] etc. under /d */
7029             if (utf8_target) {
7030                 goto utf8_posix;
7031             }
7032             goto posixa;
7033 
7034         case NPOSIXA:   /* \W or [:^punct:] etc. under /a */
7035 
7036             if (NEXTCHR_IS_EOS || locinput >= loceol) {
7037                 sayNO;
7038             }
7039 
7040             /* All UTF-8 variants match */
7041             if (! UTF8_IS_INVARIANT(nextchr)) {
7042                 goto increment_locinput;
7043             }
7044 
7045             to_complement = 1;
7046             goto join_nposixa;
7047 
7048         case POSIXA:    /* \w or [:punct:] etc. under /a */
7049 
7050           posixa:
7051             /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
7052              * UTF-8, and also from NPOSIXA even in UTF-8 when the current
7053              * character is a single byte */
7054 
7055             if (NEXTCHR_IS_EOS || locinput >= loceol) {
7056                 sayNO;
7057             }
7058 
7059           join_nposixa:
7060 
7061             if (! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
7062                                                                 FLAGS(scan)))))
7063             {
7064                 sayNO;
7065             }
7066 
7067             /* Here we are either not in utf8, or we matched a utf8-invariant,
7068              * so the next char is the next byte */
7069             locinput++;
7070             break;
7071 
7072         case NPOSIXU:   /* \W or [:^punct:] etc. under /u */
7073             to_complement = 1;
7074             /* FALLTHROUGH */
7075 
7076         case POSIXU:    /* \w or [:punct:] etc. under /u */
7077           utf8_posix:
7078             if (NEXTCHR_IS_EOS || locinput >= loceol) {
7079                 sayNO;
7080             }
7081 
7082             /* Use _generic_isCC() for characters within Latin1.  (Note that
7083              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
7084              * wouldn't be invariant) */
7085             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
7086                 if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
7087                                                            FLAGS(scan)))))
7088                 {
7089                     sayNO;
7090                 }
7091                 locinput++;
7092             }
7093             else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
7094                 if (! (to_complement
7095                        ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
7096                                                                *(locinput + 1)),
7097                                              FLAGS(scan)))))
7098                 {
7099                     sayNO;
7100                 }
7101                 locinput += 2;
7102             }
7103             else {  /* Handle above Latin-1 code points */
7104               utf8_posix_above_latin1:
7105                 classnum = (_char_class_number) FLAGS(scan);
7106                 switch (classnum) {
7107                     default:
7108                         if (! (to_complement
7109                            ^ cBOOL(_invlist_contains_cp(
7110                                       PL_XPosix_ptrs[classnum],
7111                                       utf8_to_uvchr_buf((U8 *) locinput,
7112                                                         (U8 *) reginfo->strend,
7113                                                         NULL)))))
7114                         {
7115                             sayNO;
7116                         }
7117                         break;
7118                     case _CC_ENUM_SPACE:
7119                         if (! (to_complement
7120                                     ^ cBOOL(is_XPERLSPACE_high(locinput))))
7121                         {
7122                             sayNO;
7123                         }
7124                         break;
7125                     case _CC_ENUM_BLANK:
7126                         if (! (to_complement
7127                                         ^ cBOOL(is_HORIZWS_high(locinput))))
7128                         {
7129                             sayNO;
7130                         }
7131                         break;
7132                     case _CC_ENUM_XDIGIT:
7133                         if (! (to_complement
7134                                         ^ cBOOL(is_XDIGIT_high(locinput))))
7135                         {
7136                             sayNO;
7137                         }
7138                         break;
7139                     case _CC_ENUM_VERTSPACE:
7140                         if (! (to_complement
7141                                         ^ cBOOL(is_VERTWS_high(locinput))))
7142                         {
7143                             sayNO;
7144                         }
7145                         break;
7146                     case _CC_ENUM_CNTRL:    /* These can't match above Latin1 */
7147                     case _CC_ENUM_ASCII:
7148                         if (! to_complement) {
7149                             sayNO;
7150                         }
7151                         break;
7152                 }
7153                 locinput += UTF8_SAFE_SKIP(locinput, reginfo->strend);
7154             }
7155             break;
7156 
7157 	case CLUMP: /* Match \X: logical Unicode character.  This is defined as
7158 		       a Unicode extended Grapheme Cluster */
7159 	    if (NEXTCHR_IS_EOS || locinput >= loceol)
7160 		sayNO;
7161 	    if  (! utf8_target) {
7162 
7163 		/* Match either CR LF  or '.', as all the other possibilities
7164 		 * require utf8 */
7165 		locinput++;	    /* Match the . or CR */
7166 		if (nextchr == '\r' /* And if it was CR, and the next is LF,
7167 				       match the LF */
7168 		    && locinput <  loceol
7169 		    && UCHARAT(locinput) == '\n')
7170                 {
7171                     locinput++;
7172                 }
7173 	    }
7174 	    else {
7175 
7176                 /* Get the gcb type for the current character */
7177                 GCB_enum prev_gcb = getGCB_VAL_UTF8((U8*) locinput,
7178                                                        (U8*) reginfo->strend);
7179 
7180                 /* Then scan through the input until we get to the first
7181                  * character whose type is supposed to be a gcb with the
7182                  * current character.  (There is always a break at the
7183                  * end-of-input) */
7184                 locinput += UTF8SKIP(locinput);
7185                 while (locinput < loceol) {
7186                     GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput,
7187                                                          (U8*) reginfo->strend);
7188                     if (isGCB(prev_gcb, cur_gcb,
7189                               (U8*) reginfo->strbeg, (U8*) locinput,
7190                               utf8_target))
7191                     {
7192                         break;
7193                     }
7194 
7195                     prev_gcb = cur_gcb;
7196                     locinput += UTF8SKIP(locinput);
7197                 }
7198 
7199 
7200 	    }
7201 	    break;
7202 
7203 	case REFFLN:  /*  /\g{name}/il  */
7204 	{   /* The capture buffer cases.  The ones beginning with N for the
7205 	       named buffers just convert to the equivalent numbered and
7206 	       pretend they were called as the corresponding numbered buffer
7207 	       op.  */
7208 	    /* don't initialize these in the declaration, it makes C++
7209 	       unhappy */
7210 	    const char *s;
7211 	    char type;
7212 	    re_fold_t folder;
7213 	    const U8 *fold_array;
7214 	    UV utf8_fold_flags;
7215 
7216             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7217 	    folder = foldEQ_locale;
7218 	    fold_array = PL_fold_locale;
7219 	    type = REFFL;
7220 	    utf8_fold_flags = FOLDEQ_LOCALE;
7221 	    goto do_nref;
7222 
7223 	case REFFAN:  /*  /\g{name}/iaa  */
7224 	    folder = foldEQ_latin1;
7225 	    fold_array = PL_fold_latin1;
7226 	    type = REFFA;
7227 	    utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7228 	    goto do_nref;
7229 
7230 	case REFFUN:  /*  /\g{name}/iu  */
7231 	    folder = foldEQ_latin1;
7232 	    fold_array = PL_fold_latin1;
7233 	    type = REFFU;
7234 	    utf8_fold_flags = 0;
7235 	    goto do_nref;
7236 
7237 	case REFFN:  /*  /\g{name}/i  */
7238 	    folder = foldEQ;
7239 	    fold_array = PL_fold;
7240 	    type = REFF;
7241 	    utf8_fold_flags = 0;
7242 	    goto do_nref;
7243 
7244 	case REFN:  /*  /\g{name}/   */
7245 	    type = REF;
7246 	    folder = NULL;
7247 	    fold_array = NULL;
7248 	    utf8_fold_flags = 0;
7249 	  do_nref:
7250 
7251 	    /* For the named back references, find the corresponding buffer
7252 	     * number */
7253 	    n = reg_check_named_buff_matched(rex,scan);
7254 
7255             if ( ! n ) {
7256                 sayNO;
7257 	    }
7258 	    goto do_nref_ref_common;
7259 
7260 	case REFFL:  /*  /\1/il  */
7261             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7262 	    folder = foldEQ_locale;
7263 	    fold_array = PL_fold_locale;
7264 	    utf8_fold_flags = FOLDEQ_LOCALE;
7265 	    goto do_ref;
7266 
7267 	case REFFA:  /*  /\1/iaa  */
7268 	    folder = foldEQ_latin1;
7269 	    fold_array = PL_fold_latin1;
7270 	    utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7271 	    goto do_ref;
7272 
7273 	case REFFU:  /*  /\1/iu  */
7274 	    folder = foldEQ_latin1;
7275 	    fold_array = PL_fold_latin1;
7276 	    utf8_fold_flags = 0;
7277 	    goto do_ref;
7278 
7279 	case REFF:  /*  /\1/i  */
7280 	    folder = foldEQ;
7281 	    fold_array = PL_fold;
7282 	    utf8_fold_flags = 0;
7283 	    goto do_ref;
7284 
7285         case REF:  /*  /\1/    */
7286 	    folder = NULL;
7287 	    fold_array = NULL;
7288 	    utf8_fold_flags = 0;
7289 
7290 	  do_ref:
7291 	    type = OP(scan);
7292 	    n = ARG(scan);  /* which paren pair */
7293 
7294 	  do_nref_ref_common:
7295 	    ln = rex->offs[n].start;
7296 	    endref = rex->offs[n].end;
7297 	    reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
7298 	    if (rex->lastparen < n || ln == -1 || endref == -1)
7299 		sayNO;			/* Do not match unless seen CLOSEn. */
7300 	    if (ln == endref)
7301 		break;
7302 
7303 	    s = reginfo->strbeg + ln;
7304 	    if (type != REF	/* REF can do byte comparison */
7305 		&& (utf8_target || type == REFFU || type == REFFL))
7306 	    {
7307 		char * limit = loceol;
7308 
7309 		/* This call case insensitively compares the entire buffer
7310 		    * at s, with the current input starting at locinput, but
7311                     * not going off the end given by loceol, and
7312                     * returns in <limit> upon success, how much of the
7313                     * current input was matched */
7314 		if (! foldEQ_utf8_flags(s, NULL, endref - ln, utf8_target,
7315 				    locinput, &limit, 0, utf8_target, utf8_fold_flags))
7316 		{
7317 		    sayNO;
7318 		}
7319 		locinput = limit;
7320 		break;
7321 	    }
7322 
7323 	    /* Not utf8:  Inline the first character, for speed. */
7324 	    if ( ! NEXTCHR_IS_EOS
7325                 && locinput < loceol
7326                 && UCHARAT(s) != nextchr
7327                 && (   type == REF
7328                     || UCHARAT(s) != fold_array[nextchr]))
7329             {
7330 		sayNO;
7331             }
7332 	    ln = endref - ln;
7333 	    if (locinput + ln > loceol)
7334 		sayNO;
7335 	    if (ln > 1 && (type == REF
7336 			   ? memNE(s, locinput, ln)
7337 			   : ! folder(locinput, s, ln)))
7338 		sayNO;
7339 	    locinput += ln;
7340 	    break;
7341 	}
7342 
7343 	case NOTHING: /* null op; e.g. the 'nothing' following
7344                        * the '*' in m{(a+|b)*}' */
7345 	    break;
7346 	case TAIL: /* placeholder while compiling (A|B|C) */
7347 	    break;
7348 
7349 #undef  ST
7350 #define ST st->u.eval
7351 #define CUR_EVAL cur_eval->u.eval
7352 
7353 	{
7354 	    SV *ret;
7355 	    REGEXP *re_sv;
7356             regexp *re;
7357             regexp_internal *rei;
7358             regnode *startpoint;
7359             U32 arg;
7360 
7361 	case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
7362             arg= (U32)ARG(scan);
7363             if (cur_eval && cur_eval->locinput == locinput) {
7364                 if ( ++nochange_depth > max_nochange_depth )
7365                     Perl_croak(aTHX_
7366                         "Pattern subroutine nesting without pos change"
7367                         " exceeded limit in regex");
7368             } else {
7369                 nochange_depth = 0;
7370             }
7371 	    re_sv = rex_sv;
7372             re = rex;
7373             rei = rexi;
7374             startpoint = scan + ARG2L(scan);
7375             EVAL_CLOSE_PAREN_SET( st, arg );
7376             /* Detect infinite recursion
7377              *
7378              * A pattern like /(?R)foo/ or /(?<x>(?&y)foo)(?<y>(?&x)bar)/
7379              * or "a"=~/(.(?2))((?<=(?=(?1)).))/ could recurse forever.
7380              * So we track the position in the string we are at each time
7381              * we recurse and if we try to enter the same routine twice from
7382              * the same position we throw an error.
7383              */
7384             if ( rex->recurse_locinput[arg] == locinput ) {
7385                 /* FIXME: we should show the regop that is failing as part
7386                  * of the error message. */
7387                 Perl_croak(aTHX_ "Infinite recursion in regex");
7388             } else {
7389                 ST.prev_recurse_locinput= rex->recurse_locinput[arg];
7390                 rex->recurse_locinput[arg]= locinput;
7391 
7392                 DEBUG_r({
7393                     DECLARE_AND_GET_RE_DEBUG_FLAGS;
7394                     DEBUG_STACK_r({
7395                         Perl_re_exec_indentf( aTHX_
7396                             "entering GOSUB, prev_recurse_locinput=%p recurse_locinput[%d]=%p\n",
7397                             depth, ST.prev_recurse_locinput, arg, rex->recurse_locinput[arg]
7398                         );
7399                     });
7400                 });
7401             }
7402 
7403             /* Save all the positions seen so far. */
7404             ST.cp = regcppush(rex, 0, maxopenparen);
7405             REGCP_SET(ST.lastcp);
7406 
7407             /* and then jump to the code we share with EVAL */
7408             goto eval_recurse_doit;
7409             /* NOTREACHED */
7410 
7411         case EVAL:  /*   /(?{...})B/   /(??{A})B/  and  /(?(?{...})X|Y)B/   */
7412             if (logical == 2 && cur_eval && cur_eval->locinput==locinput) {
7413 		if ( ++nochange_depth > max_nochange_depth )
7414                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
7415             } else {
7416                 nochange_depth = 0;
7417             }
7418 	    {
7419 		/* execute the code in the {...} */
7420 
7421 		dSP;
7422 		IV before;
7423 		OP * const oop = PL_op;
7424 		COP * const ocurcop = PL_curcop;
7425 		OP *nop;
7426 		CV *newcv;
7427 
7428 		/* save *all* paren positions */
7429                 regcppush(rex, 0, maxopenparen);
7430                 REGCP_SET(ST.lastcp);
7431 
7432 		if (!caller_cv)
7433 		    caller_cv = find_runcv(NULL);
7434 
7435 		n = ARG(scan);
7436 
7437 		if (rexi->data->what[n] == 'r') { /* code from an external qr */
7438                     newcv = (ReANY(
7439                                     (REGEXP*)(rexi->data->data[n])
7440                             ))->qr_anoncv;
7441 		    nop = (OP*)rexi->data->data[n+1];
7442 		}
7443 		else if (rexi->data->what[n] == 'l') { /* literal code */
7444 		    newcv = caller_cv;
7445 		    nop = (OP*)rexi->data->data[n];
7446 		    assert(CvDEPTH(newcv));
7447 		}
7448 		else {
7449 		    /* literal with own CV */
7450 		    assert(rexi->data->what[n] == 'L');
7451 		    newcv = rex->qr_anoncv;
7452 		    nop = (OP*)rexi->data->data[n];
7453 		}
7454 
7455                 /* Some notes about MULTICALL and the context and save stacks.
7456                  *
7457                  * In something like
7458                  *   /...(?{ my $x)}...(?{ my $y)}...(?{ my $z)}.../
7459                  * since codeblocks don't introduce a new scope (so that
7460                  * local() etc accumulate), at the end of a successful
7461                  * match there will be a SAVEt_CLEARSV on the savestack
7462                  * for each of $x, $y, $z. If the three code blocks above
7463                  * happen to have come from different CVs (e.g. via
7464                  * embedded qr//s), then we must ensure that during any
7465                  * savestack unwinding, PL_comppad always points to the
7466                  * right pad at each moment. We achieve this by
7467                  * interleaving SAVEt_COMPPAD's on the savestack whenever
7468                  * there is a change of pad.
7469                  * In theory whenever we call a code block, we should
7470                  * push a CXt_SUB context, then pop it on return from
7471                  * that code block. This causes a bit of an issue in that
7472                  * normally popping a context also clears the savestack
7473                  * back to cx->blk_oldsaveix, but here we specifically
7474                  * don't want to clear the save stack on exit from the
7475                  * code block.
7476                  * Also for efficiency we don't want to keep pushing and
7477                  * popping the single SUB context as we backtrack etc.
7478                  * So instead, we push a single context the first time
7479                  * we need, it, then hang onto it until the end of this
7480                  * function. Whenever we encounter a new code block, we
7481                  * update the CV etc if that's changed. During the times
7482                  * in this function where we're not executing a code
7483                  * block, having the SUB context still there is a bit
7484                  * naughty - but we hope that no-one notices.
7485                  * When the SUB context is initially pushed, we fake up
7486                  * cx->blk_oldsaveix to be as if we'd pushed this context
7487                  * on first entry to S_regmatch rather than at some random
7488                  * point during the regexe execution. That way if we
7489                  * croak, popping the context stack will ensure that
7490                  * *everything* SAVEd by this function is undone and then
7491                  * the context popped, rather than e.g., popping the
7492                  * context (and restoring the original PL_comppad) then
7493                  * popping more of the savestack and restoring a bad
7494                  * PL_comppad.
7495                  */
7496 
7497                 /* If this is the first EVAL, push a MULTICALL. On
7498                  * subsequent calls, if we're executing a different CV, or
7499                  * if PL_comppad has got messed up from backtracking
7500                  * through SAVECOMPPADs, then refresh the context.
7501                  */
7502 		if (newcv != last_pushed_cv || PL_comppad != last_pad)
7503 		{
7504                     U8 flags = (CXp_SUB_RE |
7505                                 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
7506                     SAVECOMPPAD();
7507 		    if (last_pushed_cv) {
7508 			CHANGE_MULTICALL_FLAGS(newcv, flags);
7509 		    }
7510 		    else {
7511 			PUSH_MULTICALL_FLAGS(newcv, flags);
7512 		    }
7513                     /* see notes above */
7514                     CX_CUR()->blk_oldsaveix = orig_savestack_ix;
7515 
7516 		    last_pushed_cv = newcv;
7517 		}
7518 		else {
7519                     /* these assignments are just to silence compiler
7520                      * warnings */
7521 		    multicall_cop = NULL;
7522 		}
7523 		last_pad = PL_comppad;
7524 
7525 		/* the initial nextstate you would normally execute
7526 		 * at the start of an eval (which would cause error
7527 		 * messages to come from the eval), may be optimised
7528 		 * away from the execution path in the regex code blocks;
7529 		 * so manually set PL_curcop to it initially */
7530 		{
7531 		    OP *o = cUNOPx(nop)->op_first;
7532 		    assert(o->op_type == OP_NULL);
7533 		    if (o->op_targ == OP_SCOPE) {
7534 			o = cUNOPo->op_first;
7535 		    }
7536 		    else {
7537 			assert(o->op_targ == OP_LEAVE);
7538 			o = cUNOPo->op_first;
7539 			assert(o->op_type == OP_ENTER);
7540 			o = OpSIBLING(o);
7541 		    }
7542 
7543 		    if (o->op_type != OP_STUB) {
7544 			assert(    o->op_type == OP_NEXTSTATE
7545 				|| o->op_type == OP_DBSTATE
7546 				|| (o->op_type == OP_NULL
7547 				    &&  (  o->op_targ == OP_NEXTSTATE
7548 					|| o->op_targ == OP_DBSTATE
7549 					)
7550 				    )
7551 			);
7552 			PL_curcop = (COP*)o;
7553 		    }
7554 		}
7555 		nop = nop->op_next;
7556 
7557                 DEBUG_STATE_r( Perl_re_printf( aTHX_
7558 		    "  re EVAL PL_op=0x%" UVxf "\n", PTR2UV(nop)) );
7559 
7560 		rex->offs[0].end = locinput - reginfo->strbeg;
7561                 if (reginfo->info_aux_eval->pos_magic)
7562                     MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic,
7563                                   reginfo->sv, reginfo->strbeg,
7564                                   locinput - reginfo->strbeg);
7565 
7566                 if (sv_yes_mark) {
7567                     SV *sv_mrk = get_sv("REGMARK", 1);
7568                     sv_setsv(sv_mrk, sv_yes_mark);
7569                 }
7570 
7571 		/* we don't use MULTICALL here as we want to call the
7572 		 * first op of the block of interest, rather than the
7573 		 * first op of the sub. Also, we don't want to free
7574                  * the savestack frame */
7575 		before = (IV)(SP-PL_stack_base);
7576 		PL_op = nop;
7577 		CALLRUNOPS(aTHX);			/* Scalar context. */
7578 		SPAGAIN;
7579 		if ((IV)(SP-PL_stack_base) == before)
7580 		    ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
7581 		else {
7582 		    ret = POPs;
7583 		    PUTBACK;
7584 		}
7585 
7586 		/* before restoring everything, evaluate the returned
7587 		 * value, so that 'uninit' warnings don't use the wrong
7588 		 * PL_op or pad. Also need to process any magic vars
7589 		 * (e.g. $1) *before* parentheses are restored */
7590 
7591 		PL_op = NULL;
7592 
7593                 re_sv = NULL;
7594 		if (logical == 0) {       /*   (?{})/   */
7595                     SV *replsv = save_scalar(PL_replgv);
7596                     sv_setsv(replsv, ret); /* $^R */
7597                     SvSETMAGIC(replsv);
7598                 }
7599 		else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
7600 		    sw = cBOOL(SvTRUE_NN(ret));
7601 		    logical = 0;
7602 		}
7603 		else {                   /*  /(??{})  */
7604 		    /*  if its overloaded, let the regex compiler handle
7605 		     *  it; otherwise extract regex, or stringify  */
7606 		    if (SvGMAGICAL(ret))
7607 			ret = sv_mortalcopy(ret);
7608 		    if (!SvAMAGIC(ret)) {
7609 			SV *sv = ret;
7610 			if (SvROK(sv))
7611 			    sv = SvRV(sv);
7612 			if (SvTYPE(sv) == SVt_REGEXP)
7613 			    re_sv = (REGEXP*) sv;
7614 			else if (SvSMAGICAL(ret)) {
7615 			    MAGIC *mg = mg_find(ret, PERL_MAGIC_qr);
7616 			    if (mg)
7617 				re_sv = (REGEXP *) mg->mg_obj;
7618 			}
7619 
7620 			/* force any undef warnings here */
7621 			if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) {
7622 			    ret = sv_mortalcopy(ret);
7623 			    (void) SvPV_force_nolen(ret);
7624 			}
7625 		    }
7626 
7627 		}
7628 
7629 		/* *** Note that at this point we don't restore
7630 		 * PL_comppad, (or pop the CxSUB) on the assumption it may
7631 		 * be used again soon. This is safe as long as nothing
7632 		 * in the regexp code uses the pad ! */
7633 		PL_op = oop;
7634 		PL_curcop = ocurcop;
7635                 regcp_restore(rex, ST.lastcp, &maxopenparen);
7636                 PL_curpm_under = PL_curpm;
7637                 PL_curpm = PL_reg_curpm;
7638 
7639 		if (logical != 2) {
7640                     PUSH_STATE_GOTO(EVAL_B, next, locinput, loceol,
7641                                     script_run_begin);
7642 		    /* NOTREACHED */
7643                 }
7644 	    }
7645 
7646 		/* only /(??{})/  from now on */
7647 		logical = 0;
7648 		{
7649 		    /* extract RE object from returned value; compiling if
7650 		     * necessary */
7651 
7652 		    if (re_sv) {
7653 			re_sv = reg_temp_copy(NULL, re_sv);
7654 		    }
7655 		    else {
7656 			U32 pm_flags = 0;
7657 
7658 			if (SvUTF8(ret) && IN_BYTES) {
7659 			    /* In use 'bytes': make a copy of the octet
7660 			     * sequence, but without the flag on */
7661 			    STRLEN len;
7662 			    const char *const p = SvPV(ret, len);
7663 			    ret = newSVpvn_flags(p, len, SVs_TEMP);
7664 			}
7665 			if (rex->intflags & PREGf_USE_RE_EVAL)
7666 			    pm_flags |= PMf_USE_RE_EVAL;
7667 
7668 			/* if we got here, it should be an engine which
7669 			 * supports compiling code blocks and stuff */
7670 			assert(rex->engine && rex->engine->op_comp);
7671                         assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
7672 			re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
7673 				    rex->engine, NULL, NULL,
7674                                     /* copy /msixn etc to inner pattern */
7675                                     ARG2L(scan),
7676                                     pm_flags);
7677 
7678 			if (!(SvFLAGS(ret)
7679 			      & (SVs_TEMP | SVs_GMG | SVf_ROK))
7680 			 && (!SvPADTMP(ret) || SvREADONLY(ret))) {
7681 			    /* This isn't a first class regexp. Instead, it's
7682 			       caching a regexp onto an existing, Perl visible
7683 			       scalar.  */
7684 			    sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
7685 			}
7686 		    }
7687 		    SAVEFREESV(re_sv);
7688 		    re = ReANY(re_sv);
7689 		}
7690                 RXp_MATCH_COPIED_off(re);
7691                 re->subbeg = rex->subbeg;
7692                 re->sublen = rex->sublen;
7693                 re->suboffset = rex->suboffset;
7694                 re->subcoffset = rex->subcoffset;
7695                 re->lastparen = 0;
7696                 re->lastcloseparen = 0;
7697 		rei = RXi_GET(re);
7698                 DEBUG_EXECUTE_r(
7699                     debug_start_match(re_sv, utf8_target, locinput,
7700                                     reginfo->strend, "EVAL/GOSUB: Matching embedded");
7701 		);
7702 		startpoint = rei->program + 1;
7703                 EVAL_CLOSE_PAREN_CLEAR(st); /* ST.close_paren = 0;
7704                                              * close_paren only for GOSUB */
7705                 ST.prev_recurse_locinput= NULL; /* only used for GOSUB */
7706                 /* Save all the seen positions so far. */
7707                 ST.cp = regcppush(rex, 0, maxopenparen);
7708                 REGCP_SET(ST.lastcp);
7709                 /* and set maxopenparen to 0, since we are starting a "fresh" match */
7710                 maxopenparen = 0;
7711                 /* run the pattern returned from (??{...}) */
7712 
7713               eval_recurse_doit: /* Share code with GOSUB below this line
7714                             * At this point we expect the stack context to be
7715                             * set up correctly */
7716 
7717                 /* invalidate the S-L poscache. We're now executing a
7718                  * different set of WHILEM ops (and their associated
7719                  * indexes) against the same string, so the bits in the
7720                  * cache are meaningless. Setting maxiter to zero forces
7721                  * the cache to be invalidated and zeroed before reuse.
7722 		 * XXX This is too dramatic a measure. Ideally we should
7723                  * save the old cache and restore when running the outer
7724                  * pattern again */
7725 		reginfo->poscache_maxiter = 0;
7726 
7727                 /* the new regexp might have a different is_utf8_pat than we do */
7728                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
7729 
7730 		ST.prev_rex = rex_sv;
7731 		ST.prev_curlyx = cur_curlyx;
7732 		rex_sv = re_sv;
7733 		SET_reg_curpm(rex_sv);
7734 		rex = re;
7735 		rexi = rei;
7736 		cur_curlyx = NULL;
7737 		ST.B = next;
7738 		ST.prev_eval = cur_eval;
7739 		cur_eval = st;
7740 		/* now continue from first node in postoned RE */
7741 		PUSH_YES_STATE_GOTO(EVAL_postponed_AB, startpoint, locinput,
7742                                     loceol, script_run_begin);
7743 		NOT_REACHED; /* NOTREACHED */
7744 	}
7745 
7746 	case EVAL_postponed_AB: /* cleanup after a successful (??{A})B */
7747             /* note: this is called twice; first after popping B, then A */
7748             DEBUG_STACK_r({
7749                 Perl_re_exec_indentf( aTHX_  "EVAL_AB cur_eval=%p prev_eval=%p\n",
7750                     depth, cur_eval, ST.prev_eval);
7751             });
7752 
7753 #define SET_RECURSE_LOCINPUT(STR,VAL)\
7754             if ( cur_eval && CUR_EVAL.close_paren ) {\
7755                 DEBUG_STACK_r({ \
7756                     Perl_re_exec_indentf( aTHX_  STR " GOSUB%d ce=%p recurse_locinput=%p\n",\
7757                         depth,    \
7758                         CUR_EVAL.close_paren - 1,\
7759                         cur_eval, \
7760                         VAL);     \
7761                 });               \
7762                 rex->recurse_locinput[CUR_EVAL.close_paren - 1] = VAL;\
7763             }
7764 
7765             SET_RECURSE_LOCINPUT("EVAL_AB[before]", CUR_EVAL.prev_recurse_locinput);
7766 
7767 	    rex_sv = ST.prev_rex;
7768             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
7769 	    SET_reg_curpm(rex_sv);
7770 	    rex = ReANY(rex_sv);
7771 	    rexi = RXi_GET(rex);
7772             {
7773                 /* preserve $^R across LEAVE's. See Bug 121070. */
7774                 SV *save_sv= GvSV(PL_replgv);
7775                 SV *replsv;
7776                 SvREFCNT_inc(save_sv);
7777                 regcpblow(ST.cp); /* LEAVE in disguise */
7778                 /* don't move this initialization up */
7779                 replsv = GvSV(PL_replgv);
7780                 sv_setsv(replsv, save_sv);
7781                 SvSETMAGIC(replsv);
7782                 SvREFCNT_dec(save_sv);
7783             }
7784 	    cur_eval = ST.prev_eval;
7785 	    cur_curlyx = ST.prev_curlyx;
7786 
7787 	    /* Invalidate cache. See "invalidate" comment above. */
7788 	    reginfo->poscache_maxiter = 0;
7789             if ( nochange_depth )
7790 	        nochange_depth--;
7791 
7792             SET_RECURSE_LOCINPUT("EVAL_AB[after]", cur_eval->locinput);
7793 	    sayYES;
7794 
7795 
7796 	case EVAL_B_fail: /* unsuccessful B in (?{...})B */
7797 	    REGCP_UNWIND(ST.lastcp);
7798             sayNO;
7799 
7800 	case EVAL_postponed_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
7801 	    /* note: this is called twice; first after popping B, then A */
7802             DEBUG_STACK_r({
7803                 Perl_re_exec_indentf( aTHX_  "EVAL_AB_fail cur_eval=%p prev_eval=%p\n",
7804                     depth, cur_eval, ST.prev_eval);
7805             });
7806 
7807             SET_RECURSE_LOCINPUT("EVAL_AB_fail[before]", CUR_EVAL.prev_recurse_locinput);
7808 
7809 	    rex_sv = ST.prev_rex;
7810             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
7811 	    SET_reg_curpm(rex_sv);
7812 	    rex = ReANY(rex_sv);
7813 	    rexi = RXi_GET(rex);
7814 
7815 	    REGCP_UNWIND(ST.lastcp);
7816             regcppop(rex, &maxopenparen);
7817 	    cur_eval = ST.prev_eval;
7818 	    cur_curlyx = ST.prev_curlyx;
7819 
7820 	    /* Invalidate cache. See "invalidate" comment above. */
7821 	    reginfo->poscache_maxiter = 0;
7822 	    if ( nochange_depth )
7823 	        nochange_depth--;
7824 
7825             SET_RECURSE_LOCINPUT("EVAL_AB_fail[after]", cur_eval->locinput);
7826             sayNO_SILENT;
7827 #undef ST
7828 
7829 	case OPEN: /*  (  */
7830 	    n = ARG(scan);  /* which paren pair */
7831 	    rex->offs[n].start_tmp = locinput - reginfo->strbeg;
7832 	    if (n > maxopenparen)
7833 		maxopenparen = n;
7834             DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
7835 		"OPEN: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf " tmp; maxopenparen=%" UVuf "\n",
7836                 depth,
7837 		PTR2UV(rex),
7838 		PTR2UV(rex->offs),
7839 		(UV)n,
7840 		(IV)rex->offs[n].start_tmp,
7841 		(UV)maxopenparen
7842 	    ));
7843             lastopen = n;
7844 	    break;
7845 
7846         case SROPEN: /*  (*SCRIPT_RUN:  */
7847             script_run_begin = (U8 *) locinput;
7848             break;
7849 
7850 
7851 	case CLOSE:  /*  )  */
7852 	    n = ARG(scan);  /* which paren pair */
7853 	    CLOSE_CAPTURE(n, rex->offs[n].start_tmp,
7854                              locinput - reginfo->strbeg);
7855             if ( EVAL_CLOSE_PAREN_IS( cur_eval, n ) )
7856 	        goto fake_end;
7857 
7858 	    break;
7859 
7860         case SRCLOSE:  /*  (*SCRIPT_RUN: ... )   */
7861 
7862             if (! isSCRIPT_RUN(script_run_begin, (U8 *) locinput, utf8_target))
7863             {
7864                 sayNO;
7865             }
7866 
7867             break;
7868 
7869 
7870         case ACCEPT:  /*  (*ACCEPT)  */
7871             if (scan->flags)
7872                 sv_yes_mark = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
7873             if (ARG2L(scan)){
7874                 regnode *cursor;
7875                 for (cursor=scan;
7876                      cursor && OP(cursor)!=END;
7877                      cursor=regnext(cursor))
7878                 {
7879                     if ( OP(cursor)==CLOSE ){
7880                         n = ARG(cursor);
7881                         if ( n <= lastopen ) {
7882 			    CLOSE_CAPTURE(n, rex->offs[n].start_tmp,
7883                                              locinput - reginfo->strbeg);
7884                             if ( n == ARG(scan) || EVAL_CLOSE_PAREN_IS(cur_eval, n) )
7885                                 break;
7886                         }
7887                     }
7888                 }
7889             }
7890 	    goto fake_end;
7891 	    /* NOTREACHED */
7892 
7893 	case GROUPP:  /*  (?(1))  */
7894 	    n = ARG(scan);  /* which paren pair */
7895 	    sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
7896 	    break;
7897 
7898 	case GROUPPN:  /*  (?(<name>))  */
7899 	    /* reg_check_named_buff_matched returns 0 for no match */
7900 	    sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
7901 	    break;
7902 
7903         case INSUBP:   /*  (?(R))  */
7904             n = ARG(scan);
7905             /* this does not need to use EVAL_CLOSE_PAREN macros, as the arg
7906              * of SCAN is already set up as matches a eval.close_paren */
7907             sw = cur_eval && (n == 0 || CUR_EVAL.close_paren == n);
7908             break;
7909 
7910         case DEFINEP:  /*  (?(DEFINE))  */
7911             sw = 0;
7912             break;
7913 
7914 	case IFTHEN:   /*  (?(cond)A|B)  */
7915 	    reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
7916 	    if (sw)
7917 		next = NEXTOPER(NEXTOPER(scan));
7918 	    else {
7919 		next = scan + ARG(scan);
7920 		if (OP(next) == IFTHEN) /* Fake one. */
7921 		    next = NEXTOPER(NEXTOPER(next));
7922 	    }
7923 	    break;
7924 
7925 	case LOGICAL:  /* modifier for EVAL and IFMATCH */
7926 	    logical = scan->flags;
7927 	    break;
7928 
7929 /*******************************************************************
7930 
7931 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
7932 pattern, where A and B are subpatterns. (For simple A, CURLYM or
7933 STAR/PLUS/CURLY/CURLYN are used instead.)
7934 
7935 A*B is compiled as <CURLYX><A><WHILEM><B>
7936 
7937 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
7938 state, which contains the current count, initialised to -1. It also sets
7939 cur_curlyx to point to this state, with any previous value saved in the
7940 state block.
7941 
7942 CURLYX then jumps straight to the WHILEM op, rather than executing A,
7943 since the pattern may possibly match zero times (i.e. it's a while {} loop
7944 rather than a do {} while loop).
7945 
7946 Each entry to WHILEM represents a successful match of A. The count in the
7947 CURLYX block is incremented, another WHILEM state is pushed, and execution
7948 passes to A or B depending on greediness and the current count.
7949 
7950 For example, if matching against the string a1a2a3b (where the aN are
7951 substrings that match /A/), then the match progresses as follows: (the
7952 pushed states are interspersed with the bits of strings matched so far):
7953 
7954     <CURLYX cnt=-1>
7955     <CURLYX cnt=0><WHILEM>
7956     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
7957     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
7958     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
7959     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
7960 
7961 (Contrast this with something like CURLYM, which maintains only a single
7962 backtrack state:
7963 
7964     <CURLYM cnt=0> a1
7965     a1 <CURLYM cnt=1> a2
7966     a1 a2 <CURLYM cnt=2> a3
7967     a1 a2 a3 <CURLYM cnt=3> b
7968 )
7969 
7970 Each WHILEM state block marks a point to backtrack to upon partial failure
7971 of A or B, and also contains some minor state data related to that
7972 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
7973 overall state, such as the count, and pointers to the A and B ops.
7974 
7975 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
7976 must always point to the *current* CURLYX block, the rules are:
7977 
7978 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
7979 and set cur_curlyx to point the new block.
7980 
7981 When popping the CURLYX block after a successful or unsuccessful match,
7982 restore the previous cur_curlyx.
7983 
7984 When WHILEM is about to execute B, save the current cur_curlyx, and set it
7985 to the outer one saved in the CURLYX block.
7986 
7987 When popping the WHILEM block after a successful or unsuccessful B match,
7988 restore the previous cur_curlyx.
7989 
7990 Here's an example for the pattern (AI* BI)*BO
7991 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
7992 
7993 cur_
7994 curlyx backtrack stack
7995 ------ ---------------
7996 NULL
7997 CO     <CO prev=NULL> <WO>
7998 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
7999 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
8000 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
8001 
8002 At this point the pattern succeeds, and we work back down the stack to
8003 clean up, restoring as we go:
8004 
8005 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
8006 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
8007 CO     <CO prev=NULL> <WO>
8008 NULL
8009 
8010 *******************************************************************/
8011 
8012 #define ST st->u.curlyx
8013 
8014 	case CURLYX:    /* start of /A*B/  (for complex A) */
8015 	{
8016 	    /* No need to save/restore up to this paren */
8017 	    I32 parenfloor = scan->flags;
8018 
8019 	    assert(next); /* keep Coverity happy */
8020 	    if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
8021 		next += ARG(next);
8022 
8023 	    /* XXXX Probably it is better to teach regpush to support
8024 	       parenfloor > maxopenparen ... */
8025 	    if (parenfloor > (I32)rex->lastparen)
8026 		parenfloor = rex->lastparen; /* Pessimization... */
8027 
8028 	    ST.prev_curlyx= cur_curlyx;
8029 	    cur_curlyx = st;
8030 	    ST.cp = PL_savestack_ix;
8031 
8032 	    /* these fields contain the state of the current curly.
8033 	     * they are accessed by subsequent WHILEMs */
8034 	    ST.parenfloor = parenfloor;
8035 	    ST.me = scan;
8036 	    ST.B = next;
8037 	    ST.minmod = minmod;
8038 	    minmod = 0;
8039 	    ST.count = -1;	/* this will be updated by WHILEM */
8040 	    ST.lastloc = NULL;  /* this will be updated by WHILEM */
8041 
8042 	    PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput, loceol,
8043                                 script_run_begin);
8044 	    NOT_REACHED; /* NOTREACHED */
8045 	}
8046 
8047 	case CURLYX_end: /* just finished matching all of A*B */
8048 	    cur_curlyx = ST.prev_curlyx;
8049 	    sayYES;
8050 	    NOT_REACHED; /* NOTREACHED */
8051 
8052 	case CURLYX_end_fail: /* just failed to match all of A*B */
8053 	    regcpblow(ST.cp);
8054 	    cur_curlyx = ST.prev_curlyx;
8055 	    sayNO;
8056 	    NOT_REACHED; /* NOTREACHED */
8057 
8058 
8059 #undef ST
8060 #define ST st->u.whilem
8061 
8062 	case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
8063 	{
8064 	    /* see the discussion above about CURLYX/WHILEM */
8065 	    I32 n;
8066 	    int min, max;
8067 	    regnode *A;
8068 
8069 	    assert(cur_curlyx); /* keep Coverity happy */
8070 
8071 	    min = ARG1(cur_curlyx->u.curlyx.me);
8072 	    max = ARG2(cur_curlyx->u.curlyx.me);
8073 	    A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
8074 	    n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
8075 	    ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
8076 	    ST.cache_offset = 0;
8077 	    ST.cache_mask = 0;
8078 
8079 
8080             DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "WHILEM: matched %ld out of %d..%d\n",
8081                   depth, (long)n, min, max)
8082 	    );
8083 
8084 	    /* First just match a string of min A's. */
8085 
8086 	    if (n < min) {
8087                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, maxopenparen);
8088 		cur_curlyx->u.curlyx.lastloc = locinput;
8089 		REGCP_SET(ST.lastcp);
8090 
8091 		PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput, loceol,
8092                                 script_run_begin);
8093 		NOT_REACHED; /* NOTREACHED */
8094 	    }
8095 
8096 	    /* If degenerate A matches "", assume A done. */
8097 
8098 	    if (locinput == cur_curlyx->u.curlyx.lastloc) {
8099                 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "WHILEM: empty match detected, trying continuation...\n",
8100                    depth)
8101 		);
8102 		goto do_whilem_B_max;
8103 	    }
8104 
8105 	    /* super-linear cache processing.
8106              *
8107              * The idea here is that for certain types of CURLYX/WHILEM -
8108              * principally those whose upper bound is infinity (and
8109              * excluding regexes that have things like \1 and other very
8110              * non-regular expresssiony things), then if a pattern like
8111              * /....A*.../ fails and we backtrack to the WHILEM, then we
8112              * make a note that this particular WHILEM op was at string
8113              * position 47 (say) when the rest of pattern failed. Then, if
8114              * we ever find ourselves back at that WHILEM, and at string
8115              * position 47 again, we can just fail immediately rather than
8116              * running the rest of the pattern again.
8117              *
8118              * This is very handy when patterns start to go
8119              * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
8120              * with a combinatorial explosion of backtracking.
8121              *
8122              * The cache is implemented as a bit array, with one bit per
8123              * string byte position per WHILEM op (up to 16) - so its
8124              * between 0.25 and 2x the string size.
8125              *
8126              * To avoid allocating a poscache buffer every time, we do an
8127              * initially countdown; only after we have  executed a WHILEM
8128              * op (string-length x #WHILEMs) times do we allocate the
8129              * cache.
8130              *
8131              * The top 4 bits of scan->flags byte say how many different
8132              * relevant CURLLYX/WHILEM op pairs there are, while the
8133              * bottom 4-bits is the identifying index number of this
8134              * WHILEM.
8135              */
8136 
8137 	    if (scan->flags) {
8138 
8139 		if (!reginfo->poscache_maxiter) {
8140 		    /* start the countdown: Postpone detection until we
8141 		     * know the match is not *that* much linear. */
8142 		    reginfo->poscache_maxiter
8143                         =    (reginfo->strend - reginfo->strbeg + 1)
8144                            * (scan->flags>>4);
8145 		    /* possible overflow for long strings and many CURLYX's */
8146 		    if (reginfo->poscache_maxiter < 0)
8147 			reginfo->poscache_maxiter = I32_MAX;
8148 		    reginfo->poscache_iter = reginfo->poscache_maxiter;
8149 		}
8150 
8151 		if (reginfo->poscache_iter-- == 0) {
8152 		    /* initialise cache */
8153 		    const SSize_t size = (reginfo->poscache_maxiter + 7)/8;
8154                     regmatch_info_aux *const aux = reginfo->info_aux;
8155 		    if (aux->poscache) {
8156 			if ((SSize_t)reginfo->poscache_size < size) {
8157 			    Renew(aux->poscache, size, char);
8158 			    reginfo->poscache_size = size;
8159 			}
8160 			Zero(aux->poscache, size, char);
8161 		    }
8162 		    else {
8163 			reginfo->poscache_size = size;
8164 			Newxz(aux->poscache, size, char);
8165 		    }
8166                     DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
8167       "%sWHILEM: Detected a super-linear match, switching on caching%s...\n",
8168 			      PL_colors[4], PL_colors[5])
8169 		    );
8170 		}
8171 
8172 		if (reginfo->poscache_iter < 0) {
8173 		    /* have we already failed at this position? */
8174 		    SSize_t offset, mask;
8175 
8176                     reginfo->poscache_iter = -1; /* stop eventual underflow */
8177 		    offset  = (scan->flags & 0xf) - 1
8178                                 +   (locinput - reginfo->strbeg)
8179                                   * (scan->flags>>4);
8180 		    mask    = 1 << (offset % 8);
8181 		    offset /= 8;
8182 		    if (reginfo->info_aux->poscache[offset] & mask) {
8183                         DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "WHILEM: (cache) already tried at this position...\n",
8184                             depth)
8185 			);
8186                         cur_curlyx->u.curlyx.count--;
8187 			sayNO; /* cache records failure */
8188 		    }
8189 		    ST.cache_offset = offset;
8190 		    ST.cache_mask   = mask;
8191 		}
8192 	    }
8193 
8194 	    /* Prefer B over A for minimal matching. */
8195 
8196 	    if (cur_curlyx->u.curlyx.minmod) {
8197 		ST.save_curlyx = cur_curlyx;
8198 		cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
8199 		PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
8200                                     locinput, loceol, script_run_begin);
8201 		NOT_REACHED; /* NOTREACHED */
8202 	    }
8203 
8204 	    /* Prefer A over B for maximal matching. */
8205 
8206 	    if (n < max) { /* More greed allowed? */
8207                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
8208                             maxopenparen);
8209 		cur_curlyx->u.curlyx.lastloc = locinput;
8210 		REGCP_SET(ST.lastcp);
8211 		PUSH_STATE_GOTO(WHILEM_A_max, A, locinput, loceol,
8212                                 script_run_begin);
8213 		NOT_REACHED; /* NOTREACHED */
8214 	    }
8215 	    goto do_whilem_B_max;
8216 	}
8217 	NOT_REACHED; /* NOTREACHED */
8218 
8219 	case WHILEM_B_min: /* just matched B in a minimal match */
8220 	case WHILEM_B_max: /* just matched B in a maximal match */
8221 	    cur_curlyx = ST.save_curlyx;
8222 	    sayYES;
8223 	    NOT_REACHED; /* NOTREACHED */
8224 
8225 	case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
8226 	    cur_curlyx = ST.save_curlyx;
8227 	    cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
8228 	    cur_curlyx->u.curlyx.count--;
8229 	    CACHEsayNO;
8230 	    NOT_REACHED; /* NOTREACHED */
8231 
8232 	case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
8233 	    /* FALLTHROUGH */
8234 	case WHILEM_A_pre_fail: /* just failed to match even minimal A */
8235 	    REGCP_UNWIND(ST.lastcp);
8236             regcppop(rex, &maxopenparen);
8237 	    cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
8238 	    cur_curlyx->u.curlyx.count--;
8239 	    CACHEsayNO;
8240 	    NOT_REACHED; /* NOTREACHED */
8241 
8242 	case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
8243 	    REGCP_UNWIND(ST.lastcp);
8244             regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
8245             DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_  "WHILEM: failed, trying continuation...\n",
8246                 depth)
8247 	    );
8248 	  do_whilem_B_max:
8249 	    if (cur_curlyx->u.curlyx.count >= REG_INFTY
8250 		&& ckWARN(WARN_REGEXP)
8251 		&& !reginfo->warned)
8252 	    {
8253                 reginfo->warned	= TRUE;
8254 		Perl_warner(aTHX_ packWARN(WARN_REGEXP),
8255 		     "Complex regular subexpression recursion limit (%d) "
8256 		     "exceeded",
8257 		     REG_INFTY - 1);
8258 	    }
8259 
8260 	    /* now try B */
8261 	    ST.save_curlyx = cur_curlyx;
8262 	    cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
8263 	    PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
8264                                 locinput, loceol, script_run_begin);
8265 	    NOT_REACHED; /* NOTREACHED */
8266 
8267 	case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
8268 	    cur_curlyx = ST.save_curlyx;
8269 
8270 	    if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
8271 		/* Maximum greed exceeded */
8272 		if (cur_curlyx->u.curlyx.count >= REG_INFTY
8273 		    && ckWARN(WARN_REGEXP)
8274                     && !reginfo->warned)
8275 		{
8276                     reginfo->warned	= TRUE;
8277 		    Perl_warner(aTHX_ packWARN(WARN_REGEXP),
8278 			"Complex regular subexpression recursion "
8279 			"limit (%d) exceeded",
8280 			REG_INFTY - 1);
8281 		}
8282 		cur_curlyx->u.curlyx.count--;
8283 		CACHEsayNO;
8284 	    }
8285 
8286             DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_  "WHILEM: B min fail: trying longer...\n", depth)
8287 	    );
8288 	    /* Try grabbing another A and see if it helps. */
8289 	    cur_curlyx->u.curlyx.lastloc = locinput;
8290             ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
8291                             maxopenparen);
8292 	    REGCP_SET(ST.lastcp);
8293 	    PUSH_STATE_GOTO(WHILEM_A_min,
8294 		/*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
8295                 locinput, loceol, script_run_begin);
8296 	    NOT_REACHED; /* NOTREACHED */
8297 
8298 #undef  ST
8299 #define ST st->u.branch
8300 
8301 	case BRANCHJ:	    /*  /(...|A|...)/ with long next pointer */
8302 	    next = scan + ARG(scan);
8303 	    if (next == scan)
8304 		next = NULL;
8305 	    scan = NEXTOPER(scan);
8306 	    /* FALLTHROUGH */
8307 
8308 	case BRANCH:	    /*  /(...|A|...)/ */
8309 	    scan = NEXTOPER(scan); /* scan now points to inner node */
8310 	    ST.lastparen = rex->lastparen;
8311 	    ST.lastcloseparen = rex->lastcloseparen;
8312 	    ST.next_branch = next;
8313 	    REGCP_SET(ST.cp);
8314 
8315 	    /* Now go into the branch */
8316 	    if (has_cutgroup) {
8317 	        PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput, loceol,
8318                                     script_run_begin);
8319 	    } else {
8320 	        PUSH_STATE_GOTO(BRANCH_next, scan, locinput, loceol,
8321                                 script_run_begin);
8322 	    }
8323 	    NOT_REACHED; /* NOTREACHED */
8324 
8325         case CUTGROUP:  /*  /(*THEN)/  */
8326             sv_yes_mark = st->u.mark.mark_name = scan->flags
8327                 ? MUTABLE_SV(rexi->data->data[ ARG( scan ) ])
8328                 : NULL;
8329             PUSH_STATE_GOTO(CUTGROUP_next, next, locinput, loceol,
8330                             script_run_begin);
8331             NOT_REACHED; /* NOTREACHED */
8332 
8333         case CUTGROUP_next_fail:
8334             do_cutgroup = 1;
8335             no_final = 1;
8336             if (st->u.mark.mark_name)
8337                 sv_commit = st->u.mark.mark_name;
8338             sayNO;
8339             NOT_REACHED; /* NOTREACHED */
8340 
8341         case BRANCH_next:
8342             sayYES;
8343             NOT_REACHED; /* NOTREACHED */
8344 
8345 	case BRANCH_next_fail: /* that branch failed; try the next, if any */
8346 	    if (do_cutgroup) {
8347 	        do_cutgroup = 0;
8348 	        no_final = 0;
8349 	    }
8350 	    REGCP_UNWIND(ST.cp);
8351             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8352 	    scan = ST.next_branch;
8353 	    /* no more branches? */
8354 	    if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
8355 	        DEBUG_EXECUTE_r({
8356                     Perl_re_exec_indentf( aTHX_  "%sBRANCH failed...%s\n",
8357                         depth,
8358 			PL_colors[4],
8359 			PL_colors[5] );
8360 		});
8361 		sayNO_SILENT;
8362             }
8363 	    continue; /* execute next BRANCH[J] op */
8364             /* NOTREACHED */
8365 
8366 	case MINMOD: /* next op will be non-greedy, e.g. A*?  */
8367 	    minmod = 1;
8368 	    break;
8369 
8370 #undef  ST
8371 #define ST st->u.curlym
8372 
8373 	case CURLYM:	/* /A{m,n}B/ where A is fixed-length */
8374 
8375 	    /* This is an optimisation of CURLYX that enables us to push
8376 	     * only a single backtracking state, no matter how many matches
8377 	     * there are in {m,n}. It relies on the pattern being constant
8378 	     * length, with no parens to influence future backrefs
8379 	     */
8380 
8381 	    ST.me = scan;
8382 	    scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
8383 
8384 	    ST.lastparen      = rex->lastparen;
8385 	    ST.lastcloseparen = rex->lastcloseparen;
8386 
8387 	    /* if paren positive, emulate an OPEN/CLOSE around A */
8388 	    if (ST.me->flags) {
8389 		U32 paren = ST.me->flags;
8390 		if (paren > maxopenparen)
8391 		    maxopenparen = paren;
8392 		scan += NEXT_OFF(scan); /* Skip former OPEN. */
8393 	    }
8394 	    ST.A = scan;
8395 	    ST.B = next;
8396 	    ST.alen = 0;
8397 	    ST.count = 0;
8398 	    ST.minmod = minmod;
8399 	    minmod = 0;
8400 	    ST.c1 = CHRTEST_UNINIT;
8401 	    REGCP_SET(ST.cp);
8402 
8403 	    if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
8404 		goto curlym_do_B;
8405 
8406 	  curlym_do_A: /* execute the A in /A{m,n}B/  */
8407 	    PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput, loceol, /* match A */
8408                                 script_run_begin);
8409 	    NOT_REACHED; /* NOTREACHED */
8410 
8411 	case CURLYM_A: /* we've just matched an A */
8412 	    ST.count++;
8413 	    /* after first match, determine A's length: u.curlym.alen */
8414 	    if (ST.count == 1) {
8415 		if (reginfo->is_utf8_target) {
8416 		    char *s = st->locinput;
8417 		    while (s < locinput) {
8418 			ST.alen++;
8419 			s += UTF8SKIP(s);
8420 		    }
8421 		}
8422 		else {
8423 		    ST.alen = locinput - st->locinput;
8424 		}
8425 		if (ST.alen == 0)
8426 		    ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
8427 	    }
8428 	    DEBUG_EXECUTE_r(
8429                 Perl_re_exec_indentf( aTHX_  "CURLYM now matched %" IVdf " times, len=%" IVdf "...\n",
8430                           depth, (IV) ST.count, (IV)ST.alen)
8431 	    );
8432 
8433             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
8434 	        goto fake_end;
8435 
8436 	    {
8437 		I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
8438 		if ( max == REG_INFTY || ST.count < max )
8439 		    goto curlym_do_A; /* try to match another A */
8440 	    }
8441 	    goto curlym_do_B; /* try to match B */
8442 
8443 	case CURLYM_A_fail: /* just failed to match an A */
8444 	    REGCP_UNWIND(ST.cp);
8445 
8446 
8447 	    if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
8448                 || EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
8449 		sayNO;
8450 
8451 	  curlym_do_B: /* execute the B in /A{m,n}B/  */
8452 	    if (ST.c1 == CHRTEST_UNINIT) {
8453 		/* calculate c1 and c2 for possible match of 1st char
8454 		 * following curly */
8455 		ST.c1 = ST.c2 = CHRTEST_VOID;
8456                 assert(ST.B);
8457 		if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
8458 		    regnode *text_node = ST.B;
8459 		    if (! HAS_TEXT(text_node))
8460 			FIND_NEXT_IMPT(text_node);
8461 		    if (PL_regkind[OP(text_node)] == EXACT) {
8462                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
8463                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
8464                            reginfo))
8465                         {
8466                             sayNO;
8467                         }
8468 		    }
8469 		}
8470 	    }
8471 
8472 	    DEBUG_EXECUTE_r(
8473                 Perl_re_exec_indentf( aTHX_  "CURLYM trying tail with matches=%" IVdf "...\n",
8474                     depth, (IV)ST.count)
8475 		);
8476 	    if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
8477                 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
8478 
8479                            /* (We can use memEQ and memNE in this file without
8480                             * having to worry about one being shorter than the
8481                             * other, since the first byte of each gives the
8482                             * length of the character) */
8483                     if (   memNE(locinput, ST.c1_utf8, UTF8_SAFE_SKIP(locinput,
8484                                                               reginfo->strend))
8485                         && memNE(locinput, ST.c2_utf8, UTF8_SAFE_SKIP(locinput,
8486                                                              reginfo->strend)))
8487                     {
8488                         /* simulate B failing */
8489                         DEBUG_OPTIMISE_r(
8490                             Perl_re_exec_indentf( aTHX_  "CURLYM Fast bail next target=0x%" UVXf " c1=0x%" UVXf " c2=0x%" UVXf "\n",
8491                                 depth,
8492                                 valid_utf8_to_uvchr((U8 *) locinput, NULL),
8493                                 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
8494                                 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
8495                         );
8496                         state_num = CURLYM_B_fail;
8497                         goto reenter_switch;
8498                     }
8499                 }
8500                 else if (nextchr != ST.c1 && nextchr != ST.c2) {
8501                     /* simulate B failing */
8502                     DEBUG_OPTIMISE_r(
8503                         Perl_re_exec_indentf( aTHX_  "CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n",
8504                             depth,
8505                             (int) nextchr, ST.c1, ST.c2)
8506                     );
8507                     state_num = CURLYM_B_fail;
8508                     goto reenter_switch;
8509                 }
8510             }
8511 
8512 	    if (ST.me->flags) {
8513 		/* emulate CLOSE: mark current A as captured */
8514 		U32 paren = (U32)ST.me->flags;
8515 		if (ST.count) {
8516                     CLOSE_CAPTURE(paren,
8517 			HOPc(locinput, -ST.alen) - reginfo->strbeg,
8518 		        locinput - reginfo->strbeg);
8519 		}
8520 		else
8521 		    rex->offs[paren].end = -1;
8522 
8523                 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
8524 		{
8525 		    if (ST.count)
8526 	                goto fake_end;
8527 	            else
8528 	                sayNO;
8529 	        }
8530 	    }
8531 
8532 	    PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput, loceol,   /* match B */
8533                             script_run_begin);
8534 	    NOT_REACHED; /* NOTREACHED */
8535 
8536 	case CURLYM_B_fail: /* just failed to match a B */
8537 	    REGCP_UNWIND(ST.cp);
8538             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8539 	    if (ST.minmod) {
8540 		I32 max = ARG2(ST.me);
8541 		if (max != REG_INFTY && ST.count == max)
8542 		    sayNO;
8543 		goto curlym_do_A; /* try to match a further A */
8544 	    }
8545 	    /* backtrack one A */
8546 	    if (ST.count == ARG1(ST.me) /* min */)
8547 		sayNO;
8548 	    ST.count--;
8549 	    SET_locinput(HOPc(locinput, -ST.alen));
8550 	    goto curlym_do_B; /* try to match B */
8551 
8552 #undef ST
8553 #define ST st->u.curly
8554 
8555 #define CURLY_SETPAREN(paren, success) \
8556     if (paren) { \
8557 	if (success) { \
8558             CLOSE_CAPTURE(paren, HOPc(locinput, -1) - reginfo->strbeg, \
8559 	                         locinput - reginfo->strbeg); \
8560 	} \
8561 	else { \
8562 	    rex->offs[paren].end = -1; \
8563 	    rex->lastparen      = ST.lastparen; \
8564 	    rex->lastcloseparen = ST.lastcloseparen; \
8565 	} \
8566     }
8567 
8568         case STAR:		/*  /A*B/ where A is width 1 char */
8569 	    ST.paren = 0;
8570 	    ST.min = 0;
8571 	    ST.max = REG_INFTY;
8572 	    scan = NEXTOPER(scan);
8573 	    goto repeat;
8574 
8575         case PLUS:		/*  /A+B/ where A is width 1 char */
8576 	    ST.paren = 0;
8577 	    ST.min = 1;
8578 	    ST.max = REG_INFTY;
8579 	    scan = NEXTOPER(scan);
8580 	    goto repeat;
8581 
8582 	case CURLYN:		/*  /(A){m,n}B/ where A is width 1 char */
8583             ST.paren = scan->flags;	/* Which paren to set */
8584             ST.lastparen      = rex->lastparen;
8585 	    ST.lastcloseparen = rex->lastcloseparen;
8586 	    if (ST.paren > maxopenparen)
8587 		maxopenparen = ST.paren;
8588 	    ST.min = ARG1(scan);  /* min to match */
8589 	    ST.max = ARG2(scan);  /* max to match */
8590             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
8591 
8592             /* handle the single-char capture called as a GOSUB etc */
8593             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
8594             {
8595                 char *li = locinput;
8596                 if (!regrepeat(rex, &li, scan, loceol, reginfo, 1))
8597 		    sayNO;
8598                 SET_locinput(li);
8599                 goto fake_end;
8600 	    }
8601 
8602 	    goto repeat;
8603 
8604 	case CURLY:		/*  /A{m,n}B/ where A is width 1 char */
8605 	    ST.paren = 0;
8606 	    ST.min = ARG1(scan);  /* min to match */
8607 	    ST.max = ARG2(scan);  /* max to match */
8608 	    scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
8609 	  repeat:
8610 	    /*
8611 	    * Lookahead to avoid useless match attempts
8612 	    * when we know what character comes next.
8613 	    *
8614 	    * Used to only do .*x and .*?x, but now it allows
8615 	    * for )'s, ('s and (?{ ... })'s to be in the way
8616 	    * of the quantifier and the EXACT-like node.  -- japhy
8617 	    */
8618 
8619 	    assert(ST.min <= ST.max);
8620             if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
8621                 ST.c1 = ST.c2 = CHRTEST_VOID;
8622             }
8623             else {
8624 		regnode *text_node = next;
8625 
8626 		if (! HAS_TEXT(text_node))
8627 		    FIND_NEXT_IMPT(text_node);
8628 
8629 		if (! HAS_TEXT(text_node))
8630 		    ST.c1 = ST.c2 = CHRTEST_VOID;
8631 		else {
8632 		    if ( PL_regkind[OP(text_node)] != EXACT ) {
8633 			ST.c1 = ST.c2 = CHRTEST_VOID;
8634 		    }
8635 		    else {
8636                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
8637                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
8638                            reginfo))
8639                         {
8640                             sayNO;
8641                         }
8642                     }
8643 		}
8644 	    }
8645 
8646 	    ST.A = scan;
8647 	    ST.B = next;
8648 	    if (minmod) {
8649                 char *li = locinput;
8650 		minmod = 0;
8651 		if (ST.min &&
8652                         regrepeat(rex, &li, ST.A, loceol, reginfo, ST.min)
8653                             < ST.min)
8654 		    sayNO;
8655                 SET_locinput(li);
8656 		ST.count = ST.min;
8657 		REGCP_SET(ST.cp);
8658 		if (ST.c1 == CHRTEST_VOID)
8659 		    goto curly_try_B_min;
8660 
8661 		ST.oldloc = locinput;
8662 
8663 		/* set ST.maxpos to the furthest point along the
8664 		 * string that could possibly match */
8665 		if  (ST.max == REG_INFTY) {
8666 		    ST.maxpos = loceol - 1;
8667 		    if (utf8_target)
8668 			while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
8669 			    ST.maxpos--;
8670 		}
8671 		else if (utf8_target) {
8672 		    int m = ST.max - ST.min;
8673 		    for (ST.maxpos = locinput;
8674 			 m >0 && ST.maxpos <  loceol; m--)
8675 			ST.maxpos += UTF8SKIP(ST.maxpos);
8676 		}
8677 		else {
8678 		    ST.maxpos = locinput + ST.max - ST.min;
8679 		    if (ST.maxpos >=  loceol)
8680 			ST.maxpos =  loceol - 1;
8681 		}
8682 		goto curly_try_B_min_known;
8683 
8684 	    }
8685 	    else {
8686                 /* avoid taking address of locinput, so it can remain
8687                  * a register var */
8688                 char *li = locinput;
8689                 ST.count = regrepeat(rex, &li, ST.A, loceol, reginfo, ST.max);
8690 		if (ST.count < ST.min)
8691 		    sayNO;
8692                 SET_locinput(li);
8693 		if ((ST.count > ST.min)
8694 		    && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
8695 		{
8696 		    /* A{m,n} must come at the end of the string, there's
8697 		     * no point in backing off ... */
8698 		    ST.min = ST.count;
8699 		    /* ...except that $ and \Z can match before *and* after
8700 		       newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
8701 		       We may back off by one in this case. */
8702 		    if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
8703 			ST.min--;
8704 		}
8705 		REGCP_SET(ST.cp);
8706 		goto curly_try_B_max;
8707 	    }
8708 	    NOT_REACHED; /* NOTREACHED */
8709 
8710 	case CURLY_B_min_fail:
8711 	    /* failed to find B in a non-greedy match.
8712              * Handles both cases where c1,c2 valid or not */
8713 
8714 	    REGCP_UNWIND(ST.cp);
8715             if (ST.paren) {
8716                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8717             }
8718 
8719             if (ST.c1 == CHRTEST_VOID) {
8720                 /* failed -- move forward one */
8721                 char *li = locinput;
8722                 if (!regrepeat(rex, &li, ST.A, loceol, reginfo, 1)) {
8723                     sayNO;
8724                 }
8725                 locinput = li;
8726                 ST.count++;
8727 		if (!(   ST.count <= ST.max
8728                         /* count overflow ? */
8729                      || (ST.max == REG_INFTY && ST.count > 0))
8730                 )
8731                     sayNO;
8732             }
8733             else {
8734 		int n;
8735                 /* Couldn't or didn't -- move forward. */
8736                 ST.oldloc = locinput;
8737                 if (utf8_target)
8738                     locinput += UTF8SKIP(locinput);
8739                 else
8740                     locinput++;
8741                 ST.count++;
8742 
8743               curly_try_B_min_known:
8744                 /* find the next place where 'B' could work, then call B */
8745 		if (utf8_target) {
8746 		    n = (ST.oldloc == locinput) ? 0 : 1;
8747 		    if (ST.c1 == ST.c2) {
8748 			/* set n to utf8_distance(oldloc, locinput) */
8749 			while (    locinput <= ST.maxpos
8750                                &&  locinput < loceol
8751                                &&  memNE(locinput, ST.c1_utf8,
8752                                     UTF8_SAFE_SKIP(locinput, reginfo->strend)))
8753                         {
8754 			    locinput += UTF8_SAFE_SKIP(locinput,
8755                                                        reginfo->strend);
8756 			    n++;
8757 			}
8758 		    }
8759 		    else {
8760 			/* set n to utf8_distance(oldloc, locinput) */
8761 			while (   locinput <= ST.maxpos
8762                                && locinput < loceol
8763                                && memNE(locinput, ST.c1_utf8,
8764                                      UTF8_SAFE_SKIP(locinput, reginfo->strend))
8765                                && memNE(locinput, ST.c2_utf8,
8766                                     UTF8_SAFE_SKIP(locinput, reginfo->strend)))
8767                         {
8768 			    locinput += UTF8_SAFE_SKIP(locinput, reginfo->strend);
8769 			    n++;
8770 			}
8771 		    }
8772 		}
8773 		else {  /* Not utf8_target */
8774 		    if (ST.c1 == ST.c2) {
8775                         locinput = (char *) memchr(locinput,
8776                                                    ST.c1,
8777                                                    ST.maxpos + 1 - locinput);
8778                         if (! locinput) {
8779                             locinput = ST.maxpos + 1;
8780                         }
8781 		    }
8782                     else {
8783                         U8 c1_c2_bits_differing = ST.c1 ^ ST.c2;
8784 
8785                         if (! isPOWER_OF_2(c1_c2_bits_differing)) {
8786                             while (   locinput <= ST.maxpos
8787                                    && UCHARAT(locinput) != ST.c1
8788                                    && UCHARAT(locinput) != ST.c2)
8789                             {
8790                                 locinput++;
8791                             }
8792                         }
8793                         else {
8794                             /* If c1 and c2 only differ by a single bit, we can
8795                              * avoid a conditional each time through the loop,
8796                              * at the expense of a little preliminary setup and
8797                              * an extra mask each iteration.  By masking out
8798                              * that bit, we match exactly two characters, c1
8799                              * and c2, and so we don't have to test for both.
8800                              * On both ASCII and EBCDIC platforms, most of the
8801                              * ASCII-range and Latin1-range folded equivalents
8802                              * differ only in a single bit, so this is actually
8803                              * the most common case. (e.g. 'A' 0x41 vs 'a'
8804                              * 0x61). */
8805                             U8 c1_masked = ST.c1 &~ c1_c2_bits_differing;
8806                             U8 c1_c2_mask = ~ c1_c2_bits_differing;
8807                             while (   locinput <= ST.maxpos
8808                                    && (UCHARAT(locinput) & c1_c2_mask)
8809                                                                 != c1_masked)
8810                             {
8811                                 locinput++;
8812                             }
8813                         }
8814                     }
8815 		    n = locinput - ST.oldloc;
8816 		}
8817 		if (locinput > ST.maxpos)
8818 		    sayNO;
8819 		if (n) {
8820                     /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
8821                      * at b; check that everything between oldloc and
8822                      * locinput matches */
8823                     char *li = ST.oldloc;
8824 		    ST.count += n;
8825                     if (regrepeat(rex, &li, ST.A, loceol, reginfo, n) < n)
8826 			sayNO;
8827                     assert(n == REG_INFTY || locinput == li);
8828 		}
8829 	    }
8830 
8831           curly_try_B_min:
8832             CURLY_SETPAREN(ST.paren, ST.count);
8833             PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput, loceol,
8834                             script_run_begin);
8835 	    NOT_REACHED; /* NOTREACHED */
8836 
8837 
8838           curly_try_B_max:
8839 	    /* a successful greedy match: now try to match B */
8840 	    {
8841 		bool could_match = locinput <  loceol;
8842 
8843 		/* If it could work, try it. */
8844                 if (ST.c1 != CHRTEST_VOID && could_match) {
8845                     if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
8846                     {
8847                         could_match =  memEQ(locinput, ST.c1_utf8,
8848                                              UTF8_SAFE_SKIP(locinput,
8849                                                             reginfo->strend))
8850                                     || memEQ(locinput, ST.c2_utf8,
8851                                              UTF8_SAFE_SKIP(locinput,
8852                                                             reginfo->strend));
8853                     }
8854                     else {
8855                         could_match =   UCHARAT(locinput) == ST.c1
8856                                      || UCHARAT(locinput) == ST.c2;
8857                     }
8858                 }
8859                 if (ST.c1 == CHRTEST_VOID || could_match) {
8860 		    CURLY_SETPAREN(ST.paren, ST.count);
8861 		    PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput, loceol,
8862                                     script_run_begin);
8863 		    NOT_REACHED; /* NOTREACHED */
8864 		}
8865 	    }
8866 	    /* FALLTHROUGH */
8867 
8868 	case CURLY_B_max_fail:
8869 	    /* failed to find B in a greedy match */
8870 
8871 	    REGCP_UNWIND(ST.cp);
8872             if (ST.paren) {
8873                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8874             }
8875 	    /*  back up. */
8876 	    if (--ST.count < ST.min)
8877 		sayNO;
8878 	    locinput = HOPc(locinput, -1);
8879 	    goto curly_try_B_max;
8880 
8881 #undef ST
8882 
8883 	case END: /*  last op of main pattern  */
8884           fake_end:
8885 	    if (cur_eval) {
8886 		/* we've just finished A in /(??{A})B/; now continue with B */
8887                 SET_RECURSE_LOCINPUT("FAKE-END[before]", CUR_EVAL.prev_recurse_locinput);
8888 		st->u.eval.prev_rex = rex_sv;		/* inner */
8889 
8890                 /* Save *all* the positions. */
8891                 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
8892                 rex_sv = CUR_EVAL.prev_rex;
8893 		is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
8894 		SET_reg_curpm(rex_sv);
8895 		rex = ReANY(rex_sv);
8896 		rexi = RXi_GET(rex);
8897 
8898                 st->u.eval.prev_curlyx = cur_curlyx;
8899                 cur_curlyx = CUR_EVAL.prev_curlyx;
8900 
8901 		REGCP_SET(st->u.eval.lastcp);
8902 
8903 		/* Restore parens of the outer rex without popping the
8904 		 * savestack */
8905                 regcp_restore(rex, CUR_EVAL.lastcp, &maxopenparen);
8906 
8907 		st->u.eval.prev_eval = cur_eval;
8908                 cur_eval = CUR_EVAL.prev_eval;
8909 		DEBUG_EXECUTE_r(
8910                     Perl_re_exec_indentf( aTHX_  "END: EVAL trying tail ... (cur_eval=%p)\n",
8911                                       depth, cur_eval););
8912                 if ( nochange_depth )
8913 	            nochange_depth--;
8914 
8915                 SET_RECURSE_LOCINPUT("FAKE-END[after]", cur_eval->locinput);
8916 
8917                 PUSH_YES_STATE_GOTO(EVAL_postponed_AB,          /* match B */
8918                                     st->u.eval.prev_eval->u.eval.B,
8919                                     locinput, loceol, script_run_begin);
8920 	    }
8921 
8922 	    if (locinput < reginfo->till) {
8923                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
8924                                       "%sEND: Match possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
8925 				      PL_colors[4],
8926 				      (long)(locinput - startpos),
8927 				      (long)(reginfo->till - startpos),
8928 				      PL_colors[5]));
8929 
8930 		sayNO_SILENT;		/* Cannot match: too short. */
8931 	    }
8932 	    sayYES;			/* Success! */
8933 
8934 	case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
8935 	    DEBUG_EXECUTE_r(
8936             Perl_re_exec_indentf( aTHX_  "%sSUCCEED: subpattern success...%s\n",
8937                 depth, PL_colors[4], PL_colors[5]));
8938 	    sayYES;			/* Success! */
8939 
8940 #undef  ST
8941 #define ST st->u.ifmatch
8942 
8943 	case SUSPEND:	/* (?>A) */
8944 	    ST.wanted = 1;
8945 	    ST.start = locinput;
8946 	    ST.end = loceol;
8947             ST.count = 1;
8948 	    goto do_ifmatch;
8949 
8950 	case UNLESSM:	/* -ve lookaround: (?!A), or with 'flags', (?<!A) */
8951 	    ST.wanted = 0;
8952 	    goto ifmatch_trivial_fail_test;
8953 
8954 	case IFMATCH:	/* +ve lookaround: (?=A), or with 'flags', (?<=A) */
8955 	    ST.wanted = 1;
8956 	  ifmatch_trivial_fail_test:
8957             ST.count = scan->next_off + 1; /* next_off repurposed to be
8958                                               lookbehind count, requires
8959                                               non-zero flags */
8960 	    if (! scan->flags) {    /* 'flags' zero means lookahed */
8961 
8962                 /* Lookahead starts here and ends at the normal place */
8963 		ST.start = locinput;
8964 		ST.end = loceol;
8965             }
8966 	    else {
8967                 PERL_UINT_FAST8_T back_count = scan->flags;
8968 		char * s;
8969 
8970                 /* Lookbehind can look beyond the current position */
8971 		ST.end = loceol;
8972 
8973                 /* ... and starts at the first place in the input that is in
8974                  * the range of the possible start positions */
8975                 for (; ST.count > 0; ST.count--, back_count--) {
8976                     s = HOPBACKc(locinput, back_count);
8977                     if (s) {
8978                         ST.start = s;
8979                         goto do_ifmatch;
8980                     }
8981                 }
8982 
8983                 /* If the lookbehind doesn't start in the actual string, is a
8984                  * trivial match failure */
8985                 if (logical) {
8986                     logical = 0;
8987                     sw = 1 - cBOOL(ST.wanted);
8988                 }
8989                 else if (ST.wanted)
8990                     sayNO;
8991 
8992                 /* Here, we didn't want it to match, so is actually success */
8993                 next = scan + ARG(scan);
8994                 if (next == scan)
8995                     next = NULL;
8996                 break;
8997 	    }
8998 
8999 	  do_ifmatch:
9000 	    ST.me = scan;
9001 	    ST.logical = logical;
9002 	    logical = 0; /* XXX: reset state of logical once it has been saved into ST */
9003 
9004 	    /* execute body of (?...A) */
9005 	    PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), ST.start,
9006                                 ST.end, script_run_begin);
9007 	    NOT_REACHED; /* NOTREACHED */
9008 
9009         {
9010             bool matched;
9011 
9012 	case IFMATCH_A_fail: /* body of (?...A) failed */
9013 	    if (! ST.logical && ST.count > 1) {
9014 
9015                 /* It isn't a real failure until we've tried all starting
9016                  * positions.  Move to the next starting position and retry */
9017                 ST.count--;
9018                 ST.start = HOPc(ST.start, 1);
9019                 scan = ST.me;
9020                 logical = ST.logical;
9021                 goto do_ifmatch;
9022             }
9023 
9024             /* Here, all starting positions have been tried. */
9025 	    matched = FALSE;
9026 	    goto ifmatch_done;
9027 
9028 	case IFMATCH_A: /* body of (?...A) succeeded */
9029 	    matched = TRUE;
9030           ifmatch_done:
9031             sw = matched == ST.wanted;
9032 	    if (! ST.logical && !sw) {
9033                 sayNO;
9034             }
9035 
9036 	    if (OP(ST.me) != SUSPEND) {
9037                 /* restore old position except for (?>...) */
9038 		locinput = st->locinput;
9039                 loceol = st->loceol;
9040                 script_run_begin = st->sr0;
9041 	    }
9042 	    scan = ST.me + ARG(ST.me);
9043 	    if (scan == ST.me)
9044 		scan = NULL;
9045 	    continue; /* execute B */
9046         }
9047 
9048 #undef ST
9049 
9050 	case LONGJMP: /*  alternative with many branches compiles to
9051                        * (BRANCHJ; EXACT ...; LONGJMP ) x N */
9052 	    next = scan + ARG(scan);
9053 	    if (next == scan)
9054 		next = NULL;
9055 	    break;
9056 
9057 	case COMMIT:  /*  (*COMMIT)  */
9058 	    reginfo->cutpoint = loceol;
9059 	    /* FALLTHROUGH */
9060 
9061 	case PRUNE:   /*  (*PRUNE)   */
9062             if (scan->flags)
9063 	        sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
9064 	    PUSH_STATE_GOTO(COMMIT_next, next, locinput, loceol,
9065                             script_run_begin);
9066 	    NOT_REACHED; /* NOTREACHED */
9067 
9068 	case COMMIT_next_fail:
9069 	    no_final = 1;
9070 	    /* FALLTHROUGH */
9071             sayNO;
9072             NOT_REACHED; /* NOTREACHED */
9073 
9074 	case OPFAIL:   /* (*FAIL)  */
9075             if (scan->flags)
9076                 sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
9077             if (logical) {
9078                 /* deal with (?(?!)X|Y) properly,
9079                  * make sure we trigger the no branch
9080                  * of the trailing IFTHEN structure*/
9081                 sw= 0;
9082                 break;
9083             } else {
9084                 sayNO;
9085             }
9086 	    NOT_REACHED; /* NOTREACHED */
9087 
9088 #define ST st->u.mark
9089         case MARKPOINT: /*  (*MARK:foo)  */
9090             ST.prev_mark = mark_state;
9091             ST.mark_name = sv_commit = sv_yes_mark
9092                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
9093             mark_state = st;
9094             ST.mark_loc = locinput;
9095             PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput, loceol,
9096                                 script_run_begin);
9097             NOT_REACHED; /* NOTREACHED */
9098 
9099         case MARKPOINT_next:
9100             mark_state = ST.prev_mark;
9101             sayYES;
9102             NOT_REACHED; /* NOTREACHED */
9103 
9104         case MARKPOINT_next_fail:
9105             if (popmark && sv_eq(ST.mark_name,popmark))
9106             {
9107                 if (ST.mark_loc > startpoint)
9108 	            reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
9109                 popmark = NULL; /* we found our mark */
9110                 sv_commit = ST.mark_name;
9111 
9112                 DEBUG_EXECUTE_r({
9113                         Perl_re_exec_indentf( aTHX_  "%sMARKPOINT: next fail: setting cutpoint to mark:%" SVf "...%s\n",
9114                             depth,
9115 		            PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
9116 		});
9117             }
9118             mark_state = ST.prev_mark;
9119             sv_yes_mark = mark_state ?
9120                 mark_state->u.mark.mark_name : NULL;
9121             sayNO;
9122             NOT_REACHED; /* NOTREACHED */
9123 
9124         case SKIP:  /*  (*SKIP)  */
9125             if (!scan->flags) {
9126                 /* (*SKIP) : if we fail we cut here*/
9127                 ST.mark_name = NULL;
9128                 ST.mark_loc = locinput;
9129                 PUSH_STATE_GOTO(SKIP_next,next, locinput, loceol,
9130                                 script_run_begin);
9131             } else {
9132                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
9133                    otherwise do nothing.  Meaning we need to scan
9134                  */
9135                 regmatch_state *cur = mark_state;
9136                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
9137 
9138                 while (cur) {
9139                     if ( sv_eq( cur->u.mark.mark_name,
9140                                 find ) )
9141                     {
9142                         ST.mark_name = find;
9143                         PUSH_STATE_GOTO( SKIP_next, next, locinput, loceol,
9144                                          script_run_begin);
9145                     }
9146                     cur = cur->u.mark.prev_mark;
9147                 }
9148             }
9149             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
9150             break;
9151 
9152 	case SKIP_next_fail:
9153 	    if (ST.mark_name) {
9154 	        /* (*CUT:NAME) - Set up to search for the name as we
9155 	           collapse the stack*/
9156 	        popmark = ST.mark_name;
9157 	    } else {
9158 	        /* (*CUT) - No name, we cut here.*/
9159 	        if (ST.mark_loc > startpoint)
9160 	            reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
9161 	        /* but we set sv_commit to latest mark_name if there
9162 	           is one so they can test to see how things lead to this
9163 	           cut */
9164                 if (mark_state)
9165                     sv_commit=mark_state->u.mark.mark_name;
9166             }
9167             no_final = 1;
9168             sayNO;
9169             NOT_REACHED; /* NOTREACHED */
9170 #undef ST
9171 
9172         case LNBREAK: /* \R */
9173             if ((n=is_LNBREAK_safe(locinput, loceol, utf8_target))) {
9174                 locinput += n;
9175             } else
9176                 sayNO;
9177             break;
9178 
9179 	default:
9180 	    PerlIO_printf(Perl_error_log, "%" UVxf " %d\n",
9181 			  PTR2UV(scan), OP(scan));
9182 	    Perl_croak(aTHX_ "regexp memory corruption");
9183 
9184         /* this is a point to jump to in order to increment
9185          * locinput by one character */
9186           increment_locinput:
9187             assert(!NEXTCHR_IS_EOS);
9188             if (utf8_target) {
9189                 locinput += PL_utf8skip[nextchr];
9190                 /* locinput is allowed to go 1 char off the end (signifying
9191                  * EOS), but not 2+ */
9192                 if (locinput >  loceol)
9193                     sayNO;
9194             }
9195             else
9196                 locinput++;
9197             break;
9198 
9199 	} /* end switch */
9200 
9201         /* switch break jumps here */
9202 	scan = next; /* prepare to execute the next op and ... */
9203 	continue;    /* ... jump back to the top, reusing st */
9204         /* NOTREACHED */
9205 
9206       push_yes_state:
9207 	/* push a state that backtracks on success */
9208 	st->u.yes.prev_yes_state = yes_state;
9209 	yes_state = st;
9210 	/* FALLTHROUGH */
9211       push_state:
9212 	/* push a new regex state, then continue at scan  */
9213 	{
9214 	    regmatch_state *newst;
9215             DECLARE_AND_GET_RE_DEBUG_FLAGS;
9216 
9217             DEBUG_r( /* DEBUG_STACK_r */
9218               if (DEBUG_v_TEST || RE_DEBUG_FLAG(RE_DEBUG_EXTRA_STACK)) {
9219 	        regmatch_state *cur = st;
9220 	        regmatch_state *curyes = yes_state;
9221 	        U32 i;
9222 	        regmatch_slab *slab = PL_regmatch_slab;
9223                 for (i = 0; i < 3 && i <= depth; cur--,i++) {
9224                     if (cur < SLAB_FIRST(slab)) {
9225                 	slab = slab->prev;
9226                 	cur = SLAB_LAST(slab);
9227                     }
9228                     Perl_re_exec_indentf( aTHX_ "%4s #%-3d %-10s %s\n",
9229                         depth,
9230                         i ? "    " : "push",
9231                         depth - i, PL_reg_name[cur->resume_state],
9232                         (curyes == cur) ? "yes" : ""
9233                     );
9234                     if (curyes == cur)
9235 	                curyes = cur->u.yes.prev_yes_state;
9236                 }
9237             } else {
9238                 DEBUG_STATE_pp("push")
9239             });
9240 	    depth++;
9241 	    st->locinput = locinput;
9242 	    st->loceol = loceol;
9243             st->sr0 = script_run_begin;
9244 	    newst = st+1;
9245 	    if (newst >  SLAB_LAST(PL_regmatch_slab))
9246 		newst = S_push_slab(aTHX);
9247 	    PL_regmatch_state = newst;
9248 
9249 	    locinput = pushinput;
9250             loceol = pusheol;
9251             script_run_begin = pushsr0;
9252 	    st = newst;
9253 	    continue;
9254             /* NOTREACHED */
9255 	}
9256     }
9257 #ifdef SOLARIS_BAD_OPTIMIZER
9258 #  undef PL_charclass
9259 #endif
9260 
9261     /*
9262     * We get here only if there's trouble -- normally "case END" is
9263     * the terminating point.
9264     */
9265     Perl_croak(aTHX_ "corrupted regexp pointers");
9266     NOT_REACHED; /* NOTREACHED */
9267 
9268   yes:
9269     if (yes_state) {
9270 	/* we have successfully completed a subexpression, but we must now
9271 	 * pop to the state marked by yes_state and continue from there */
9272 	assert(st != yes_state);
9273 #ifdef DEBUGGING
9274 	while (st != yes_state) {
9275 	    st--;
9276 	    if (st < SLAB_FIRST(PL_regmatch_slab)) {
9277 		PL_regmatch_slab = PL_regmatch_slab->prev;
9278 		st = SLAB_LAST(PL_regmatch_slab);
9279 	    }
9280 	    DEBUG_STATE_r({
9281 	        if (no_final) {
9282 	            DEBUG_STATE_pp("pop (no final)");
9283 	        } else {
9284 	            DEBUG_STATE_pp("pop (yes)");
9285 	        }
9286 	    });
9287 	    depth--;
9288 	}
9289 #else
9290 	while (yes_state < SLAB_FIRST(PL_regmatch_slab)
9291 	    || yes_state > SLAB_LAST(PL_regmatch_slab))
9292 	{
9293 	    /* not in this slab, pop slab */
9294 	    depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
9295 	    PL_regmatch_slab = PL_regmatch_slab->prev;
9296 	    st = SLAB_LAST(PL_regmatch_slab);
9297 	}
9298 	depth -= (st - yes_state);
9299 #endif
9300 	st = yes_state;
9301 	yes_state = st->u.yes.prev_yes_state;
9302 	PL_regmatch_state = st;
9303 
9304         if (no_final) {
9305             locinput= st->locinput;
9306             loceol= st->loceol;
9307             script_run_begin = st->sr0;
9308         }
9309 	state_num = st->resume_state + no_final;
9310 	goto reenter_switch;
9311     }
9312 
9313     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch successful!%s\n",
9314 			  PL_colors[4], PL_colors[5]));
9315 
9316     if (reginfo->info_aux_eval) {
9317 	/* each successfully executed (?{...}) block does the equivalent of
9318 	 *   local $^R = do {...}
9319 	 * When popping the save stack, all these locals would be undone;
9320 	 * bypass this by setting the outermost saved $^R to the latest
9321 	 * value */
9322         /* I dont know if this is needed or works properly now.
9323          * see code related to PL_replgv elsewhere in this file.
9324          * Yves
9325          */
9326 	if (oreplsv != GvSV(PL_replgv)) {
9327 	    sv_setsv(oreplsv, GvSV(PL_replgv));
9328             SvSETMAGIC(oreplsv);
9329         }
9330     }
9331     result = 1;
9332     goto final_exit;
9333 
9334   no:
9335     DEBUG_EXECUTE_r(
9336         Perl_re_exec_indentf( aTHX_  "%sfailed...%s\n",
9337             depth,
9338             PL_colors[4], PL_colors[5])
9339 	);
9340 
9341   no_silent:
9342     if (no_final) {
9343         if (yes_state) {
9344             goto yes;
9345         } else {
9346             goto final_exit;
9347         }
9348     }
9349     if (depth) {
9350 	/* there's a previous state to backtrack to */
9351 	st--;
9352 	if (st < SLAB_FIRST(PL_regmatch_slab)) {
9353 	    PL_regmatch_slab = PL_regmatch_slab->prev;
9354 	    st = SLAB_LAST(PL_regmatch_slab);
9355 	}
9356 	PL_regmatch_state = st;
9357 	locinput= st->locinput;
9358 	loceol= st->loceol;
9359         script_run_begin = st->sr0;
9360 
9361 	DEBUG_STATE_pp("pop");
9362 	depth--;
9363 	if (yes_state == st)
9364 	    yes_state = st->u.yes.prev_yes_state;
9365 
9366 	state_num = st->resume_state + 1; /* failure = success + 1 */
9367         PERL_ASYNC_CHECK();
9368 	goto reenter_switch;
9369     }
9370     result = 0;
9371 
9372   final_exit:
9373     if (rex->intflags & PREGf_VERBARG_SEEN) {
9374         SV *sv_err = get_sv("REGERROR", 1);
9375         SV *sv_mrk = get_sv("REGMARK", 1);
9376         if (result) {
9377             sv_commit = &PL_sv_no;
9378             if (!sv_yes_mark)
9379                 sv_yes_mark = &PL_sv_yes;
9380         } else {
9381             if (!sv_commit)
9382                 sv_commit = &PL_sv_yes;
9383             sv_yes_mark = &PL_sv_no;
9384         }
9385         assert(sv_err);
9386         assert(sv_mrk);
9387         sv_setsv(sv_err, sv_commit);
9388         sv_setsv(sv_mrk, sv_yes_mark);
9389     }
9390 
9391 
9392     if (last_pushed_cv) {
9393 	dSP;
9394         /* see "Some notes about MULTICALL" above */
9395 	POP_MULTICALL;
9396         PERL_UNUSED_VAR(SP);
9397     }
9398     else
9399         LEAVE_SCOPE(orig_savestack_ix);
9400 
9401     assert(!result ||  locinput - reginfo->strbeg >= 0);
9402     return result ?  locinput - reginfo->strbeg : -1;
9403 }
9404 
9405 /*
9406  - regrepeat - repeatedly match something simple, report how many
9407  *
9408  * What 'simple' means is a node which can be the operand of a quantifier like
9409  * '+', or {1,3}
9410  *
9411  * startposp - pointer to a pointer to the start position.  This is updated
9412  *             to point to the byte following the highest successful
9413  *             match.
9414  * p         - the regnode to be repeatedly matched against.
9415  * loceol    - pointer to the end position beyond which we aren't supposed to
9416  *             look.
9417  * reginfo   - struct holding match state, such as utf8_target
9418  * max       - maximum number of things to match.
9419  * depth     - (for debugging) backtracking depth.
9420  */
9421 STATIC I32
9422 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
9423             char * loceol, regmatch_info *const reginfo, I32 max _pDEPTH)
9424 {
9425     dVAR;
9426     char *scan;     /* Pointer to current position in target string */
9427     I32 c;
9428     char *this_eol = loceol;   /* potentially adjusted version. */
9429     I32 hardcount = 0;  /* How many matches so far */
9430     bool utf8_target = reginfo->is_utf8_target;
9431     unsigned int to_complement = 0;  /* Invert the result? */
9432     UV utf8_flags = 0;
9433     _char_class_number classnum;
9434 
9435     PERL_ARGS_ASSERT_REGREPEAT;
9436 
9437     /* This routine is structured so that we switch on the input OP.  Each OP
9438      * case: statement contains a loop to repeatedly apply the OP, advancing
9439      * the input until it fails, or reaches the end of the input, or until it
9440      * reaches the upper limit of matches. */
9441 
9442     scan = *startposp;
9443     if (max == REG_INFTY)   /* This is a special marker to go to the platform's
9444                                max */
9445 	max = I32_MAX;
9446     else if (! utf8_target && this_eol - scan > max)
9447 	this_eol = scan + max;
9448 
9449     /* Here, for the case of a non-UTF-8 target we have adjusted <this_eol> down
9450      * to the maximum of how far we should go in it (leaving it set to the real
9451      * end, if the maximum permissible would take us beyond that).  This allows
9452      * us to make the loop exit condition that we haven't gone past <this_eol> to
9453      * also mean that we haven't exceeded the max permissible count, saving a
9454      * test each time through the loops.  But it assumes that the OP matches a
9455      * single byte, which is true for most of the OPs below when applied to a
9456      * non-UTF-8 target.  Those relatively few OPs that don't have this
9457      * characteristic will have to compensate.
9458      *
9459      * There is no adjustment for UTF-8 targets, as the number of bytes per
9460      * character varies.  OPs will have to test both that the count is less
9461      * than the max permissible (using <hardcount> to keep track), and that we
9462      * are still within the bounds of the string (using <this_eol>.  A few OPs
9463      * match a single byte no matter what the encoding.  They can omit the max
9464      * test if, for the UTF-8 case, they do the adjustment that was skipped
9465      * above.
9466      *
9467      * Thus, the code above sets things up for the common case; and exceptional
9468      * cases need extra work; the common case is to make sure <scan> doesn't
9469      * go past <this_eol>, and for UTF-8 to also use <hardcount> to make sure the
9470      * count doesn't exceed the maximum permissible */
9471 
9472     switch (OP(p)) {
9473     case REG_ANY:
9474 	if (utf8_target) {
9475 	    while (scan < this_eol && hardcount < max && *scan != '\n') {
9476 		scan += UTF8SKIP(scan);
9477 		hardcount++;
9478 	    }
9479 	} else {
9480             scan = (char *) memchr(scan, '\n', this_eol - scan);
9481             if (! scan) {
9482                 scan = this_eol;
9483             }
9484 	}
9485 	break;
9486     case SANY:
9487         if (utf8_target) {
9488 	    while (scan < this_eol && hardcount < max) {
9489 	        scan += UTF8SKIP(scan);
9490 		hardcount++;
9491 	    }
9492 	}
9493 	else
9494 	    scan = this_eol;
9495 	break;
9496 
9497     case LEXACT_REQ8:
9498         if (! utf8_target) {
9499             break;
9500         }
9501         /* FALLTHROUGH */
9502 
9503     case LEXACT:
9504       {
9505         U8 * string;
9506         Size_t str_len;
9507 
9508 	string = (U8 *) STRINGl(p);
9509         str_len = STR_LENl(p);
9510         goto join_short_long_exact;
9511 
9512     case EXACTL:
9513         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9514         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*scan)) {
9515             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(scan, loceol);
9516         }
9517         goto do_exact;
9518 
9519     case EXACT_REQ8:
9520         if (! utf8_target) {
9521             break;
9522         }
9523         /* FALLTHROUGH */
9524     case EXACT:
9525       do_exact:
9526 	string = (U8 *) STRINGs(p);
9527         str_len = STR_LENs(p);
9528 
9529       join_short_long_exact:
9530         assert(str_len == reginfo->is_utf8_pat ? UTF8SKIP(string) : 1);
9531 
9532 	c = *string;
9533 
9534         /* Can use a simple find if the pattern char to match on is invariant
9535          * under UTF-8, or both target and pattern aren't UTF-8.  Note that we
9536          * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
9537          * true iff it doesn't matter if the argument is in UTF-8 or not */
9538         if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) {
9539             if (utf8_target && this_eol - scan > max) {
9540                 /* We didn't adjust <this_eol> because is UTF-8, but ok to do so,
9541                  * since here, to match at all, 1 char == 1 byte */
9542                 this_eol = scan + max;
9543             }
9544             scan = (char *) find_span_end((U8 *) scan, (U8 *) this_eol, (U8) c);
9545 	}
9546 	else if (reginfo->is_utf8_pat) {
9547             if (utf8_target) {
9548                 STRLEN scan_char_len;
9549 
9550                 /* When both target and pattern are UTF-8, we have to do
9551                  * string EQ */
9552                 while (hardcount < max
9553                        && scan < this_eol
9554                        && (scan_char_len = UTF8SKIP(scan)) <= str_len
9555                        && memEQ(scan, string, scan_char_len))
9556                 {
9557                     scan += scan_char_len;
9558                     hardcount++;
9559                 }
9560             }
9561             else if (! UTF8_IS_ABOVE_LATIN1(c)) {
9562 
9563                 /* Target isn't utf8; convert the character in the UTF-8
9564                  * pattern to non-UTF8, and do a simple find */
9565                 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *(string + 1));
9566                 scan = (char *) find_span_end((U8 *) scan, (U8 *) this_eol, (U8) c);
9567             } /* else pattern char is above Latin1, can't possibly match the
9568                  non-UTF-8 target */
9569         }
9570         else {
9571 
9572             /* Here, the string must be utf8; pattern isn't, and <c> is
9573              * different in utf8 than not, so can't compare them directly.
9574              * Outside the loop, find the two utf8 bytes that represent c, and
9575              * then look for those in sequence in the utf8 string */
9576 	    U8 high = UTF8_TWO_BYTE_HI(c);
9577 	    U8 low = UTF8_TWO_BYTE_LO(c);
9578 
9579 	    while (hardcount < max
9580 		    && scan + 1 < this_eol
9581 		    && UCHARAT(scan) == high
9582 		    && UCHARAT(scan + 1) == low)
9583 	    {
9584 		scan += 2;
9585 		hardcount++;
9586 	    }
9587 	}
9588 	break;
9589       }
9590 
9591     case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8 patterns */
9592         assert(! reginfo->is_utf8_pat);
9593         /* FALLTHROUGH */
9594     case EXACTFAA:
9595         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
9596         if (reginfo->is_utf8_pat || ! utf8_target) {
9597 
9598             /* The possible presence of a MICRO SIGN in the pattern forbids us
9599              * to view a non-UTF-8 pattern as folded when there is a UTF-8
9600              * target.  */
9601             utf8_flags |= FOLDEQ_S2_ALREADY_FOLDED|FOLDEQ_S2_FOLDS_SANE;
9602         }
9603         goto do_exactf;
9604 
9605     case EXACTFL:
9606         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9607 	utf8_flags = FOLDEQ_LOCALE;
9608 	goto do_exactf;
9609 
9610     case EXACTF:   /* This node only generated for non-utf8 patterns */
9611         assert(! reginfo->is_utf8_pat);
9612         goto do_exactf;
9613 
9614     case EXACTFLU8:
9615         if (! utf8_target) {
9616             break;
9617         }
9618         utf8_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
9619                                     | FOLDEQ_S2_FOLDS_SANE;
9620         goto do_exactf;
9621 
9622     case EXACTFU_REQ8:
9623         if (! utf8_target) {
9624             break;
9625         }
9626 	assert(reginfo->is_utf8_pat);
9627 	utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
9628         goto do_exactf;
9629 
9630     case EXACTFU:
9631         utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
9632         /* FALLTHROUGH */
9633 
9634     case EXACTFUP:
9635 
9636       do_exactf: {
9637         int c1, c2;
9638         U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
9639 
9640         assert(STR_LENs(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRINGs(p)) : 1);
9641 
9642         if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8,
9643                                         reginfo))
9644         {
9645             if (c1 == CHRTEST_VOID) {
9646                 /* Use full Unicode fold matching */
9647                 char *tmpeol = loceol;
9648                 STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRINGs(p)) : 1;
9649                 while (hardcount < max
9650                         && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
9651                                              STRINGs(p), NULL, pat_len,
9652                                              reginfo->is_utf8_pat, utf8_flags))
9653                 {
9654                     scan = tmpeol;
9655                     tmpeol = loceol;
9656                     hardcount++;
9657                 }
9658             }
9659             else if (utf8_target) {
9660                 if (c1 == c2) {
9661                     while (scan < this_eol
9662                            && hardcount < max
9663                            && memEQ(scan, c1_utf8, UTF8_SAFE_SKIP(scan,
9664                                                                   loceol)))
9665                     {
9666                         scan += UTF8SKIP(c1_utf8);
9667                         hardcount++;
9668                     }
9669                 }
9670                 else {
9671                     while (scan < this_eol
9672                            && hardcount < max
9673                            && (   memEQ(scan, c1_utf8, UTF8_SAFE_SKIP(scan,
9674                                                                      loceol))
9675                                || memEQ(scan, c2_utf8, UTF8_SAFE_SKIP(scan,
9676                                                                      loceol))))
9677                     {
9678                         scan += UTF8_SAFE_SKIP(scan, loceol);
9679                         hardcount++;
9680                     }
9681                 }
9682             }
9683             else if (c1 == c2) {
9684                 scan = (char *) find_span_end((U8 *) scan, (U8 *) this_eol, (U8) c1);
9685             }
9686             else {
9687                 /* See comments in regmatch() CURLY_B_min_known_fail.  We avoid
9688                  * a conditional each time through the loop if the characters
9689                  * differ only in a single bit, as is the usual situation */
9690                 U8 c1_c2_bits_differing = c1 ^ c2;
9691 
9692                 if (isPOWER_OF_2(c1_c2_bits_differing)) {
9693                     U8 c1_c2_mask = ~ c1_c2_bits_differing;
9694 
9695                     scan = (char *) find_span_end_mask((U8 *) scan,
9696                                                        (U8 *) this_eol,
9697                                                        c1 & c1_c2_mask,
9698                                                        c1_c2_mask);
9699                 }
9700                 else {
9701                     while (    scan < this_eol
9702                            && (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
9703                     {
9704                         scan++;
9705                     }
9706                 }
9707             }
9708 	}
9709 	break;
9710     }
9711     case ANYOFPOSIXL:
9712     case ANYOFL:
9713         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9714 
9715         if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(p)) && ! IN_UTF8_CTYPE_LOCALE) {
9716             Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
9717         }
9718         /* FALLTHROUGH */
9719     case ANYOFD:
9720     case ANYOF:
9721 	if (utf8_target) {
9722 	    while (hardcount < max
9723                    && scan < this_eol
9724 		   && reginclass(prog, p, (U8*)scan, (U8*) this_eol, utf8_target))
9725 	    {
9726 		scan += UTF8SKIP(scan);
9727 		hardcount++;
9728 	    }
9729 	}
9730         else if (ANYOF_FLAGS(p) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
9731 	    while (scan < this_eol
9732                     && reginclass(prog, p, (U8*)scan, (U8*)scan+1, 0))
9733 		scan++;
9734         }
9735         else {
9736 	    while (scan < this_eol && ANYOF_BITMAP_TEST(p, *((U8*)scan)))
9737 		scan++;
9738 	}
9739 	break;
9740 
9741     case ANYOFM:
9742         if (utf8_target && this_eol - scan > max) {
9743 
9744             /* We didn't adjust <this_eol> at the beginning of this routine
9745              * because is UTF-8, but it is actually ok to do so, since here, to
9746              * match, 1 char == 1 byte. */
9747             this_eol = scan + max;
9748         }
9749 
9750         scan = (char *) find_span_end_mask((U8 *) scan, (U8 *) this_eol, (U8) ARG(p), FLAGS(p));
9751         break;
9752 
9753     case NANYOFM:
9754 	if (utf8_target) {
9755 	    while (     hardcount < max
9756                    &&   scan < this_eol
9757 		   &&  (*scan & FLAGS(p)) != ARG(p))
9758 	    {
9759 		scan += UTF8SKIP(scan);
9760 		hardcount++;
9761 	    }
9762 	}
9763         else {
9764             scan = (char *) find_next_masked((U8 *) scan, (U8 *) this_eol, (U8) ARG(p), FLAGS(p));
9765 	}
9766         break;
9767 
9768     case ANYOFH:
9769         if (utf8_target) {  /* ANYOFH only can match UTF-8 targets */
9770             while (  hardcount < max
9771                    && scan < this_eol
9772                    && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p)
9773                    && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
9774             {
9775                 scan += UTF8SKIP(scan);
9776                 hardcount++;
9777             }
9778         }
9779         break;
9780 
9781     case ANYOFHb:
9782         if (utf8_target) {  /* ANYOFHb only can match UTF-8 targets */
9783 
9784             /* we know the first byte must be the FLAGS field */
9785             while (   hardcount < max
9786                    && scan < this_eol
9787                    && (U8) *scan == ANYOF_FLAGS(p)
9788                    && reginclass(prog, p, (U8*)scan, (U8*) this_eol,
9789                                                               TRUE))
9790             {
9791                 scan += UTF8SKIP(scan);
9792                 hardcount++;
9793             }
9794         }
9795         break;
9796 
9797     case ANYOFHr:
9798         if (utf8_target) {  /* ANYOFH only can match UTF-8 targets */
9799             while (  hardcount < max
9800                    && scan < this_eol
9801                    && inRANGE(NATIVE_UTF8_TO_I8(*scan),
9802                               LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p)),
9803                               HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p)))
9804                    && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p)
9805                    && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
9806             {
9807                 scan += UTF8SKIP(scan);
9808                 hardcount++;
9809             }
9810         }
9811         break;
9812 
9813     case ANYOFHs:
9814         if (utf8_target) {  /* ANYOFH only can match UTF-8 targets */
9815             while (   hardcount < max
9816                    && scan + FLAGS(p) < this_eol
9817                    && memEQ(scan, ((struct regnode_anyofhs *) p)->string, FLAGS(p))
9818                    && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
9819             {
9820                 scan += UTF8SKIP(scan);
9821                 hardcount++;
9822             }
9823         }
9824         break;
9825 
9826     case ANYOFR:
9827         if (utf8_target) {
9828             while (   hardcount < max
9829                    && scan < this_eol
9830                    && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p)
9831                    && withinCOUNT(utf8_to_uvchr_buf((U8 *) scan,
9832                                                 (U8 *) this_eol,
9833                                                 NULL),
9834                                   ANYOFRbase(p), ANYOFRdelta(p)))
9835             {
9836                 scan += UTF8SKIP(scan);
9837                 hardcount++;
9838             }
9839         }
9840         else {
9841             while (   hardcount < max
9842                    && scan < this_eol
9843                    && withinCOUNT((U8) *scan, ANYOFRbase(p), ANYOFRdelta(p)))
9844             {
9845                 scan++;
9846                 hardcount++;
9847             }
9848         }
9849         break;
9850 
9851     case ANYOFRb:
9852         if (utf8_target) {
9853             while (   hardcount < max
9854                    && scan < this_eol
9855                    && (U8) *scan == ANYOF_FLAGS(p)
9856                    && withinCOUNT(utf8_to_uvchr_buf((U8 *) scan,
9857                                                 (U8 *) this_eol,
9858                                                 NULL),
9859                                   ANYOFRbase(p), ANYOFRdelta(p)))
9860             {
9861                 scan += UTF8SKIP(scan);
9862                 hardcount++;
9863             }
9864         }
9865         else {
9866             while (   hardcount < max
9867                    && scan < this_eol
9868                    && withinCOUNT((U8) *scan, ANYOFRbase(p), ANYOFRdelta(p)))
9869             {
9870                 scan++;
9871                 hardcount++;
9872             }
9873         }
9874         break;
9875 
9876     /* The argument (FLAGS) to all the POSIX node types is the class number */
9877 
9878     case NPOSIXL:
9879         to_complement = 1;
9880         /* FALLTHROUGH */
9881 
9882     case POSIXL:
9883         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9884 	if (! utf8_target) {
9885 	    while (scan < this_eol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
9886                                                                    *scan)))
9887             {
9888 		scan++;
9889             }
9890 	} else {
9891 	    while (hardcount < max && scan < this_eol
9892                    && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
9893                                                                   (U8 *) scan,
9894                                                                   (U8 *) this_eol)))
9895             {
9896                 scan += UTF8SKIP(scan);
9897 		hardcount++;
9898 	    }
9899 	}
9900 	break;
9901 
9902     case POSIXD:
9903         if (utf8_target) {
9904             goto utf8_posix;
9905         }
9906         /* FALLTHROUGH */
9907 
9908     case POSIXA:
9909         if (utf8_target && this_eol - scan > max) {
9910 
9911             /* We didn't adjust <this_eol> at the beginning of this routine
9912              * because is UTF-8, but it is actually ok to do so, since here, to
9913              * match, 1 char == 1 byte. */
9914             this_eol = scan + max;
9915         }
9916         while (scan < this_eol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
9917 	    scan++;
9918 	}
9919 	break;
9920 
9921     case NPOSIXD:
9922         if (utf8_target) {
9923             to_complement = 1;
9924             goto utf8_posix;
9925         }
9926         /* FALLTHROUGH */
9927 
9928     case NPOSIXA:
9929         if (! utf8_target) {
9930             while (scan < this_eol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
9931                 scan++;
9932             }
9933         }
9934         else {
9935 
9936             /* The complement of something that matches only ASCII matches all
9937              * non-ASCII, plus everything in ASCII that isn't in the class. */
9938 	    while (hardcount < max && scan < this_eol
9939                    && (   ! isASCII_utf8_safe(scan, loceol)
9940                        || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
9941             {
9942                 scan += UTF8SKIP(scan);
9943 		hardcount++;
9944 	    }
9945         }
9946         break;
9947 
9948     case NPOSIXU:
9949         to_complement = 1;
9950         /* FALLTHROUGH */
9951 
9952     case POSIXU:
9953 	if (! utf8_target) {
9954             while (scan < this_eol && to_complement
9955                                 ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
9956             {
9957                 scan++;
9958             }
9959 	}
9960 	else {
9961           utf8_posix:
9962             classnum = (_char_class_number) FLAGS(p);
9963             switch (classnum) {
9964                 default:
9965                     while (   hardcount < max && scan < this_eol
9966                            && to_complement ^ cBOOL(_invlist_contains_cp(
9967                                               PL_XPosix_ptrs[classnum],
9968                                               utf8_to_uvchr_buf((U8 *) scan,
9969                                                                 (U8 *) this_eol,
9970                                                                 NULL))))
9971                     {
9972                         scan += UTF8SKIP(scan);
9973                         hardcount++;
9974                     }
9975                     break;
9976 
9977                     /* For the classes below, the knowledge of how to handle
9978                      * every code point is compiled in to Perl via a macro.
9979                      * This code is written for making the loops as tight as
9980                      * possible.  It could be refactored to save space instead.
9981                      * */
9982 
9983                 case _CC_ENUM_SPACE:
9984                     while (hardcount < max
9985                            && scan < this_eol
9986                            && (to_complement
9987                                ^ cBOOL(isSPACE_utf8_safe(scan, this_eol))))
9988                     {
9989                         scan += UTF8SKIP(scan);
9990                         hardcount++;
9991                     }
9992                     break;
9993                 case _CC_ENUM_BLANK:
9994                     while (hardcount < max
9995                            && scan < this_eol
9996                            && (to_complement
9997                                 ^ cBOOL(isBLANK_utf8_safe(scan, this_eol))))
9998                     {
9999                         scan += UTF8SKIP(scan);
10000                         hardcount++;
10001                     }
10002                     break;
10003                 case _CC_ENUM_XDIGIT:
10004                     while (hardcount < max
10005                            && scan < this_eol
10006                            && (to_complement
10007                                ^ cBOOL(isXDIGIT_utf8_safe(scan, this_eol))))
10008                     {
10009                         scan += UTF8SKIP(scan);
10010                         hardcount++;
10011                     }
10012                     break;
10013                 case _CC_ENUM_VERTSPACE:
10014                     while (hardcount < max
10015                            && scan < this_eol
10016                            && (to_complement
10017                                ^ cBOOL(isVERTWS_utf8_safe(scan, this_eol))))
10018                     {
10019                         scan += UTF8SKIP(scan);
10020                         hardcount++;
10021                     }
10022                     break;
10023                 case _CC_ENUM_CNTRL:
10024                     while (hardcount < max
10025                            && scan < this_eol
10026                            && (to_complement
10027                                ^ cBOOL(isCNTRL_utf8_safe(scan, this_eol))))
10028                     {
10029                         scan += UTF8SKIP(scan);
10030                         hardcount++;
10031                     }
10032                     break;
10033             }
10034 	}
10035         break;
10036 
10037     case LNBREAK:
10038         if (utf8_target) {
10039 	    while (hardcount < max && scan < this_eol &&
10040                     (c=is_LNBREAK_utf8_safe(scan, this_eol))) {
10041 		scan += c;
10042 		hardcount++;
10043 	    }
10044 	} else {
10045             /* LNBREAK can match one or two latin chars, which is ok, but we
10046              * have to use hardcount in this situation, and throw away the
10047              * adjustment to <this_eol> done before the switch statement */
10048 	    while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
10049 		scan+=c;
10050 		hardcount++;
10051 	    }
10052 	}
10053 	break;
10054 
10055     case BOUNDL:
10056     case NBOUNDL:
10057         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
10058         /* FALLTHROUGH */
10059     case BOUND:
10060     case BOUNDA:
10061     case BOUNDU:
10062     case EOS:
10063     case GPOS:
10064     case KEEPS:
10065     case NBOUND:
10066     case NBOUNDA:
10067     case NBOUNDU:
10068     case OPFAIL:
10069     case SBOL:
10070     case SEOL:
10071         /* These are all 0 width, so match right here or not at all. */
10072         break;
10073 
10074     default:
10075         Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
10076         NOT_REACHED; /* NOTREACHED */
10077 
10078     }
10079 
10080     if (hardcount)
10081 	c = hardcount;
10082     else
10083 	c = scan - *startposp;
10084     *startposp = scan;
10085 
10086     DEBUG_r({
10087 	DECLARE_AND_GET_RE_DEBUG_FLAGS;
10088 	DEBUG_EXECUTE_r({
10089 	    SV * const prop = sv_newmortal();
10090             regprop(prog, prop, p, reginfo, NULL);
10091             Perl_re_exec_indentf( aTHX_  "%s can match %" IVdf " times out of %" IVdf "...\n",
10092                         depth, SvPVX_const(prop),(IV)c,(IV)max);
10093 	});
10094     });
10095 
10096     return(c);
10097 }
10098 
10099 /*
10100  - reginclass - determine if a character falls into a character class
10101 
10102   n is the ANYOF-type regnode
10103   p is the target string
10104   p_end points to one byte beyond the end of the target string
10105   utf8_target tells whether p is in UTF-8.
10106 
10107   Returns true if matched; false otherwise.
10108 
10109   Note that this can be a synthetic start class, a combination of various
10110   nodes, so things you think might be mutually exclusive, such as locale,
10111   aren't.  It can match both locale and non-locale
10112 
10113  */
10114 
10115 STATIC bool
10116 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
10117 {
10118     dVAR;
10119     const char flags = (inRANGE(OP(n), ANYOFH, ANYOFHs))
10120                         ? 0
10121                         : ANYOF_FLAGS(n);
10122     bool match = FALSE;
10123     UV c = *p;
10124 
10125     PERL_ARGS_ASSERT_REGINCLASS;
10126 
10127     /* If c is not already the code point, get it.  Note that
10128      * UTF8_IS_INVARIANT() works even if not in UTF-8 */
10129     if (! UTF8_IS_INVARIANT(c) && utf8_target) {
10130         STRLEN c_len = 0;
10131         const U32 utf8n_flags = UTF8_ALLOW_DEFAULT;
10132 	c = utf8n_to_uvchr(p, p_end - p, &c_len, utf8n_flags | UTF8_CHECK_ONLY);
10133 	if (c_len == (STRLEN)-1) {
10134             _force_out_malformed_utf8_message(p, p_end,
10135                                               utf8n_flags,
10136                                               1 /* 1 means die */ );
10137             NOT_REACHED; /* NOTREACHED */
10138         }
10139         if (     c > 255
10140             &&  (OP(n) == ANYOFL || OP(n) == ANYOFPOSIXL)
10141             && ! ANYOFL_UTF8_LOCALE_REQD(flags))
10142         {
10143             _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c);
10144         }
10145     }
10146 
10147     /* If this character is potentially in the bitmap, check it */
10148     if (c < NUM_ANYOF_CODE_POINTS && ! inRANGE(OP(n), ANYOFH, ANYOFHb)) {
10149 	if (ANYOF_BITMAP_TEST(n, c))
10150 	    match = TRUE;
10151 	else if ((flags
10152                 & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
10153                   && OP(n) == ANYOFD
10154 		  && ! utf8_target
10155 		  && ! isASCII(c))
10156 	{
10157 	    match = TRUE;
10158 	}
10159 	else if (flags & ANYOF_LOCALE_FLAGS) {
10160 	    if (  (flags & ANYOFL_FOLD)
10161                 && c < sizeof(PL_fold_locale)
10162 		&& ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
10163             {
10164                 match = TRUE;
10165             }
10166             else if (   ANYOF_POSIXL_TEST_ANY_SET(n)
10167                      && c <= U8_MAX  /* param to isFOO_lc() */
10168             ) {
10169 
10170                 /* The data structure is arranged so bits 0, 2, 4, ... are set
10171                  * if the class includes the Posix character class given by
10172                  * bit/2; and 1, 3, 5, ... are set if the class includes the
10173                  * complemented Posix class given by int(bit/2).  So we loop
10174                  * through the bits, each time changing whether we complement
10175                  * the result or not.  Suppose for the sake of illustration
10176                  * that bits 0-3 mean respectively, \w, \W, \s, \S.  If bit 0
10177                  * is set, it means there is a match for this ANYOF node if the
10178                  * character is in the class given by the expression (0 / 2 = 0
10179                  * = \w).  If it is in that class, isFOO_lc() will return 1,
10180                  * and since 'to_complement' is 0, the result will stay TRUE,
10181                  * and we exit the loop.  Suppose instead that bit 0 is 0, but
10182                  * bit 1 is 1.  That means there is a match if the character
10183                  * matches \W.  We won't bother to call isFOO_lc() on bit 0,
10184                  * but will on bit 1.  On the second iteration 'to_complement'
10185                  * will be 1, so the exclusive or will reverse things, so we
10186                  * are testing for \W.  On the third iteration, 'to_complement'
10187                  * will be 0, and we would be testing for \s; the fourth
10188                  * iteration would test for \S, etc.
10189                  *
10190                  * Note that this code assumes that all the classes are closed
10191                  * under folding.  For example, if a character matches \w, then
10192                  * its fold does too; and vice versa.  This should be true for
10193                  * any well-behaved locale for all the currently defined Posix
10194                  * classes, except for :lower: and :upper:, which are handled
10195                  * by the pseudo-class :cased: which matches if either of the
10196                  * other two does.  To get rid of this assumption, an outer
10197                  * loop could be used below to iterate over both the source
10198                  * character, and its fold (if different) */
10199 
10200                 int count = 0;
10201                 int to_complement = 0;
10202 
10203                 while (count < ANYOF_MAX) {
10204                     if (ANYOF_POSIXL_TEST(n, count)
10205                         && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
10206                     {
10207                         match = TRUE;
10208                         break;
10209                     }
10210                     count++;
10211                     to_complement ^= 1;
10212                 }
10213 	    }
10214 	}
10215     }
10216 
10217 
10218     /* If the bitmap didn't (or couldn't) match, and something outside the
10219      * bitmap could match, try that. */
10220     if (!match) {
10221 	if (c >= NUM_ANYOF_CODE_POINTS
10222             && (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP))
10223         {
10224 	    match = TRUE;	/* Everything above the bitmap matches */
10225 	}
10226             /* Here doesn't match everything above the bitmap.  If there is
10227              * some information available beyond the bitmap, we may find a
10228              * match in it.  If so, this is most likely because the code point
10229              * is outside the bitmap range.  But rarely, it could be because of
10230              * some other reason.  If so, various flags are set to indicate
10231              * this possibility.  On ANYOFD nodes, there may be matches that
10232              * happen only when the target string is UTF-8; or for other node
10233              * types, because runtime lookup is needed, regardless of the
10234              * UTF-8ness of the target string.  Finally, under /il, there may
10235              * be some matches only possible if the locale is a UTF-8 one. */
10236 	else if (    ARG(n) != ANYOF_ONLY_HAS_BITMAP
10237                  && (   c >= NUM_ANYOF_CODE_POINTS
10238                      || (   (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
10239                          && (   UNLIKELY(OP(n) != ANYOFD)
10240                              || (utf8_target && ! isASCII_uni(c)
10241 #                               if NUM_ANYOF_CODE_POINTS > 256
10242                                                                  && c < 256
10243 #                               endif
10244                                 )))
10245                      || (   ANYOFL_SOME_FOLDS_ONLY_IN_UTF8_LOCALE(flags)
10246                          && IN_UTF8_CTYPE_LOCALE)))
10247         {
10248             SV* only_utf8_locale = NULL;
10249 	    SV * const definition =
10250 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
10251                 get_regclass_nonbitmap_data(prog, n, TRUE, 0,
10252                                             &only_utf8_locale, NULL);
10253 #else
10254                 get_re_gclass_nonbitmap_data(prog, n, TRUE, 0,
10255                                              &only_utf8_locale, NULL);
10256 #endif
10257 	    if (definition) {
10258                 U8 utf8_buffer[2];
10259 		U8 * utf8_p;
10260 		if (utf8_target) {
10261 		    utf8_p = (U8 *) p;
10262 		} else { /* Convert to utf8 */
10263 		    utf8_p = utf8_buffer;
10264                     append_utf8_from_native_byte(*p, &utf8_p);
10265 		    utf8_p = utf8_buffer;
10266 		}
10267 
10268                 /* Turkish locales have these hard-coded rules overriding
10269                  * normal ones */
10270                 if (   UNLIKELY(PL_in_utf8_turkic_locale)
10271                     && isALPHA_FOLD_EQ(*p, 'i'))
10272                 {
10273                     if (*p == 'i') {
10274                         if (_invlist_contains_cp(definition,
10275                                        LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
10276                         {
10277                             match = TRUE;
10278                         }
10279                     }
10280                     else if (*p == 'I') {
10281                         if (_invlist_contains_cp(definition,
10282                                                 LATIN_SMALL_LETTER_DOTLESS_I))
10283                         {
10284                             match = TRUE;
10285                         }
10286                     }
10287                 }
10288                 else if (_invlist_contains_cp(definition, c)) {
10289 		    match = TRUE;
10290                 }
10291 	    }
10292             if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) {
10293                 match = _invlist_contains_cp(only_utf8_locale, c);
10294             }
10295 	}
10296 
10297         /* In a Turkic locale under folding, hard-code the I i case pair
10298          * matches */
10299         if (     UNLIKELY(PL_in_utf8_turkic_locale)
10300             && ! match
10301             &&   (flags & ANYOFL_FOLD)
10302             &&   utf8_target)
10303         {
10304             if (c == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
10305 		if (ANYOF_BITMAP_TEST(n, 'i')) {
10306                     match = TRUE;
10307                 }
10308             }
10309             else if (c == LATIN_SMALL_LETTER_DOTLESS_I) {
10310 		if (ANYOF_BITMAP_TEST(n, 'I')) {
10311                     match = TRUE;
10312                 }
10313             }
10314         }
10315 
10316         if (UNICODE_IS_SUPER(c)
10317             && (flags
10318                & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
10319             && OP(n) != ANYOFD
10320             && ckWARN_d(WARN_NON_UNICODE))
10321         {
10322             Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
10323                 "Matched non-Unicode code point 0x%04" UVXf " against Unicode property; may not be portable", c);
10324         }
10325     }
10326 
10327 #if ANYOF_INVERT != 1
10328     /* Depending on compiler optimization cBOOL takes time, so if don't have to
10329      * use it, don't */
10330 #   error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below,
10331 #endif
10332 
10333     /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
10334     return (flags & ANYOF_INVERT) ^ match;
10335 }
10336 
10337 STATIC U8 *
10338 S_reghop3(U8 *s, SSize_t off, const U8* lim)
10339 {
10340     /* return the position 'off' UTF-8 characters away from 's', forward if
10341      * 'off' >= 0, backwards if negative.  But don't go outside of position
10342      * 'lim', which better be < s  if off < 0 */
10343 
10344     PERL_ARGS_ASSERT_REGHOP3;
10345 
10346     if (off >= 0) {
10347 	while (off-- && s < lim) {
10348 	    /* XXX could check well-formedness here */
10349 	    U8 *new_s = s + UTF8SKIP(s);
10350             if (new_s > lim) /* lim may be in the middle of a long character */
10351                 return s;
10352             s = new_s;
10353 	}
10354     }
10355     else {
10356         while (off++ && s > lim) {
10357             s--;
10358             if (UTF8_IS_CONTINUED(*s)) {
10359                 while (s > lim && UTF8_IS_CONTINUATION(*s))
10360                     s--;
10361                 if (! UTF8_IS_START(*s)) {
10362                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
10363                 }
10364 	    }
10365             /* XXX could check well-formedness here */
10366 	}
10367     }
10368     return s;
10369 }
10370 
10371 STATIC U8 *
10372 S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
10373 {
10374     PERL_ARGS_ASSERT_REGHOP4;
10375 
10376     if (off >= 0) {
10377         while (off-- && s < rlim) {
10378             /* XXX could check well-formedness here */
10379             s += UTF8SKIP(s);
10380         }
10381     }
10382     else {
10383         while (off++ && s > llim) {
10384             s--;
10385             if (UTF8_IS_CONTINUED(*s)) {
10386                 while (s > llim && UTF8_IS_CONTINUATION(*s))
10387                     s--;
10388                 if (! UTF8_IS_START(*s)) {
10389                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
10390                 }
10391             }
10392             /* XXX could check well-formedness here */
10393         }
10394     }
10395     return s;
10396 }
10397 
10398 /* like reghop3, but returns NULL on overrun, rather than returning last
10399  * char pos */
10400 
10401 STATIC U8 *
10402 S_reghopmaybe3(U8* s, SSize_t off, const U8* const lim)
10403 {
10404     PERL_ARGS_ASSERT_REGHOPMAYBE3;
10405 
10406     if (off >= 0) {
10407 	while (off-- && s < lim) {
10408 	    /* XXX could check well-formedness here */
10409 	    s += UTF8SKIP(s);
10410 	}
10411 	if (off >= 0)
10412 	    return NULL;
10413     }
10414     else {
10415         while (off++ && s > lim) {
10416             s--;
10417             if (UTF8_IS_CONTINUED(*s)) {
10418                 while (s > lim && UTF8_IS_CONTINUATION(*s))
10419                     s--;
10420                 if (! UTF8_IS_START(*s)) {
10421                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
10422                 }
10423 	    }
10424             /* XXX could check well-formedness here */
10425 	}
10426 	if (off <= 0)
10427 	    return NULL;
10428     }
10429     return s;
10430 }
10431 
10432 
10433 /* when executing a regex that may have (?{}), extra stuff needs setting
10434    up that will be visible to the called code, even before the current
10435    match has finished. In particular:
10436 
10437    * $_ is localised to the SV currently being matched;
10438    * pos($_) is created if necessary, ready to be updated on each call-out
10439      to code;
10440    * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
10441      isn't set until the current pattern is successfully finished), so that
10442      $1 etc of the match-so-far can be seen;
10443    * save the old values of subbeg etc of the current regex, and  set then
10444      to the current string (again, this is normally only done at the end
10445      of execution)
10446 */
10447 
10448 static void
10449 S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
10450 {
10451     MAGIC *mg;
10452     regexp *const rex = ReANY(reginfo->prog);
10453     regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
10454 
10455     eval_state->rex = rex;
10456     eval_state->sv  = reginfo->sv;
10457 
10458     if (reginfo->sv) {
10459         /* Make $_ available to executed code. */
10460         if (reginfo->sv != DEFSV) {
10461             SAVE_DEFSV;
10462             DEFSV_set(reginfo->sv);
10463         }
10464         /* will be dec'd by S_cleanup_regmatch_info_aux */
10465         SvREFCNT_inc_NN(reginfo->sv);
10466 
10467         if (!(mg = mg_find_mglob(reginfo->sv))) {
10468             /* prepare for quick setting of pos */
10469             mg = sv_magicext_mglob(reginfo->sv);
10470             mg->mg_len = -1;
10471         }
10472         eval_state->pos_magic = mg;
10473         eval_state->pos       = mg->mg_len;
10474         eval_state->pos_flags = mg->mg_flags;
10475     }
10476     else
10477         eval_state->pos_magic = NULL;
10478 
10479     if (!PL_reg_curpm) {
10480         /* PL_reg_curpm is a fake PMOP that we can attach the current
10481          * regex to and point PL_curpm at, so that $1 et al are visible
10482          * within a /(?{})/. It's just allocated once per interpreter the
10483          * first time its needed */
10484         Newxz(PL_reg_curpm, 1, PMOP);
10485 #ifdef USE_ITHREADS
10486         {
10487             SV* const repointer = &PL_sv_undef;
10488             /* this regexp is also owned by the new PL_reg_curpm, which
10489                will try to free it.  */
10490             av_push(PL_regex_padav, repointer);
10491             PL_reg_curpm->op_pmoffset = av_tindex(PL_regex_padav);
10492             PL_regex_pad = AvARRAY(PL_regex_padav);
10493         }
10494 #endif
10495     }
10496     SET_reg_curpm(reginfo->prog);
10497     eval_state->curpm = PL_curpm;
10498     PL_curpm_under = PL_curpm;
10499     PL_curpm = PL_reg_curpm;
10500     if (RXp_MATCH_COPIED(rex)) {
10501         /*  Here is a serious problem: we cannot rewrite subbeg,
10502             since it may be needed if this match fails.  Thus
10503             $` inside (?{}) could fail... */
10504         eval_state->subbeg     = rex->subbeg;
10505         eval_state->sublen     = rex->sublen;
10506         eval_state->suboffset  = rex->suboffset;
10507         eval_state->subcoffset = rex->subcoffset;
10508 #ifdef PERL_ANY_COW
10509         eval_state->saved_copy = rex->saved_copy;
10510 #endif
10511         RXp_MATCH_COPIED_off(rex);
10512     }
10513     else
10514         eval_state->subbeg = NULL;
10515     rex->subbeg = (char *)reginfo->strbeg;
10516     rex->suboffset = 0;
10517     rex->subcoffset = 0;
10518     rex->sublen = reginfo->strend - reginfo->strbeg;
10519 }
10520 
10521 
10522 /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
10523 
10524 static void
10525 S_cleanup_regmatch_info_aux(pTHX_ void *arg)
10526 {
10527     regmatch_info_aux *aux = (regmatch_info_aux *) arg;
10528     regmatch_info_aux_eval *eval_state =  aux->info_aux_eval;
10529     regmatch_slab *s;
10530 
10531     Safefree(aux->poscache);
10532 
10533     if (eval_state) {
10534 
10535         /* undo the effects of S_setup_eval_state() */
10536 
10537         if (eval_state->subbeg) {
10538             regexp * const rex = eval_state->rex;
10539             rex->subbeg     = eval_state->subbeg;
10540             rex->sublen     = eval_state->sublen;
10541             rex->suboffset  = eval_state->suboffset;
10542             rex->subcoffset = eval_state->subcoffset;
10543 #ifdef PERL_ANY_COW
10544             rex->saved_copy = eval_state->saved_copy;
10545 #endif
10546             RXp_MATCH_COPIED_on(rex);
10547         }
10548         if (eval_state->pos_magic)
10549         {
10550             eval_state->pos_magic->mg_len = eval_state->pos;
10551             eval_state->pos_magic->mg_flags =
10552                  (eval_state->pos_magic->mg_flags & ~MGf_BYTES)
10553                | (eval_state->pos_flags & MGf_BYTES);
10554         }
10555 
10556         PL_curpm = eval_state->curpm;
10557         SvREFCNT_dec(eval_state->sv);
10558     }
10559 
10560     PL_regmatch_state = aux->old_regmatch_state;
10561     PL_regmatch_slab  = aux->old_regmatch_slab;
10562 
10563     /* free all slabs above current one - this must be the last action
10564      * of this function, as aux and eval_state are allocated within
10565      * slabs and may be freed here */
10566 
10567     s = PL_regmatch_slab->next;
10568     if (s) {
10569         PL_regmatch_slab->next = NULL;
10570         while (s) {
10571             regmatch_slab * const osl = s;
10572             s = s->next;
10573             Safefree(osl);
10574         }
10575     }
10576 }
10577 
10578 
10579 STATIC void
10580 S_to_utf8_substr(pTHX_ regexp *prog)
10581 {
10582     /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
10583      * on the converted value */
10584 
10585     int i = 1;
10586 
10587     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
10588 
10589     do {
10590 	if (prog->substrs->data[i].substr
10591 	    && !prog->substrs->data[i].utf8_substr) {
10592 	    SV* const sv = newSVsv(prog->substrs->data[i].substr);
10593 	    prog->substrs->data[i].utf8_substr = sv;
10594 	    sv_utf8_upgrade(sv);
10595 	    if (SvVALID(prog->substrs->data[i].substr)) {
10596 		if (SvTAIL(prog->substrs->data[i].substr)) {
10597 		    /* Trim the trailing \n that fbm_compile added last
10598 		       time.  */
10599 		    SvCUR_set(sv, SvCUR(sv) - 1);
10600 		    /* Whilst this makes the SV technically "invalid" (as its
10601 		       buffer is no longer followed by "\0") when fbm_compile()
10602 		       adds the "\n" back, a "\0" is restored.  */
10603 		    fbm_compile(sv, FBMcf_TAIL);
10604 		} else
10605 		    fbm_compile(sv, 0);
10606 	    }
10607 	    if (prog->substrs->data[i].substr == prog->check_substr)
10608 		prog->check_utf8 = sv;
10609 	}
10610     } while (i--);
10611 }
10612 
10613 STATIC bool
10614 S_to_byte_substr(pTHX_ regexp *prog)
10615 {
10616     /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
10617      * on the converted value; returns FALSE if can't be converted. */
10618 
10619     int i = 1;
10620 
10621     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
10622 
10623     do {
10624 	if (prog->substrs->data[i].utf8_substr
10625 	    && !prog->substrs->data[i].substr) {
10626 	    SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
10627 	    if (! sv_utf8_downgrade(sv, TRUE)) {
10628                 SvREFCNT_dec_NN(sv);
10629                 return FALSE;
10630             }
10631             if (SvVALID(prog->substrs->data[i].utf8_substr)) {
10632                 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
10633                     /* Trim the trailing \n that fbm_compile added last
10634                         time.  */
10635                     SvCUR_set(sv, SvCUR(sv) - 1);
10636                     fbm_compile(sv, FBMcf_TAIL);
10637                 } else
10638                     fbm_compile(sv, 0);
10639             }
10640 	    prog->substrs->data[i].substr = sv;
10641 	    if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
10642 		prog->check_substr = sv;
10643 	}
10644     } while (i--);
10645 
10646     return TRUE;
10647 }
10648 
10649 #ifndef PERL_IN_XSUB_RE
10650 
10651 bool
10652 Perl_is_grapheme(pTHX_ const U8 * strbeg, const U8 * s, const U8 * strend, const UV cp)
10653 {
10654     /* Temporary helper function for toke.c.  Verify that the code point 'cp'
10655      * is a stand-alone grapheme.  The UTF-8 for 'cp' begins at position 's' in
10656      * the larger string bounded by 'strbeg' and 'strend'.
10657      *
10658      * 'cp' needs to be assigned (if not, a future version of the Unicode
10659      * Standard could make it something that combines with adjacent characters,
10660      * so code using it would then break), and there has to be a GCB break
10661      * before and after the character. */
10662 
10663     dVAR;
10664 
10665     GCB_enum cp_gcb_val, prev_cp_gcb_val, next_cp_gcb_val;
10666     const U8 * prev_cp_start;
10667 
10668     PERL_ARGS_ASSERT_IS_GRAPHEME;
10669 
10670     if (   UNLIKELY(UNICODE_IS_SUPER(cp))
10671         || UNLIKELY(UNICODE_IS_NONCHAR(cp)))
10672     {
10673         /* These are considered graphemes */
10674         return TRUE;
10675     }
10676 
10677     /* Otherwise, unassigned code points are forbidden */
10678     if (UNLIKELY(! ELEMENT_RANGE_MATCHES_INVLIST(
10679                                     _invlist_search(PL_Assigned_invlist, cp))))
10680     {
10681         return FALSE;
10682     }
10683 
10684     cp_gcb_val = getGCB_VAL_CP(cp);
10685 
10686     /* Find the GCB value of the previous code point in the input */
10687     prev_cp_start = utf8_hop_back(s, -1, strbeg);
10688     if (UNLIKELY(prev_cp_start == s)) {
10689         prev_cp_gcb_val = GCB_EDGE;
10690     }
10691     else {
10692         prev_cp_gcb_val = getGCB_VAL_UTF8(prev_cp_start, strend);
10693     }
10694 
10695     /* And check that is a grapheme boundary */
10696     if (! isGCB(prev_cp_gcb_val, cp_gcb_val, strbeg, s,
10697                 TRUE /* is UTF-8 encoded */ ))
10698     {
10699         return FALSE;
10700     }
10701 
10702     /* Similarly verify there is a break between the current character and the
10703      * following one */
10704     s += UTF8SKIP(s);
10705     if (s >= strend) {
10706         next_cp_gcb_val = GCB_EDGE;
10707     }
10708     else {
10709         next_cp_gcb_val = getGCB_VAL_UTF8(s, strend);
10710     }
10711 
10712     return isGCB(cp_gcb_val, next_cp_gcb_val, strbeg, s, TRUE);
10713 }
10714 
10715 /*
10716 =head1 Unicode Support
10717 
10718 =for apidoc isSCRIPT_RUN
10719 
10720 Returns a bool as to whether or not the sequence of bytes from C<s> up to but
10721 not including C<send> form a "script run".  C<utf8_target> is TRUE iff the
10722 sequence starting at C<s> is to be treated as UTF-8.  To be precise, except for
10723 two degenerate cases given below, this function returns TRUE iff all code
10724 points in it come from any combination of three "scripts" given by the Unicode
10725 "Script Extensions" property: Common, Inherited, and possibly one other.
10726 Additionally all decimal digits must come from the same consecutive sequence of
10727 10.
10728 
10729 For example, if all the characters in the sequence are Greek, or Common, or
10730 Inherited, this function will return TRUE, provided any decimal digits in it
10731 are from the same block of digits in Common.  (These are the ASCII digits
10732 "0".."9" and additionally a block for full width forms of these, and several
10733 others used in mathematical notation.)   For scripts (unlike Greek) that have
10734 their own digits defined this will accept either digits from that set or from
10735 one of the Common digit sets, but not a combination of the two.  Some scripts,
10736 such as Arabic, have more than one set of digits.  All digits must come from
10737 the same set for this function to return TRUE.
10738 
10739 C<*ret_script>, if C<ret_script> is not NULL, will on return of TRUE
10740 contain the script found, using the C<SCX_enum> typedef.  Its value will be
10741 C<SCX_INVALID> if the function returns FALSE.
10742 
10743 If the sequence is empty, TRUE is returned, but C<*ret_script> (if asked for)
10744 will be C<SCX_INVALID>.
10745 
10746 If the sequence contains a single code point which is unassigned to a character
10747 in the version of Unicode being used, the function will return TRUE, and the
10748 script will be C<SCX_Unknown>.  Any other combination of unassigned code points
10749 in the input sequence will result in the function treating the input as not
10750 being a script run.
10751 
10752 The returned script will be C<SCX_Inherited> iff all the code points in it are
10753 from the Inherited script.
10754 
10755 Otherwise, the returned script will be C<SCX_Common> iff all the code points in
10756 it are from the Inherited or Common scripts.
10757 
10758 =cut
10759 
10760 */
10761 
10762 bool
10763 Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
10764 {
10765     /* Basically, it looks at each character in the sequence to see if the
10766      * above conditions are met; if not it fails.  It uses an inversion map to
10767      * find the enum corresponding to the script of each character.  But this
10768      * is complicated by the fact that a few code points can be in any of
10769      * several scripts.  The data has been constructed so that there are
10770      * additional enum values (all negative) for these situations.  The
10771      * absolute value of those is an index into another table which contains
10772      * pointers to auxiliary tables for each such situation.  Each aux array
10773      * lists all the scripts for the given situation.  There is another,
10774      * parallel, table that gives the number of entries in each aux table.
10775      * These are all defined in charclass_invlists.h */
10776 
10777     /* XXX Here are the additional things UTS 39 says could be done:
10778      *
10779      * Forbid sequences of the same nonspacing mark
10780      *
10781      * Check to see that all the characters are in the sets of exemplar
10782      * characters for at least one language in the Unicode Common Locale Data
10783      * Repository [CLDR]. */
10784 
10785     dVAR;
10786 
10787     /* Things that match /\d/u */
10788     SV * decimals_invlist = PL_XPosix_ptrs[_CC_DIGIT];
10789     UV * decimals_array = invlist_array(decimals_invlist);
10790 
10791     /* What code point is the digit '0' of the script run? (0 meaning FALSE if
10792      * not currently known) */
10793     UV zero_of_run = 0;
10794 
10795     SCX_enum script_of_run  = SCX_INVALID;   /* Illegal value */
10796     SCX_enum script_of_char = SCX_INVALID;
10797 
10798     /* If the script remains not fully determined from iteration to iteration,
10799      * this is the current intersection of the possiblities.  */
10800     SCX_enum * intersection = NULL;
10801     PERL_UINT_FAST8_T intersection_len = 0;
10802 
10803     bool retval = TRUE;
10804     SCX_enum * ret_script = NULL;
10805 
10806     assert(send >= s);
10807 
10808     PERL_ARGS_ASSERT_ISSCRIPT_RUN;
10809 
10810     /* All code points in 0..255 are either Common or Latin, so must be a
10811      * script run.  We can return immediately unless we need to know which
10812      * script it is. */
10813     if (! utf8_target && LIKELY(send > s)) {
10814         if (ret_script == NULL) {
10815             return TRUE;
10816         }
10817 
10818         /* If any character is Latin, the run is Latin */
10819         while (s < send) {
10820             if (isALPHA_L1(*s) && LIKELY(*s != MICRO_SIGN_NATIVE)) {
10821                 *ret_script = SCX_Latin;
10822                 return TRUE;
10823             }
10824         }
10825 
10826         /* Here, all are Common */
10827         *ret_script = SCX_Common;
10828         return TRUE;
10829     }
10830 
10831     /* Look at each character in the sequence */
10832     while (s < send) {
10833         /* If the current character being examined is a digit, this is the code
10834          * point of the zero for its sequence of 10 */
10835         UV zero_of_char;
10836 
10837         UV cp;
10838 
10839         /* The code allows all scripts to use the ASCII digits.  This is
10840          * because they are in the Common script.  Hence any ASCII ones found
10841          * are ok, unless and until a digit from another set has already been
10842          * encountered.  digit ranges in Common are not similarly blessed) */
10843         if (UNLIKELY(isDIGIT(*s))) {
10844             if (UNLIKELY(script_of_run == SCX_Unknown)) {
10845                 retval = FALSE;
10846                 break;
10847             }
10848             if (zero_of_run) {
10849                 if (zero_of_run != '0') {
10850                     retval = FALSE;
10851                     break;
10852                 }
10853             }
10854             else {
10855                 zero_of_run = '0';
10856             }
10857             s++;
10858             continue;
10859         }
10860 
10861         /* Here, isn't an ASCII digit.  Find the code point of the character */
10862         if (! UTF8_IS_INVARIANT(*s)) {
10863             Size_t len;
10864             cp = valid_utf8_to_uvchr((U8 *) s, &len);
10865             s += len;
10866         }
10867         else {
10868             cp = *(s++);
10869         }
10870 
10871         /* If is within the range [+0 .. +9] of the script's zero, it also is a
10872          * digit in that script.  We can skip the rest of this code for this
10873          * character. */
10874         if (UNLIKELY(   zero_of_run
10875                      && cp >= zero_of_run
10876                      && cp - zero_of_run <= 9))
10877         {
10878             continue;
10879         }
10880 
10881         /* Find the character's script.  The correct values are hard-coded here
10882          * for small-enough code points. */
10883         if (cp < 0x2B9) {   /* From inspection of Unicode db; extremely
10884                                unlikely to change */
10885             if (       cp > 255
10886                 || (   isALPHA_L1(cp)
10887                     && LIKELY(cp != MICRO_SIGN_NATIVE)))
10888             {
10889                 script_of_char = SCX_Latin;
10890             }
10891             else {
10892                 script_of_char = SCX_Common;
10893             }
10894         }
10895         else {
10896             script_of_char = _Perl_SCX_invmap[
10897                                        _invlist_search(PL_SCX_invlist, cp)];
10898         }
10899 
10900         /* We arbitrarily accept a single unassigned character, but not in
10901          * combination with anything else, and not a run of them. */
10902         if (   UNLIKELY(script_of_run == SCX_Unknown)
10903             || UNLIKELY(   script_of_run != SCX_INVALID
10904                         && script_of_char == SCX_Unknown))
10905         {
10906             retval = FALSE;
10907             break;
10908         }
10909 
10910         /* For the first character, or the run is inherited, the run's script
10911          * is set to the char's */
10912         if (   UNLIKELY(script_of_run == SCX_INVALID)
10913             || UNLIKELY(script_of_run == SCX_Inherited))
10914         {
10915             script_of_run = script_of_char;
10916         }
10917 
10918         /* For the character's script to be Unknown, it must be the first
10919          * character in the sequence (for otherwise a test above would have
10920          * prevented us from reaching here), and we have set the run's script
10921          * to it.  Nothing further to be done for this character */
10922         if (UNLIKELY(script_of_char == SCX_Unknown)) {
10923             continue;
10924         }
10925 
10926         /* We accept 'inherited' script characters currently even at the
10927          * beginning.  (We know that no characters in Inherited are digits, or
10928          * we'd have to check for that) */
10929         if (UNLIKELY(script_of_char == SCX_Inherited)) {
10930             continue;
10931         }
10932 
10933         /* If the run so far is Common, and the new character isn't, change the
10934          * run's script to that of this character */
10935         if (script_of_run == SCX_Common && script_of_char != SCX_Common) {
10936             script_of_run = script_of_char;
10937         }
10938 
10939         /* Now we can see if the script of the new character is the same as
10940          * that of the run */
10941         if (LIKELY(script_of_char == script_of_run)) {
10942             /* By far the most common case */
10943             goto scripts_match;
10944         }
10945 
10946         /* Here, the script of the run isn't Common.  But characters in Common
10947          * match any script */
10948         if (script_of_char == SCX_Common) {
10949             goto scripts_match;
10950         }
10951 
10952 #ifndef HAS_SCX_AUX_TABLES
10953 
10954         /* Too early a Unicode version to have a code point belonging to more
10955          * than one script, so, if the scripts don't exactly match, fail */
10956         PERL_UNUSED_VAR(intersection_len);
10957         retval = FALSE;
10958         break;
10959 
10960 #else
10961 
10962         /* Here there is no exact match between the character's script and the
10963          * run's.  And we've handled the special cases of scripts Unknown,
10964          * Inherited, and Common.
10965          *
10966          * Negative script numbers signify that the value may be any of several
10967          * scripts, and we need to look at auxiliary information to make our
10968          * deterimination.  But if both are non-negative, we can fail now */
10969         if (LIKELY(script_of_char >= 0)) {
10970             const SCX_enum * search_in;
10971             PERL_UINT_FAST8_T search_in_len;
10972             PERL_UINT_FAST8_T i;
10973 
10974             if (LIKELY(script_of_run >= 0)) {
10975                 retval = FALSE;
10976                 break;
10977             }
10978 
10979             /* Use the previously constructed set of possible scripts, if any.
10980              * */
10981             if (intersection) {
10982                 search_in = intersection;
10983                 search_in_len = intersection_len;
10984             }
10985             else {
10986                 search_in = SCX_AUX_TABLE_ptrs[-script_of_run];
10987                 search_in_len = SCX_AUX_TABLE_lengths[-script_of_run];
10988             }
10989 
10990             for (i = 0; i < search_in_len; i++) {
10991                 if (search_in[i] == script_of_char) {
10992                     script_of_run = script_of_char;
10993                     goto scripts_match;
10994                 }
10995             }
10996 
10997             retval = FALSE;
10998             break;
10999         }
11000         else if (LIKELY(script_of_run >= 0)) {
11001             /* script of character could be one of several, but run is a single
11002              * script */
11003             const SCX_enum * search_in = SCX_AUX_TABLE_ptrs[-script_of_char];
11004             const PERL_UINT_FAST8_T search_in_len
11005                                      = SCX_AUX_TABLE_lengths[-script_of_char];
11006             PERL_UINT_FAST8_T i;
11007 
11008             for (i = 0; i < search_in_len; i++) {
11009                 if (search_in[i] == script_of_run) {
11010                     script_of_char = script_of_run;
11011                     goto scripts_match;
11012                 }
11013             }
11014 
11015             retval = FALSE;
11016             break;
11017         }
11018         else {
11019             /* Both run and char could be in one of several scripts.  If the
11020              * intersection is empty, then this character isn't in this script
11021              * run.  Otherwise, we need to calculate the intersection to use
11022              * for future iterations of the loop, unless we are already at the
11023              * final character */
11024             const SCX_enum * search_char = SCX_AUX_TABLE_ptrs[-script_of_char];
11025             const PERL_UINT_FAST8_T char_len
11026                                       = SCX_AUX_TABLE_lengths[-script_of_char];
11027             const SCX_enum * search_run;
11028             PERL_UINT_FAST8_T run_len;
11029 
11030             SCX_enum * new_overlap = NULL;
11031             PERL_UINT_FAST8_T i, j;
11032 
11033             if (intersection) {
11034                 search_run = intersection;
11035                 run_len = intersection_len;
11036             }
11037             else {
11038                 search_run = SCX_AUX_TABLE_ptrs[-script_of_run];
11039                 run_len = SCX_AUX_TABLE_lengths[-script_of_run];
11040             }
11041 
11042             intersection_len = 0;
11043 
11044             for (i = 0; i < run_len; i++) {
11045                 for (j = 0; j < char_len; j++) {
11046                     if (search_run[i] == search_char[j]) {
11047 
11048                         /* Here, the script at i,j matches.  That means this
11049                          * character is in the run.  But continue on to find
11050                          * the complete intersection, for the next loop
11051                          * iteration, and for the digit check after it.
11052                          *
11053                          * On the first found common script, we malloc space
11054                          * for the intersection list for the worst case of the
11055                          * intersection, which is the minimum of the number of
11056                          * scripts remaining in each set. */
11057                         if (intersection_len == 0) {
11058                             Newx(new_overlap,
11059                                  MIN(run_len - i, char_len - j),
11060                                  SCX_enum);
11061                         }
11062                         new_overlap[intersection_len++] = search_run[i];
11063                     }
11064                 }
11065             }
11066 
11067             /* Here we've looked through everything.  If they have no scripts
11068              * in common, not a run */
11069             if (intersection_len == 0) {
11070                 retval = FALSE;
11071                 break;
11072             }
11073 
11074             /* If there is only a single script in common, set to that.
11075              * Otherwise, use the intersection going forward */
11076             Safefree(intersection);
11077             intersection = NULL;
11078             if (intersection_len == 1) {
11079                 script_of_run = script_of_char = new_overlap[0];
11080                 Safefree(new_overlap);
11081                 new_overlap = NULL;
11082             }
11083             else {
11084                 intersection = new_overlap;
11085             }
11086         }
11087 
11088 #endif
11089 
11090   scripts_match:
11091 
11092         /* Here, the script of the character is compatible with that of the
11093          * run.  That means that in most cases, it continues the script run.
11094          * Either it and the run match exactly, or one or both can be in any of
11095          * several scripts, and the intersection is not empty.  However, if the
11096          * character is a decimal digit, it could still mean failure if it is
11097          * from the wrong sequence of 10.  So, we need to look at if it's a
11098          * digit.  We've already handled the 10 decimal digits, and the next
11099          * lowest one is this one: */
11100         if (cp < FIRST_NON_ASCII_DECIMAL_DIGIT) {
11101             continue;   /* Not a digit; this character is part of the run */
11102         }
11103 
11104         /* If we have a definitive '0' for the script of this character, we
11105          * know that for this to be a digit, it must be in the range of +0..+9
11106          * of that zero. */
11107         if (   script_of_char >= 0
11108             && (zero_of_char = script_zeros[script_of_char]))
11109         {
11110             if (   cp < zero_of_char
11111                 || cp > zero_of_char + 9)
11112             {
11113                 continue;   /* Not a digit; this character is part of the run
11114                              */
11115             }
11116 
11117         }
11118         else {  /* Need to look up if this character is a digit or not */
11119             SSize_t index_of_zero_of_char;
11120             index_of_zero_of_char = _invlist_search(decimals_invlist, cp);
11121             if (     UNLIKELY(index_of_zero_of_char < 0)
11122                 || ! ELEMENT_RANGE_MATCHES_INVLIST(index_of_zero_of_char))
11123             {
11124                 continue;   /* Not a digit; this character is part of the run.
11125                              */
11126             }
11127 
11128             zero_of_char = decimals_array[index_of_zero_of_char];
11129         }
11130 
11131         /* Here, the character is a decimal digit, and the zero of its sequence
11132          * of 10 is in 'zero_of_char'.  If we already have a zero for this run,
11133          * they better be the same. */
11134         if (zero_of_run) {
11135             if (zero_of_run != zero_of_char) {
11136                 retval = FALSE;
11137                 break;
11138             }
11139         }
11140         else {  /* Otherwise we now have a zero for this run */
11141             zero_of_run = zero_of_char;
11142         }
11143     } /* end of looping through CLOSESR text */
11144 
11145     Safefree(intersection);
11146 
11147     if (ret_script != NULL) {
11148         if (retval) {
11149             *ret_script = script_of_run;
11150         }
11151         else {
11152             *ret_script = SCX_INVALID;
11153         }
11154     }
11155 
11156     return retval;
11157 }
11158 
11159 #endif /* ifndef PERL_IN_XSUB_RE */
11160 
11161 /*
11162  * ex: set ts=8 sts=4 sw=4 et:
11163  */
11164