xref: /openbsd-src/gnu/usr.bin/perl/regexec.c (revision 1ad61ae0a79a724d2d3ec69e69c8e1d1ff6b53a0)
1 /*    regexec.c
2  */
3 
4 /*
5  *	One Ring to rule them all, One Ring to find them
6  *
7  *     [p.v of _The Lord of the Rings_, opening poem]
8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
10  */
11 
12 /* This file contains functions for executing a regular expression.  See
13  * also regcomp.c which funnily enough, contains functions for compiling
14  * a regular expression.
15  *
16  * This file is also copied at build time to ext/re/re_exec.c, where
17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18  * This causes the main functions to be compiled under new names and with
19  * debugging support added, which makes "use re 'debug'" work.
20  */
21 
22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
23  * confused with the original package (see point 3 below).  Thanks, Henry!
24  */
25 
26 /* Additional note: this code is very heavily munged from Henry's version
27  * in places.  In some spots I've traded clarity for efficiency, so don't
28  * blame Henry for some of the lack of readability.
29  */
30 
31 /* The names of the functions have been changed from regcomp and
32  * regexec to  pregcomp and pregexec in order to avoid conflicts
33  * with the POSIX routines of the same names.
34 */
35 
36 #ifdef PERL_EXT_RE_BUILD
37 #include "re_top.h"
38 #endif
39 
40 /*
41  * pregcomp and pregexec -- regsub and regerror are not used in perl
42  *
43  *	Copyright (c) 1986 by University of Toronto.
44  *	Written by Henry Spencer.  Not derived from licensed software.
45  *
46  *	Permission is granted to anyone to use this software for any
47  *	purpose on any computer system, and to redistribute it freely,
48  *	subject to the following restrictions:
49  *
50  *	1. The author is not responsible for the consequences of use of
51  *		this software, no matter how awful, even if they arise
52  *		from defects in it.
53  *
54  *	2. The origin of this software must not be misrepresented, either
55  *		by explicit claim or by omission.
56  *
57  *	3. Altered versions must be plainly marked as such, and must not
58  *		be misrepresented as being the original software.
59  *
60  ****    Alterations to Henry's code are...
61  ****
62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64  ****    by Larry Wall and others
65  ****
66  ****    You may distribute under the terms of either the GNU General Public
67  ****    License or the Artistic License, as specified in the README file.
68  *
69  * Beware that some of this code is subtly aware of the way operator
70  * precedence is structured in regular expressions.  Serious changes in
71  * regular-expression syntax might require a total rethink.
72  */
73 #include "EXTERN.h"
74 #define PERL_IN_REGEXEC_C
75 #include "perl.h"
76 
77 #ifdef PERL_IN_XSUB_RE
78 #  include "re_comp.h"
79 #else
80 #  include "regcomp.h"
81 #endif
82 
83 #include "invlist_inline.h"
84 #include "unicode_constants.h"
85 
86 static const char b_utf8_locale_required[] =
87  "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong."
88                                                 "  Assuming a UTF-8 locale";
89 
90 #define CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND                       \
91     STMT_START {                                                            \
92         if (! IN_UTF8_CTYPE_LOCALE) {                                       \
93           Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),                       \
94                                                 b_utf8_locale_required);    \
95         }                                                                   \
96     } STMT_END
97 
98 static const char sets_utf8_locale_required[] =
99       "Use of (?[ ]) for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale";
100 
101 #define CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(n)                     \
102     STMT_START {                                                            \
103         if (! IN_UTF8_CTYPE_LOCALE && ANYOFL_UTF8_LOCALE_REQD(FLAGS(n))) {  \
104           Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),                       \
105                                              sets_utf8_locale_required);    \
106         }                                                                   \
107     } STMT_END
108 
109 #ifdef DEBUGGING
110 /* At least one required character in the target string is expressible only in
111  * UTF-8. */
112 static const char non_utf8_target_but_utf8_required[]
113                 = "Can't match, because target string needs to be in UTF-8\n";
114 #endif
115 
116 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START {           \
117     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%s", non_utf8_target_but_utf8_required));\
118     goto target;                                                         \
119 } STMT_END
120 
121 #ifndef STATIC
122 #define	STATIC	static
123 #endif
124 
125 /*
126  * Forwards.
127  */
128 
129 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
130 
131 #define HOPc(pos,off) \
132         (char *)(reginfo->is_utf8_target \
133             ? reghop3((U8*)pos, off, \
134                     (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
135             : (U8*)(pos + off))
136 
137 /* like HOPMAYBE3 but backwards. lim must be +ve. Returns NULL on overshoot */
138 #define HOPBACK3(pos, off, lim) \
139         (reginfo->is_utf8_target                          \
140             ? reghopmaybe3((U8*)pos, (SSize_t)0-off, (U8*)(lim)) \
141             : (pos - off >= lim)	                         \
142                 ? (U8*)pos - off		                 \
143                 : NULL)
144 
145 #define HOPBACKc(pos, off) ((char*)HOPBACK3(pos, off, reginfo->strbeg))
146 
147 #define HOP3(pos,off,lim) (reginfo->is_utf8_target  ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
148 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
149 
150 /* lim must be +ve. Returns NULL on overshoot */
151 #define HOPMAYBE3(pos,off,lim) \
152         (reginfo->is_utf8_target                        \
153             ? reghopmaybe3((U8*)pos, off, (U8*)(lim))   \
154             : ((U8*)pos + off <= lim)                   \
155                 ? (U8*)pos + off                        \
156                 : NULL)
157 
158 /* like HOP3, but limits the result to <= lim even for the non-utf8 case.
159  * off must be >=0; args should be vars rather than expressions */
160 #define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
161     ? reghop3((U8*)(pos), off, (U8*)(lim)) \
162     : (U8*)((pos + off) > lim ? lim : (pos + off)))
163 #define HOP3clim(pos,off,lim) ((char*)HOP3lim(pos,off,lim))
164 
165 #define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
166     ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
167     : (U8*)(pos + off))
168 #define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
169 
170 #define PLACEHOLDER	/* Something for the preprocessor to grab onto */
171 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
172 
173 /* for use after a quantifier and before an EXACT-like node -- japhy */
174 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
175  *
176  * NOTE that *nothing* that affects backtracking should be in here, specifically
177  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
178  * node that is in between two EXACT like nodes when ascertaining what the required
179  * "follow" character is. This should probably be moved to regex compile time
180  * although it may be done at run time beause of the REF possibility - more
181  * investigation required. -- demerphq
182 */
183 #define JUMPABLE(rn) (                                                             \
184     OP(rn) == OPEN ||                                                              \
185     (OP(rn) == CLOSE &&                                                            \
186      !EVAL_CLOSE_PAREN_IS(cur_eval,ARG(rn)) ) ||                                   \
187     OP(rn) == EVAL ||                                                              \
188     OP(rn) == SUSPEND || OP(rn) == IFMATCH ||                                      \
189     OP(rn) == PLUS || OP(rn) == MINMOD ||                                          \
190     OP(rn) == KEEPS ||                                                             \
191     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0)                                  \
192 )
193 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
194 
195 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
196 
197 /*
198   Search for mandatory following text node; for lookahead, the text must
199   follow but for lookbehind (rn->flags != 0) we skip to the next step.
200 */
201 #define FIND_NEXT_IMPT(rn) STMT_START {                                   \
202     while (JUMPABLE(rn)) { \
203         const OPCODE type = OP(rn); \
204         if (type == SUSPEND || PL_regkind[type] == CURLY) \
205             rn = NEXTOPER(NEXTOPER(rn)); \
206         else if (type == PLUS) \
207             rn = NEXTOPER(rn); \
208         else if (type == IFMATCH) \
209             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
210         else rn += NEXT_OFF(rn); \
211     } \
212 } STMT_END
213 
214 #define SLAB_FIRST(s) (&(s)->states[0])
215 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
216 
217 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
218 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
219 static regmatch_state * S_push_slab(pTHX);
220 
221 #define REGCP_PAREN_ELEMS 3
222 #define REGCP_OTHER_ELEMS 3
223 #define REGCP_FRAME_ELEMS 1
224 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
225  * are needed for the regexp context stack bookkeeping. */
226 
227 STATIC CHECKPOINT
228 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen _pDEPTH)
229 {
230     const int retval = PL_savestack_ix;
231     const int paren_elems_to_push =
232                 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
233     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
234     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
235     I32 p;
236     DECLARE_AND_GET_RE_DEBUG_FLAGS;
237 
238     PERL_ARGS_ASSERT_REGCPPUSH;
239 
240     if (paren_elems_to_push < 0)
241         Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u",
242                    (int)paren_elems_to_push, (int)maxopenparen,
243                    (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS);
244 
245     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
246         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %" UVuf
247                    " out of range (%lu-%ld)",
248                    total_elems,
249                    (unsigned long)maxopenparen,
250                    (long)parenfloor);
251 
252     SSGROW(total_elems + REGCP_FRAME_ELEMS);
253 
254     DEBUG_BUFFERS_r(
255         if ((int)maxopenparen > (int)parenfloor)
256             Perl_re_exec_indentf( aTHX_
257                 "rex=0x%" UVxf " offs=0x%" UVxf ": saving capture indices:\n",
258                 depth,
259                 PTR2UV(rex),
260                 PTR2UV(rex->offs)
261             );
262     );
263     for (p = parenfloor+1; p <= (I32)maxopenparen;  p++) {
264 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
265         SSPUSHIV(rex->offs[p].end);
266         SSPUSHIV(rex->offs[p].start);
267         SSPUSHINT(rex->offs[p].start_tmp);
268         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
269             "    \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "\n",
270             depth,
271             (UV)p,
272             (IV)rex->offs[p].start,
273             (IV)rex->offs[p].start_tmp,
274             (IV)rex->offs[p].end
275         ));
276     }
277 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
278     SSPUSHINT(maxopenparen);
279     SSPUSHINT(rex->lastparen);
280     SSPUSHINT(rex->lastcloseparen);
281     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
282 
283     return retval;
284 }
285 
286 /* These are needed since we do not localize EVAL nodes: */
287 #define REGCP_SET(cp)                                           \
288     DEBUG_STATE_r(                                              \
289         Perl_re_exec_indentf( aTHX_                             \
290             "Setting an EVAL scope, savestack=%" IVdf ",\n",    \
291             depth, (IV)PL_savestack_ix                          \
292         )                                                       \
293     );                                                          \
294     cp = PL_savestack_ix
295 
296 #define REGCP_UNWIND(cp)                                        \
297     DEBUG_STATE_r(                                              \
298         if (cp != PL_savestack_ix)                              \
299             Perl_re_exec_indentf( aTHX_                         \
300                 "Clearing an EVAL scope, savestack=%"           \
301                 IVdf "..%" IVdf "\n",                           \
302                 depth, (IV)(cp), (IV)PL_savestack_ix            \
303             )                                                   \
304     );                                                          \
305     regcpblow(cp)
306 
307 /* set the start and end positions of capture ix */
308 #define CLOSE_CAPTURE(ix, s, e)                                            \
309     rex->offs[ix].start = s;                                               \
310     rex->offs[ix].end = e;                                                 \
311     if (ix > rex->lastparen)                                               \
312         rex->lastparen = ix;                                               \
313     rex->lastcloseparen = ix;                                              \
314     DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_                            \
315         "CLOSE: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf "..%" IVdf " max: %" UVuf "\n", \
316         depth,                                                             \
317         PTR2UV(rex),                                                       \
318         PTR2UV(rex->offs),                                                 \
319         (UV)ix,                                                            \
320         (IV)rex->offs[ix].start,                                           \
321         (IV)rex->offs[ix].end,                                             \
322         (UV)rex->lastparen                                                 \
323     ))
324 
325 #define UNWIND_PAREN(lp, lcp)               \
326     DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_  \
327         "UNWIND_PAREN: rex=0x%" UVxf " offs=0x%" UVxf ": invalidate (%" UVuf "..%" UVuf "] set lcp: %" UVuf "\n", \
328         depth,                              \
329         PTR2UV(rex),                        \
330         PTR2UV(rex->offs),                  \
331         (UV)(lp),                           \
332         (UV)(rex->lastparen),               \
333         (UV)(lcp)                           \
334     ));                                     \
335     for (n = rex->lastparen; n > lp; n--)   \
336         rex->offs[n].end = -1;              \
337     rex->lastparen = n;                     \
338     rex->lastcloseparen = lcp;
339 
340 
341 STATIC void
342 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p _pDEPTH)
343 {
344     UV i;
345     U32 paren;
346     DECLARE_AND_GET_RE_DEBUG_FLAGS;
347 
348     PERL_ARGS_ASSERT_REGCPPOP;
349 
350     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
351     i = SSPOPUV;
352     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
353     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
354     rex->lastcloseparen = SSPOPINT;
355     rex->lastparen = SSPOPINT;
356     *maxopenparen_p = SSPOPINT;
357 
358     i -= REGCP_OTHER_ELEMS;
359     /* Now restore the parentheses context. */
360     DEBUG_BUFFERS_r(
361         if (i || rex->lastparen + 1 <= rex->nparens)
362             Perl_re_exec_indentf( aTHX_
363                 "rex=0x%" UVxf " offs=0x%" UVxf ": restoring capture indices to:\n",
364                 depth,
365                 PTR2UV(rex),
366                 PTR2UV(rex->offs)
367             );
368     );
369     paren = *maxopenparen_p;
370     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
371         SSize_t tmps;
372         rex->offs[paren].start_tmp = SSPOPINT;
373         rex->offs[paren].start = SSPOPIV;
374         tmps = SSPOPIV;
375         if (paren <= rex->lastparen)
376             rex->offs[paren].end = tmps;
377         DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
378             "    \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "%s\n",
379             depth,
380             (UV)paren,
381             (IV)rex->offs[paren].start,
382             (IV)rex->offs[paren].start_tmp,
383             (IV)rex->offs[paren].end,
384             (paren > rex->lastparen ? "(skipped)" : ""));
385         );
386         paren--;
387     }
388 #if 1
389     /* It would seem that the similar code in regtry()
390      * already takes care of this, and in fact it is in
391      * a better location to since this code can #if 0-ed out
392      * but the code in regtry() is needed or otherwise tests
393      * requiring null fields (pat.t#187 and split.t#{13,14}
394      * (as of patchlevel 7877)  will fail.  Then again,
395      * this code seems to be necessary or otherwise
396      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
397      * --jhi updated by dapm */
398     for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
399         if (i > *maxopenparen_p)
400             rex->offs[i].start = -1;
401         rex->offs[i].end = -1;
402         DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
403             "    \\%" UVuf ": %s   ..-1 undeffing\n",
404             depth,
405             (UV)i,
406             (i > *maxopenparen_p) ? "-1" : "  "
407         ));
408     }
409 #endif
410 }
411 
412 /* restore the parens and associated vars at savestack position ix,
413  * but without popping the stack */
414 
415 STATIC void
416 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p _pDEPTH)
417 {
418     I32 tmpix = PL_savestack_ix;
419     PERL_ARGS_ASSERT_REGCP_RESTORE;
420 
421     PL_savestack_ix = ix;
422     regcppop(rex, maxopenparen_p);
423     PL_savestack_ix = tmpix;
424 }
425 
426 #define regcpblow(cp) LEAVE_SCOPE(cp)	/* Ignores regcppush()ed data. */
427 
428 STATIC bool
429 S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
430 {
431     /* Returns a boolean as to whether or not 'character' is a member of the
432      * Posix character class given by 'classnum' that should be equivalent to a
433      * value in the typedef '_char_class_number'.
434      *
435      * Ideally this could be replaced by a just an array of function pointers
436      * to the C library functions that implement the macros this calls.
437      * However, to compile, the precise function signatures are required, and
438      * these may vary from platform to platform.  To avoid having to figure
439      * out what those all are on each platform, I (khw) am using this method,
440      * which adds an extra layer of function call overhead (unless the C
441      * optimizer strips it away).  But we don't particularly care about
442      * performance with locales anyway. */
443 
444     switch ((_char_class_number) classnum) {
445         case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
446         case _CC_ENUM_ALPHA:     return isALPHA_LC(character);
447         case _CC_ENUM_ASCII:     return isASCII_LC(character);
448         case _CC_ENUM_BLANK:     return isBLANK_LC(character);
449         case _CC_ENUM_CASED:     return    isLOWER_LC(character)
450                                         || isUPPER_LC(character);
451         case _CC_ENUM_CNTRL:     return isCNTRL_LC(character);
452         case _CC_ENUM_DIGIT:     return isDIGIT_LC(character);
453         case _CC_ENUM_GRAPH:     return isGRAPH_LC(character);
454         case _CC_ENUM_LOWER:     return isLOWER_LC(character);
455         case _CC_ENUM_PRINT:     return isPRINT_LC(character);
456         case _CC_ENUM_PUNCT:     return isPUNCT_LC(character);
457         case _CC_ENUM_SPACE:     return isSPACE_LC(character);
458         case _CC_ENUM_UPPER:     return isUPPER_LC(character);
459         case _CC_ENUM_WORDCHAR:  return isWORDCHAR_LC(character);
460         case _CC_ENUM_XDIGIT:    return isXDIGIT_LC(character);
461         default:    /* VERTSPACE should never occur in locales */
462             Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
463     }
464 
465     NOT_REACHED; /* NOTREACHED */
466     return FALSE;
467 }
468 
469 PERL_STATIC_INLINE I32
470 S_foldEQ_latin1_s2_folded(const char *s1, const char *s2, I32 len)
471 {
472     /* Compare non-UTF-8 using Unicode (Latin1) semantics.  s2 must already be
473      * folded.  Works on all folds representable without UTF-8, except for
474      * LATIN_SMALL_LETTER_SHARP_S, and does not check for this.  Nor does it
475      * check that the strings each have at least 'len' characters.
476      *
477      * There is almost an identical API function where s2 need not be folded:
478      * Perl_foldEQ_latin1() */
479 
480     const U8 *a = (const U8 *)s1;
481     const U8 *b = (const U8 *)s2;
482 
483     PERL_ARGS_ASSERT_FOLDEQ_LATIN1_S2_FOLDED;
484 
485     assert(len >= 0);
486 
487     while (len--) {
488         assert(! isUPPER_L1(*b));
489         if (toLOWER_L1(*a) != *b) {
490             return 0;
491         }
492         a++, b++;
493     }
494     return 1;
495 }
496 
497 STATIC bool
498 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character, const U8* e)
499 {
500     /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
501      * 'character' is a member of the Posix character class given by 'classnum'
502      * that should be equivalent to a value in the typedef
503      * '_char_class_number'.
504      *
505      * This just calls isFOO_lc on the code point for the character if it is in
506      * the range 0-255.  Outside that range, all characters use Unicode
507      * rules, ignoring any locale.  So use the Unicode function if this class
508      * requires an inversion list, and use the Unicode macro otherwise. */
509 
510 
511     PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
512 
513     if (UTF8_IS_INVARIANT(*character)) {
514         return isFOO_lc(classnum, *character);
515     }
516     else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
517         return isFOO_lc(classnum,
518                         EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1)));
519     }
520 
521     _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, e);
522 
523     switch ((_char_class_number) classnum) {
524         case _CC_ENUM_SPACE:     return is_XPERLSPACE_high(character);
525         case _CC_ENUM_BLANK:     return is_HORIZWS_high(character);
526         case _CC_ENUM_XDIGIT:    return is_XDIGIT_high(character);
527         case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
528         default:
529             return _invlist_contains_cp(PL_XPosix_ptrs[classnum],
530                                         utf8_to_uvchr_buf(character, e, NULL));
531     }
532 
533     return FALSE; /* Things like CNTRL are always below 256 */
534 }
535 
536 STATIC U8 *
537 S_find_span_end(U8 * s, const U8 * send, const U8 span_byte)
538 {
539     /* Returns the position of the first byte in the sequence between 's' and
540      * 'send-1' inclusive that isn't 'span_byte'; returns 'send' if none found.
541      * */
542 
543     PERL_ARGS_ASSERT_FIND_SPAN_END;
544 
545     assert(send >= s);
546 
547     if ((STRLEN) (send - s) >= PERL_WORDSIZE
548                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
549                           - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
550     {
551         PERL_UINTMAX_T span_word;
552 
553         /* Process per-byte until reach word boundary.  XXX This loop could be
554          * eliminated if we knew that this platform had fast unaligned reads */
555         while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
556             if (*s != span_byte) {
557                 return s;
558             }
559             s++;
560         }
561 
562         /* Create a word filled with the bytes we are spanning */
563         span_word = PERL_COUNT_MULTIPLIER * span_byte;
564 
565         /* Process per-word as long as we have at least a full word left */
566         do {
567 
568             /* Keep going if the whole word is composed of 'span_byte's */
569             if ((* (PERL_UINTMAX_T *) s) == span_word)  {
570                 s += PERL_WORDSIZE;
571                 continue;
572             }
573 
574             /* Here, at least one byte in the word isn't 'span_byte'. */
575 
576 #ifdef EBCDIC
577 
578             break;
579 
580 #else
581 
582             /* This xor leaves 1 bits only in those non-matching bytes */
583             span_word ^= * (PERL_UINTMAX_T *) s;
584 
585             /* Make sure the upper bit of each non-matching byte is set.  This
586              * makes each such byte look like an ASCII platform variant byte */
587             span_word |= span_word << 1;
588             span_word |= span_word << 2;
589             span_word |= span_word << 4;
590 
591             /* That reduces the problem to what this function solves */
592             return s + variant_byte_number(span_word);
593 
594 #endif
595 
596         } while (s + PERL_WORDSIZE <= send);
597     }
598 
599     /* Process the straggler bytes beyond the final word boundary */
600     while (s < send) {
601         if (*s != span_byte) {
602             return s;
603         }
604         s++;
605     }
606 
607     return s;
608 }
609 
610 STATIC U8 *
611 S_find_next_masked(U8 * s, const U8 * send, const U8 byte, const U8 mask)
612 {
613     /* Returns the position of the first byte in the sequence between 's'
614      * and 'send-1' inclusive that when ANDed with 'mask' yields 'byte';
615      * returns 'send' if none found.  It uses word-level operations instead of
616      * byte to speed up the process */
617 
618     PERL_ARGS_ASSERT_FIND_NEXT_MASKED;
619 
620     assert(send >= s);
621     assert((byte & mask) == byte);
622 
623 #ifndef EBCDIC
624 
625     if ((STRLEN) (send - s) >= PERL_WORDSIZE
626                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
627                           - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
628     {
629         PERL_UINTMAX_T word, mask_word;
630 
631         while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
632             if (((*s) & mask) == byte) {
633                 return s;
634             }
635             s++;
636         }
637 
638         word      = PERL_COUNT_MULTIPLIER * byte;
639         mask_word = PERL_COUNT_MULTIPLIER * mask;
640 
641         do {
642             PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
643 
644             /* If 'masked' contains bytes with the bit pattern of 'byte' within
645              * it, xoring with 'word' will leave each of the 8 bits in such
646              * bytes be 0, and no byte containing any other bit pattern will be
647              * 0. */
648             masked ^= word;
649 
650             /* This causes the most significant bit to be set to 1 for any
651              * bytes in the word that aren't completely 0 */
652             masked |= masked << 1;
653             masked |= masked << 2;
654             masked |= masked << 4;
655 
656             /* The msbits are the same as what marks a byte as variant, so we
657              * can use this mask.  If all msbits are 1, the word doesn't
658              * contain 'byte' */
659             if ((masked & PERL_VARIANTS_WORD_MASK) == PERL_VARIANTS_WORD_MASK) {
660                 s += PERL_WORDSIZE;
661                 continue;
662             }
663 
664             /* Here, the msbit of bytes in the word that aren't 'byte' are 1,
665              * and any that are, are 0.  Complement and re-AND to swap that */
666             masked = ~ masked;
667             masked &= PERL_VARIANTS_WORD_MASK;
668 
669             /* This reduces the problem to that solved by this function */
670             s += variant_byte_number(masked);
671             return s;
672 
673         } while (s + PERL_WORDSIZE <= send);
674     }
675 
676 #endif
677 
678     while (s < send) {
679         if (((*s) & mask) == byte) {
680             return s;
681         }
682         s++;
683     }
684 
685     return s;
686 }
687 
688 STATIC U8 *
689 S_find_span_end_mask(U8 * s, const U8 * send, const U8 span_byte, const U8 mask)
690 {
691     /* Returns the position of the first byte in the sequence between 's' and
692      * 'send-1' inclusive that when ANDed with 'mask' isn't 'span_byte'.
693      * 'span_byte' should have been ANDed with 'mask' in the call of this
694      * function.  Returns 'send' if none found.  Works like find_span_end(),
695      * except for the AND */
696 
697     PERL_ARGS_ASSERT_FIND_SPAN_END_MASK;
698 
699     assert(send >= s);
700     assert((span_byte & mask) == span_byte);
701 
702     if ((STRLEN) (send - s) >= PERL_WORDSIZE
703                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
704                           - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
705     {
706         PERL_UINTMAX_T span_word, mask_word;
707 
708         while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
709             if (((*s) & mask) != span_byte) {
710                 return s;
711             }
712             s++;
713         }
714 
715         span_word = PERL_COUNT_MULTIPLIER * span_byte;
716         mask_word = PERL_COUNT_MULTIPLIER * mask;
717 
718         do {
719             PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
720 
721             if (masked == span_word) {
722                 s += PERL_WORDSIZE;
723                 continue;
724             }
725 
726 #ifdef EBCDIC
727 
728             break;
729 
730 #else
731 
732             masked ^= span_word;
733             masked |= masked << 1;
734             masked |= masked << 2;
735             masked |= masked << 4;
736             return s + variant_byte_number(masked);
737 
738 #endif
739 
740         } while (s + PERL_WORDSIZE <= send);
741     }
742 
743     while (s < send) {
744         if (((*s) & mask) != span_byte) {
745             return s;
746         }
747         s++;
748     }
749 
750     return s;
751 }
752 
753 /*
754  * pregexec and friends
755  */
756 
757 #ifndef PERL_IN_XSUB_RE
758 /*
759  - pregexec - match a regexp against a string
760  */
761 I32
762 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
763          char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
764 /* stringarg: the point in the string at which to begin matching */
765 /* strend:    pointer to null at end of string */
766 /* strbeg:    real beginning of string */
767 /* minend:    end of match must be >= minend bytes after stringarg. */
768 /* screamer:  SV being matched: only used for utf8 flag, pos() etc; string
769  *            itself is accessed via the pointers above */
770 /* nosave:    For optimizations. */
771 {
772     PERL_ARGS_ASSERT_PREGEXEC;
773 
774     return
775         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
776                       nosave ? 0 : REXEC_COPY_STR);
777 }
778 #endif
779 
780 
781 
782 /* re_intuit_start():
783  *
784  * Based on some optimiser hints, try to find the earliest position in the
785  * string where the regex could match.
786  *
787  *   rx:     the regex to match against
788  *   sv:     the SV being matched: only used for utf8 flag; the string
789  *           itself is accessed via the pointers below. Note that on
790  *           something like an overloaded SV, SvPOK(sv) may be false
791  *           and the string pointers may point to something unrelated to
792  *           the SV itself.
793  *   strbeg: real beginning of string
794  *   strpos: the point in the string at which to begin matching
795  *   strend: pointer to the byte following the last char of the string
796  *   flags   currently unused; set to 0
797  *   data:   currently unused; set to NULL
798  *
799  * The basic idea of re_intuit_start() is to use some known information
800  * about the pattern, namely:
801  *
802  *   a) the longest known anchored substring (i.e. one that's at a
803  *      constant offset from the beginning of the pattern; but not
804  *      necessarily at a fixed offset from the beginning of the
805  *      string);
806  *   b) the longest floating substring (i.e. one that's not at a constant
807  *      offset from the beginning of the pattern);
808  *   c) Whether the pattern is anchored to the string; either
809  *      an absolute anchor: /^../, or anchored to \n: /^.../m,
810  *      or anchored to pos(): /\G/;
811  *   d) A start class: a real or synthetic character class which
812  *      represents which characters are legal at the start of the pattern;
813  *
814  * to either quickly reject the match, or to find the earliest position
815  * within the string at which the pattern might match, thus avoiding
816  * running the full NFA engine at those earlier locations, only to
817  * eventually fail and retry further along.
818  *
819  * Returns NULL if the pattern can't match, or returns the address within
820  * the string which is the earliest place the match could occur.
821  *
822  * The longest of the anchored and floating substrings is called 'check'
823  * and is checked first. The other is called 'other' and is checked
824  * second. The 'other' substring may not be present.  For example,
825  *
826  *    /(abc|xyz)ABC\d{0,3}DEFG/
827  *
828  * will have
829  *
830  *   check substr (float)    = "DEFG", offset 6..9 chars
831  *   other substr (anchored) = "ABC",  offset 3..3 chars
832  *   stclass = [ax]
833  *
834  * Be aware that during the course of this function, sometimes 'anchored'
835  * refers to a substring being anchored relative to the start of the
836  * pattern, and sometimes to the pattern itself being anchored relative to
837  * the string. For example:
838  *
839  *   /\dabc/:   "abc" is anchored to the pattern;
840  *   /^\dabc/:  "abc" is anchored to the pattern and the string;
841  *   /\d+abc/:  "abc" is anchored to neither the pattern nor the string;
842  *   /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
843  *                    but the pattern is anchored to the string.
844  */
845 
846 char *
847 Perl_re_intuit_start(pTHX_
848                     REGEXP * const rx,
849                     SV *sv,
850                     const char * const strbeg,
851                     char *strpos,
852                     char *strend,
853                     const U32 flags,
854                     re_scream_pos_data *data)
855 {
856     struct regexp *const prog = ReANY(rx);
857     SSize_t start_shift = prog->check_offset_min;
858     /* Should be nonnegative! */
859     SSize_t end_shift   = 0;
860     /* current lowest pos in string where the regex can start matching */
861     char *rx_origin = strpos;
862     SV *check;
863     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
864     U8   other_ix = 1 - prog->substrs->check_ix;
865     bool ml_anch = 0;
866     char *other_last = strpos;/* latest pos 'other' substr already checked to */
867     char *check_at = NULL;		/* check substr found at this pos */
868     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
869     RXi_GET_DECL(prog,progi);
870     regmatch_info reginfo_buf;  /* create some info to pass to find_byclass */
871     regmatch_info *const reginfo = &reginfo_buf;
872     DECLARE_AND_GET_RE_DEBUG_FLAGS;
873 
874     PERL_ARGS_ASSERT_RE_INTUIT_START;
875     PERL_UNUSED_ARG(flags);
876     PERL_UNUSED_ARG(data);
877 
878     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
879                 "Intuit: trying to determine minimum start position...\n"));
880 
881     /* for now, assume that all substr offsets are positive. If at some point
882      * in the future someone wants to do clever things with lookbehind and
883      * -ve offsets, they'll need to fix up any code in this function
884      * which uses these offsets. See the thread beginning
885      * <20140113145929.GF27210@iabyn.com>
886      */
887     assert(prog->substrs->data[0].min_offset >= 0);
888     assert(prog->substrs->data[0].max_offset >= 0);
889     assert(prog->substrs->data[1].min_offset >= 0);
890     assert(prog->substrs->data[1].max_offset >= 0);
891     assert(prog->substrs->data[2].min_offset >= 0);
892     assert(prog->substrs->data[2].max_offset >= 0);
893 
894     /* for now, assume that if both present, that the floating substring
895      * doesn't start before the anchored substring.
896      * If you break this assumption (e.g. doing better optimisations
897      * with lookahead/behind), then you'll need to audit the code in this
898      * function carefully first
899      */
900     assert(
901             ! (  (prog->anchored_utf8 || prog->anchored_substr)
902               && (prog->float_utf8    || prog->float_substr))
903            || (prog->float_min_offset >= prog->anchored_offset));
904 
905     /* byte rather than char calculation for efficiency. It fails
906      * to quickly reject some cases that can't match, but will reject
907      * them later after doing full char arithmetic */
908     if (prog->minlen > strend - strpos) {
909         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
910                               "  String too short...\n"));
911         goto fail;
912     }
913 
914     RXp_MATCH_UTF8_set(prog, utf8_target);
915     reginfo->is_utf8_target = cBOOL(utf8_target);
916     reginfo->info_aux = NULL;
917     reginfo->strbeg = strbeg;
918     reginfo->strend = strend;
919     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
920     reginfo->intuit = 1;
921     /* not actually used within intuit, but zero for safety anyway */
922     reginfo->poscache_maxiter = 0;
923 
924     if (utf8_target) {
925         if ((!prog->anchored_utf8 && prog->anchored_substr)
926                 || (!prog->float_utf8 && prog->float_substr))
927             to_utf8_substr(prog);
928         check = prog->check_utf8;
929     } else {
930         if (!prog->check_substr && prog->check_utf8) {
931             if (! to_byte_substr(prog)) {
932                 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
933             }
934         }
935         check = prog->check_substr;
936     }
937 
938     /* dump the various substring data */
939     DEBUG_OPTIMISE_MORE_r({
940         int i;
941         for (i=0; i<=2; i++) {
942             SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
943                                   : prog->substrs->data[i].substr);
944             if (!sv)
945                 continue;
946 
947             Perl_re_printf( aTHX_
948                 "  substrs[%d]: min=%" IVdf " max=%" IVdf " end shift=%" IVdf
949                 " useful=%" IVdf " utf8=%d [%s]\n",
950                 i,
951                 (IV)prog->substrs->data[i].min_offset,
952                 (IV)prog->substrs->data[i].max_offset,
953                 (IV)prog->substrs->data[i].end_shift,
954                 BmUSEFUL(sv),
955                 utf8_target ? 1 : 0,
956                 SvPEEK(sv));
957         }
958     });
959 
960     if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
961 
962         /* ml_anch: check after \n?
963          *
964          * A note about PREGf_IMPLICIT: on an un-anchored pattern beginning
965          * with /.*.../, these flags will have been added by the
966          * compiler:
967          *   /.*abc/, /.*abc/m:  PREGf_IMPLICIT | PREGf_ANCH_MBOL
968          *   /.*abc/s:           PREGf_IMPLICIT | PREGf_ANCH_SBOL
969          */
970         ml_anch =      (prog->intflags & PREGf_ANCH_MBOL)
971                    && !(prog->intflags & PREGf_IMPLICIT);
972 
973         if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
974             /* we are only allowed to match at BOS or \G */
975 
976             /* trivially reject if there's a BOS anchor and we're not at BOS.
977              *
978              * Note that we don't try to do a similar quick reject for
979              * \G, since generally the caller will have calculated strpos
980              * based on pos() and gofs, so the string is already correctly
981              * anchored by definition; and handling the exceptions would
982              * be too fiddly (e.g. REXEC_IGNOREPOS).
983              */
984             if (   strpos != strbeg
985                 && (prog->intflags & PREGf_ANCH_SBOL))
986             {
987                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
988                                 "  Not at start...\n"));
989                 goto fail;
990             }
991 
992             /* in the presence of an anchor, the anchored (relative to the
993              * start of the regex) substr must also be anchored relative
994              * to strpos. So quickly reject if substr isn't found there.
995              * This works for \G too, because the caller will already have
996              * subtracted gofs from pos, and gofs is the offset from the
997              * \G to the start of the regex. For example, in /.abc\Gdef/,
998              * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
999              * caller will have set strpos=pos()-4; we look for the substr
1000              * at position pos()-4+1, which lines up with the "a" */
1001 
1002             if (prog->check_offset_min == prog->check_offset_max) {
1003                 /* Substring at constant offset from beg-of-str... */
1004                 SSize_t slen = SvCUR(check);
1005                 char *s = HOP3c(strpos, prog->check_offset_min, strend);
1006 
1007                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1008                     "  Looking for check substr at fixed offset %" IVdf "...\n",
1009                     (IV)prog->check_offset_min));
1010 
1011                 if (SvTAIL(check)) {
1012                     /* In this case, the regex is anchored at the end too.
1013                      * Unless it's a multiline match, the lengths must match
1014                      * exactly, give or take a \n.  NB: slen >= 1 since
1015                      * the last char of check is \n */
1016                     if (!multiline
1017                         && (   strend - s > slen
1018                             || strend - s < slen - 1
1019                             || (strend - s == slen && strend[-1] != '\n')))
1020                     {
1021                         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1022                                             "  String too long...\n"));
1023                         goto fail_finish;
1024                     }
1025                     /* Now should match s[0..slen-2] */
1026                     slen--;
1027                 }
1028                 if (slen && (strend - s < slen
1029                     || *SvPVX_const(check) != *s
1030                     || (slen > 1 && (memNE(SvPVX_const(check), s, slen)))))
1031                 {
1032                     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1033                                     "  String not equal...\n"));
1034                     goto fail_finish;
1035                 }
1036 
1037                 check_at = s;
1038                 goto success_at_start;
1039             }
1040         }
1041     }
1042 
1043     end_shift = prog->check_end_shift;
1044 
1045 #ifdef DEBUGGING	/* 7/99: reports of failure (with the older version) */
1046     if (end_shift < 0)
1047         Perl_croak(aTHX_ "panic: end_shift: %" IVdf " pattern:\n%s\n ",
1048                    (IV)end_shift, RX_PRECOMP(rx));
1049 #endif
1050 
1051   restart:
1052 
1053     /* This is the (re)entry point of the main loop in this function.
1054      * The goal of this loop is to:
1055      * 1) find the "check" substring in the region rx_origin..strend
1056      *    (adjusted by start_shift / end_shift). If not found, reject
1057      *    immediately.
1058      * 2) If it exists, look for the "other" substr too if defined; for
1059      *    example, if the check substr maps to the anchored substr, then
1060      *    check the floating substr, and vice-versa. If not found, go
1061      *    back to (1) with rx_origin suitably incremented.
1062      * 3) If we find an rx_origin position that doesn't contradict
1063      *    either of the substrings, then check the possible additional
1064      *    constraints on rx_origin of /^.../m or a known start class.
1065      *    If these fail, then depending on which constraints fail, jump
1066      *    back to here, or to various other re-entry points further along
1067      *    that skip some of the first steps.
1068      * 4) If we pass all those tests, update the BmUSEFUL() count on the
1069      *    substring. If the start position was determined to be at the
1070      *    beginning of the string  - so, not rejected, but not optimised,
1071      *    since we have to run regmatch from position 0 - decrement the
1072      *    BmUSEFUL() count. Otherwise increment it.
1073      */
1074 
1075 
1076     /* first, look for the 'check' substring */
1077 
1078     {
1079         U8* start_point;
1080         U8* end_point;
1081 
1082         DEBUG_OPTIMISE_MORE_r({
1083             Perl_re_printf( aTHX_
1084                 "  At restart: rx_origin=%" IVdf " Check offset min: %" IVdf
1085                 " Start shift: %" IVdf " End shift %" IVdf
1086                 " Real end Shift: %" IVdf "\n",
1087                 (IV)(rx_origin - strbeg),
1088                 (IV)prog->check_offset_min,
1089                 (IV)start_shift,
1090                 (IV)end_shift,
1091                 (IV)prog->check_end_shift);
1092         });
1093 
1094         end_point = HOPBACK3(strend, end_shift, rx_origin);
1095         if (!end_point)
1096             goto fail_finish;
1097         start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
1098         if (!start_point)
1099             goto fail_finish;
1100 
1101 
1102         /* If the regex is absolutely anchored to either the start of the
1103          * string (SBOL) or to pos() (ANCH_GPOS), then
1104          * check_offset_max represents an upper bound on the string where
1105          * the substr could start. For the ANCH_GPOS case, we assume that
1106          * the caller of intuit will have already set strpos to
1107          * pos()-gofs, so in this case strpos + offset_max will still be
1108          * an upper bound on the substr.
1109          */
1110         if (!ml_anch
1111             && prog->intflags & PREGf_ANCH
1112             && prog->check_offset_max != SSize_t_MAX)
1113         {
1114             SSize_t check_len = SvCUR(check) - !!SvTAIL(check);
1115             const char * const anchor =
1116                         (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
1117             SSize_t targ_len = (char*)end_point - anchor;
1118 
1119             if (check_len > targ_len) {
1120                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1121                               "Target string too short to match required substring...\n"));
1122                 goto fail_finish;
1123             }
1124 
1125             /* do a bytes rather than chars comparison. It's conservative;
1126              * so it skips doing the HOP if the result can't possibly end
1127              * up earlier than the old value of end_point.
1128              */
1129             assert(anchor + check_len <= (char *)end_point);
1130             if (prog->check_offset_max + check_len < targ_len) {
1131                 end_point = HOP3lim((U8*)anchor,
1132                                 prog->check_offset_max,
1133                                 end_point - check_len
1134                             )
1135                             + check_len;
1136                 if (end_point < start_point)
1137                     goto fail_finish;
1138             }
1139         }
1140 
1141         check_at = fbm_instr( start_point, end_point,
1142                       check, multiline ? FBMrf_MULTILINE : 0);
1143 
1144         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1145             "  doing 'check' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
1146             (IV)((char*)start_point - strbeg),
1147             (IV)((char*)end_point   - strbeg),
1148             (IV)(check_at ? check_at - strbeg : -1)
1149         ));
1150 
1151         /* Update the count-of-usability, remove useless subpatterns,
1152             unshift s.  */
1153 
1154         DEBUG_EXECUTE_r({
1155             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1156                 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
1157             Perl_re_printf( aTHX_  "  %s %s substr %s%s%s",
1158                               (check_at ? "Found" : "Did not find"),
1159                 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
1160                     ? "anchored" : "floating"),
1161                 quoted,
1162                 RE_SV_TAIL(check),
1163                 (check_at ? " at offset " : "...\n") );
1164         });
1165 
1166         if (!check_at)
1167             goto fail_finish;
1168         /* set rx_origin to the minimum position where the regex could start
1169          * matching, given the constraint of the just-matched check substring.
1170          * But don't set it lower than previously.
1171          */
1172 
1173         if (check_at - rx_origin > prog->check_offset_max)
1174             rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
1175         /* Finish the diagnostic message */
1176         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1177             "%ld (rx_origin now %" IVdf ")...\n",
1178             (long)(check_at - strbeg),
1179             (IV)(rx_origin - strbeg)
1180         ));
1181     }
1182 
1183 
1184     /* now look for the 'other' substring if defined */
1185 
1186     if (prog->substrs->data[other_ix].utf8_substr
1187         || prog->substrs->data[other_ix].substr)
1188     {
1189         /* Take into account the "other" substring. */
1190         char *last, *last1;
1191         char *s;
1192         SV* must;
1193         struct reg_substr_datum *other;
1194 
1195       do_other_substr:
1196         other = &prog->substrs->data[other_ix];
1197         if (!utf8_target && !other->substr) {
1198             if (!to_byte_substr(prog)) {
1199                 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
1200             }
1201         }
1202 
1203         /* if "other" is anchored:
1204          * we've previously found a floating substr starting at check_at.
1205          * This means that the regex origin must lie somewhere
1206          * between min (rx_origin): HOP3(check_at, -check_offset_max)
1207          * and max:                 HOP3(check_at, -check_offset_min)
1208          * (except that min will be >= strpos)
1209          * So the fixed  substr must lie somewhere between
1210          *  HOP3(min, anchored_offset)
1211          *  HOP3(max, anchored_offset) + SvCUR(substr)
1212          */
1213 
1214         /* if "other" is floating
1215          * Calculate last1, the absolute latest point where the
1216          * floating substr could start in the string, ignoring any
1217          * constraints from the earlier fixed match. It is calculated
1218          * as follows:
1219          *
1220          * strend - prog->minlen (in chars) is the absolute latest
1221          * position within the string where the origin of the regex
1222          * could appear. The latest start point for the floating
1223          * substr is float_min_offset(*) on from the start of the
1224          * regex.  last1 simply combines thee two offsets.
1225          *
1226          * (*) You might think the latest start point should be
1227          * float_max_offset from the regex origin, and technically
1228          * you'd be correct. However, consider
1229          *    /a\d{2,4}bcd\w/
1230          * Here, float min, max are 3,5 and minlen is 7.
1231          * This can match either
1232          *    /a\d\dbcd\w/
1233          *    /a\d\d\dbcd\w/
1234          *    /a\d\d\d\dbcd\w/
1235          * In the first case, the regex matches minlen chars; in the
1236          * second, minlen+1, in the third, minlen+2.
1237          * In the first case, the floating offset is 3 (which equals
1238          * float_min), in the second, 4, and in the third, 5 (which
1239          * equals float_max). In all cases, the floating string bcd
1240          * can never start more than 4 chars from the end of the
1241          * string, which equals minlen - float_min. As the substring
1242          * starts to match more than float_min from the start of the
1243          * regex, it makes the regex match more than minlen chars,
1244          * and the two cancel each other out. So we can always use
1245          * float_min - minlen, rather than float_max - minlen for the
1246          * latest position in the string.
1247          *
1248          * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1249          * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1250          */
1251 
1252         assert(prog->minlen >= other->min_offset);
1253         last1 = HOP3c(strend,
1254                         other->min_offset - prog->minlen, strbeg);
1255 
1256         if (other_ix) {/* i.e. if (other-is-float) */
1257             /* last is the latest point where the floating substr could
1258              * start, *given* any constraints from the earlier fixed
1259              * match. This constraint is that the floating string starts
1260              * <= float_max_offset chars from the regex origin (rx_origin).
1261              * If this value is less than last1, use it instead.
1262              */
1263             assert(rx_origin <= last1);
1264             last =
1265                 /* this condition handles the offset==infinity case, and
1266                  * is a short-cut otherwise. Although it's comparing a
1267                  * byte offset to a char length, it does so in a safe way,
1268                  * since 1 char always occupies 1 or more bytes,
1269                  * so if a string range is  (last1 - rx_origin) bytes,
1270                  * it will be less than or equal to  (last1 - rx_origin)
1271                  * chars; meaning it errs towards doing the accurate HOP3
1272                  * rather than just using last1 as a short-cut */
1273                 (last1 - rx_origin) < other->max_offset
1274                     ? last1
1275                     : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1276         }
1277         else {
1278             assert(strpos + start_shift <= check_at);
1279             last = HOP4c(check_at, other->min_offset - start_shift,
1280                         strbeg, strend);
1281         }
1282 
1283         s = HOP3c(rx_origin, other->min_offset, strend);
1284         if (s < other_last)	/* These positions already checked */
1285             s = other_last;
1286 
1287         must = utf8_target ? other->utf8_substr : other->substr;
1288         assert(SvPOK(must));
1289         {
1290             char *from = s;
1291             char *to   = last + SvCUR(must) - (SvTAIL(must)!=0);
1292 
1293             if (to > strend)
1294                 to = strend;
1295             if (from > to) {
1296                 s = NULL;
1297                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1298                     "  skipping 'other' fbm scan: %" IVdf " > %" IVdf "\n",
1299                     (IV)(from - strbeg),
1300                     (IV)(to   - strbeg)
1301                 ));
1302             }
1303             else {
1304                 s = fbm_instr(
1305                     (unsigned char*)from,
1306                     (unsigned char*)to,
1307                     must,
1308                     multiline ? FBMrf_MULTILINE : 0
1309                 );
1310                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1311                     "  doing 'other' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
1312                     (IV)(from - strbeg),
1313                     (IV)(to   - strbeg),
1314                     (IV)(s ? s - strbeg : -1)
1315                 ));
1316             }
1317         }
1318 
1319         DEBUG_EXECUTE_r({
1320             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1321                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1322             Perl_re_printf( aTHX_  "  %s %s substr %s%s",
1323                 s ? "Found" : "Contradicts",
1324                 other_ix ? "floating" : "anchored",
1325                 quoted, RE_SV_TAIL(must));
1326         });
1327 
1328 
1329         if (!s) {
1330             /* last1 is latest possible substr location. If we didn't
1331              * find it before there, we never will */
1332             if (last >= last1) {
1333                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1334                                         "; giving up...\n"));
1335                 goto fail_finish;
1336             }
1337 
1338             /* try to find the check substr again at a later
1339              * position. Maybe next time we'll find the "other" substr
1340              * in range too */
1341             other_last = HOP3c(last, 1, strend) /* highest failure */;
1342             rx_origin =
1343                 other_ix /* i.e. if other-is-float */
1344                     ? HOP3c(rx_origin, 1, strend)
1345                     : HOP4c(last, 1 - other->min_offset, strbeg, strend);
1346             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1347                 "; about to retry %s at offset %ld (rx_origin now %" IVdf ")...\n",
1348                 (other_ix ? "floating" : "anchored"),
1349                 (long)(HOP3c(check_at, 1, strend) - strbeg),
1350                 (IV)(rx_origin - strbeg)
1351             ));
1352             goto restart;
1353         }
1354         else {
1355             if (other_ix) { /* if (other-is-float) */
1356                 /* other_last is set to s, not s+1, since its possible for
1357                  * a floating substr to fail first time, then succeed
1358                  * second time at the same floating position; e.g.:
1359                  *     "-AB--AABZ" =~ /\wAB\d*Z/
1360                  * The first time round, anchored and float match at
1361                  * "-(AB)--AAB(Z)" then fail on the initial \w character
1362                  * class. Second time round, they match at "-AB--A(AB)(Z)".
1363                  */
1364                 other_last = s;
1365             }
1366             else {
1367                 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1368                 other_last = HOP3c(s, 1, strend);
1369             }
1370             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1371                 " at offset %ld (rx_origin now %" IVdf ")...\n",
1372                   (long)(s - strbeg),
1373                 (IV)(rx_origin - strbeg)
1374               ));
1375 
1376         }
1377     }
1378     else {
1379         DEBUG_OPTIMISE_MORE_r(
1380             Perl_re_printf( aTHX_
1381                 "  Check-only match: offset min:%" IVdf " max:%" IVdf
1382                 " check_at:%" IVdf " rx_origin:%" IVdf " rx_origin-check_at:%" IVdf
1383                 " strend:%" IVdf "\n",
1384                 (IV)prog->check_offset_min,
1385                 (IV)prog->check_offset_max,
1386                 (IV)(check_at-strbeg),
1387                 (IV)(rx_origin-strbeg),
1388                 (IV)(rx_origin-check_at),
1389                 (IV)(strend-strbeg)
1390             )
1391         );
1392     }
1393 
1394   postprocess_substr_matches:
1395 
1396     /* handle the extra constraint of /^.../m if present */
1397 
1398     if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
1399         char *s;
1400 
1401         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1402                         "  looking for /^/m anchor"));
1403 
1404         /* we have failed the constraint of a \n before rx_origin.
1405          * Find the next \n, if any, even if it's beyond the current
1406          * anchored and/or floating substrings. Whether we should be
1407          * scanning ahead for the next \n or the next substr is debatable.
1408          * On the one hand you'd expect rare substrings to appear less
1409          * often than \n's. On the other hand, searching for \n means
1410          * we're effectively flipping between check_substr and "\n" on each
1411          * iteration as the current "rarest" candidate string, which
1412          * means for example that we'll quickly reject the whole string if
1413          * hasn't got a \n, rather than trying every substr position
1414          * first
1415          */
1416 
1417         s = HOP3c(strend, - prog->minlen, strpos);
1418         if (s <= rx_origin ||
1419             ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1420         {
1421             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1422                             "  Did not find /%s^%s/m...\n",
1423                             PL_colors[0], PL_colors[1]));
1424             goto fail_finish;
1425         }
1426 
1427         /* earliest possible origin is 1 char after the \n.
1428          * (since *rx_origin == '\n', it's safe to ++ here rather than
1429          * HOP(rx_origin, 1)) */
1430         rx_origin++;
1431 
1432         if (prog->substrs->check_ix == 0  /* check is anchored */
1433             || rx_origin >= HOP3c(check_at,  - prog->check_offset_min, strpos))
1434         {
1435             /* Position contradicts check-string; either because
1436              * check was anchored (and thus has no wiggle room),
1437              * or check was float and rx_origin is above the float range */
1438             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1439                 "  Found /%s^%s/m, about to restart lookup for check-string with rx_origin %ld...\n",
1440                 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1441             goto restart;
1442         }
1443 
1444         /* if we get here, the check substr must have been float,
1445          * is in range, and we may or may not have had an anchored
1446          * "other" substr which still contradicts */
1447         assert(prog->substrs->check_ix); /* check is float */
1448 
1449         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1450             /* whoops, the anchored "other" substr exists, so we still
1451              * contradict. On the other hand, the float "check" substr
1452              * didn't contradict, so just retry the anchored "other"
1453              * substr */
1454             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1455                 "  Found /%s^%s/m, rescanning for anchored from offset %" IVdf " (rx_origin now %" IVdf ")...\n",
1456                 PL_colors[0], PL_colors[1],
1457                 (IV)(rx_origin - strbeg + prog->anchored_offset),
1458                 (IV)(rx_origin - strbeg)
1459             ));
1460             goto do_other_substr;
1461         }
1462 
1463         /* success: we don't contradict the found floating substring
1464          * (and there's no anchored substr). */
1465         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1466             "  Found /%s^%s/m with rx_origin %ld...\n",
1467             PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1468     }
1469     else {
1470         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1471             "  (multiline anchor test skipped)\n"));
1472     }
1473 
1474   success_at_start:
1475 
1476 
1477     /* if we have a starting character class, then test that extra constraint.
1478      * (trie stclasses are too expensive to use here, we are better off to
1479      * leave it to regmatch itself) */
1480 
1481     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1482         const U8* const str = (U8*)STRING(progi->regstclass);
1483 
1484         /* XXX this value could be pre-computed */
1485         const SSize_t cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1486                     ?  (reginfo->is_utf8_pat
1487                         ? (SSize_t)utf8_distance(str + STR_LEN(progi->regstclass), str)
1488                         : (SSize_t)STR_LEN(progi->regstclass))
1489                     : 1);
1490         char * endpos;
1491         char *s;
1492         /* latest pos that a matching float substr constrains rx start to */
1493         char *rx_max_float = NULL;
1494 
1495         /* if the current rx_origin is anchored, either by satisfying an
1496          * anchored substring constraint, or a /^.../m constraint, then we
1497          * can reject the current origin if the start class isn't found
1498          * at the current position. If we have a float-only match, then
1499          * rx_origin is constrained to a range; so look for the start class
1500          * in that range. if neither, then look for the start class in the
1501          * whole rest of the string */
1502 
1503         /* XXX DAPM it's not clear what the minlen test is for, and why
1504          * it's not used in the floating case. Nothing in the test suite
1505          * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1506          * Here are some old comments, which may or may not be correct:
1507          *
1508          *   minlen == 0 is possible if regstclass is \b or \B,
1509          *   and the fixed substr is ''$.
1510          *   Since minlen is already taken into account, rx_origin+1 is
1511          *   before strend; accidentally, minlen >= 1 guaranties no false
1512          *   positives at rx_origin + 1 even for \b or \B.  But (minlen? 1 :
1513          *   0) below assumes that regstclass does not come from lookahead...
1514          *   If regstclass takes bytelength more than 1: If charlength==1, OK.
1515          *   This leaves EXACTF-ish only, which are dealt with in
1516          *   find_byclass().
1517          */
1518 
1519         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1520             endpos = HOP3clim(rx_origin, (prog->minlen ? cl_l : 0), strend);
1521         else if (prog->float_substr || prog->float_utf8) {
1522             rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1523             endpos = HOP3clim(rx_max_float, cl_l, strend);
1524         }
1525         else
1526             endpos= strend;
1527 
1528         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1529             "  looking for class: start_shift: %" IVdf " check_at: %" IVdf
1530             " rx_origin: %" IVdf " endpos: %" IVdf "\n",
1531               (IV)start_shift, (IV)(check_at - strbeg),
1532               (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
1533 
1534         s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
1535                             reginfo);
1536         if (!s) {
1537             if (endpos == strend) {
1538                 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1539                                 "  Could not match STCLASS...\n") );
1540                 goto fail;
1541             }
1542             DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1543                                "  This position contradicts STCLASS...\n") );
1544             if ((prog->intflags & PREGf_ANCH) && !ml_anch
1545                         && !(prog->intflags & PREGf_IMPLICIT))
1546                 goto fail;
1547 
1548             /* Contradict one of substrings */
1549             if (prog->anchored_substr || prog->anchored_utf8) {
1550                 if (prog->substrs->check_ix == 1) { /* check is float */
1551                     /* Have both, check_string is floating */
1552                     assert(rx_origin + start_shift <= check_at);
1553                     if (rx_origin + start_shift != check_at) {
1554                         /* not at latest position float substr could match:
1555                          * Recheck anchored substring, but not floating.
1556                          * The condition above is in bytes rather than
1557                          * chars for efficiency. It's conservative, in
1558                          * that it errs on the side of doing 'goto
1559                          * do_other_substr'. In this case, at worst,
1560                          * an extra anchored search may get done, but in
1561                          * practice the extra fbm_instr() is likely to
1562                          * get skipped anyway. */
1563                         DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1564                             "  about to retry anchored at offset %ld (rx_origin now %" IVdf ")...\n",
1565                             (long)(other_last - strbeg),
1566                             (IV)(rx_origin - strbeg)
1567                         ));
1568                         goto do_other_substr;
1569                     }
1570                 }
1571             }
1572             else {
1573                 /* float-only */
1574 
1575                 if (ml_anch) {
1576                     /* In the presence of ml_anch, we might be able to
1577                      * find another \n without breaking the current float
1578                      * constraint. */
1579 
1580                     /* strictly speaking this should be HOP3c(..., 1, ...),
1581                      * but since we goto a block of code that's going to
1582                      * search for the next \n if any, its safe here */
1583                     rx_origin++;
1584                     DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1585                               "  about to look for /%s^%s/m starting at rx_origin %ld...\n",
1586                               PL_colors[0], PL_colors[1],
1587                               (long)(rx_origin - strbeg)) );
1588                     goto postprocess_substr_matches;
1589                 }
1590 
1591                 /* strictly speaking this can never be true; but might
1592                  * be if we ever allow intuit without substrings */
1593                 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
1594                     goto fail;
1595 
1596                 rx_origin = rx_max_float;
1597             }
1598 
1599             /* at this point, any matching substrings have been
1600              * contradicted. Start again... */
1601 
1602             rx_origin = HOP3c(rx_origin, 1, strend);
1603 
1604             /* uses bytes rather than char calculations for efficiency.
1605              * It's conservative: it errs on the side of doing 'goto restart',
1606              * where there is code that does a proper char-based test */
1607             if (rx_origin + start_shift + end_shift > strend) {
1608                 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1609                                        "  Could not match STCLASS...\n") );
1610                 goto fail;
1611             }
1612             DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1613                 "  about to look for %s substr starting at offset %ld (rx_origin now %" IVdf ")...\n",
1614                 (prog->substrs->check_ix ? "floating" : "anchored"),
1615                 (long)(rx_origin + start_shift - strbeg),
1616                 (IV)(rx_origin - strbeg)
1617             ));
1618             goto restart;
1619         }
1620 
1621         /* Success !!! */
1622 
1623         if (rx_origin != s) {
1624             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1625                         "  By STCLASS: moving %ld --> %ld\n",
1626                                   (long)(rx_origin - strbeg), (long)(s - strbeg))
1627                    );
1628         }
1629         else {
1630             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1631                                   "  Does not contradict STCLASS...\n");
1632                    );
1633         }
1634     }
1635 
1636     /* Decide whether using the substrings helped */
1637 
1638     if (rx_origin != strpos) {
1639         /* Fixed substring is found far enough so that the match
1640            cannot start at strpos. */
1641 
1642         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "  try at offset...\n"));
1643         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);	/* hooray/5 */
1644     }
1645     else {
1646         /* The found rx_origin position does not prohibit matching at
1647          * strpos, so calling intuit didn't gain us anything. Decrement
1648          * the BmUSEFUL() count on the check substring, and if we reach
1649          * zero, free it.  */
1650         if (!(prog->intflags & PREGf_NAUGHTY)
1651             && (utf8_target ? (
1652                 prog->check_utf8		/* Could be deleted already */
1653                 && --BmUSEFUL(prog->check_utf8) < 0
1654                 && (prog->check_utf8 == prog->float_utf8)
1655             ) : (
1656                 prog->check_substr		/* Could be deleted already */
1657                 && --BmUSEFUL(prog->check_substr) < 0
1658                 && (prog->check_substr == prog->float_substr)
1659             )))
1660         {
1661             /* If flags & SOMETHING - do not do it many times on the same match */
1662             DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "  ... Disabling check substring...\n"));
1663             /* XXX Does the destruction order has to change with utf8_target? */
1664             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1665             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1666             prog->check_substr = prog->check_utf8 = NULL;	/* disable */
1667             prog->float_substr = prog->float_utf8 = NULL;	/* clear */
1668             check = NULL;			/* abort */
1669             /* XXXX This is a remnant of the old implementation.  It
1670                     looks wasteful, since now INTUIT can use many
1671                     other heuristics. */
1672             prog->extflags &= ~RXf_USE_INTUIT;
1673         }
1674     }
1675 
1676     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1677             "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
1678              PL_colors[4], PL_colors[5], (long)(rx_origin - strbeg)) );
1679 
1680     return rx_origin;
1681 
1682   fail_finish:				/* Substring not found */
1683     if (prog->check_substr || prog->check_utf8)		/* could be removed already */
1684         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1685   fail:
1686     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch rejected by optimizer%s\n",
1687                           PL_colors[4], PL_colors[5]));
1688     return NULL;
1689 }
1690 
1691 
1692 #define DECL_TRIE_TYPE(scan) \
1693     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold,       \
1694                  trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold,              \
1695                  trie_utf8l, trie_flu8, trie_flu8_latin }                           \
1696                     trie_type = ((scan->flags == EXACT)                             \
1697                                  ? (utf8_target ? trie_utf8 : trie_plain)           \
1698                                  : (scan->flags == EXACTL)                          \
1699                                     ? (utf8_target ? trie_utf8l : trie_plain)       \
1700                                     : (scan->flags == EXACTFAA)                     \
1701                                       ? (utf8_target                                \
1702                                          ? trie_utf8_exactfa_fold                   \
1703                                          : trie_latin_utf8_exactfa_fold)            \
1704                                       : (scan->flags == EXACTFLU8                   \
1705                                          ? (utf8_target                             \
1706                                            ? trie_flu8                              \
1707                                            : trie_flu8_latin)                       \
1708                                          : (utf8_target                             \
1709                                            ? trie_utf8_fold                         \
1710                                            : trie_latin_utf8_fold)))
1711 
1712 /* 'uscan' is set to foldbuf, and incremented, so below the end of uscan is
1713  * 'foldbuf+sizeof(foldbuf)' */
1714 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uc_end, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1715 STMT_START {                                                                        \
1716     STRLEN skiplen;                                                                 \
1717     U8 flags = FOLD_FLAGS_FULL;                                                     \
1718     switch (trie_type) {                                                            \
1719     case trie_flu8:                                                                 \
1720         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1721         if (UTF8_IS_ABOVE_LATIN1(*uc)) {                                            \
1722             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end);                     \
1723         }                                                                           \
1724         goto do_trie_utf8_fold;                                                     \
1725     case trie_utf8_exactfa_fold:                                                    \
1726         flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1727         /* FALLTHROUGH */                                                           \
1728     case trie_utf8_fold:                                                            \
1729       do_trie_utf8_fold:                                                            \
1730         if ( foldlen>0 ) {                                                          \
1731             uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags );     \
1732             foldlen -= len;                                                         \
1733             uscan += len;                                                           \
1734             len=0;                                                                  \
1735         } else {                                                                    \
1736             uvc = _toFOLD_utf8_flags( (const U8*) uc, uc_end, foldbuf, &foldlen,    \
1737                                                                             flags); \
1738             len = UTF8_SAFE_SKIP(uc, uc_end);                                       \
1739             skiplen = UVCHR_SKIP( uvc );                                            \
1740             foldlen -= skiplen;                                                     \
1741             uscan = foldbuf + skiplen;                                              \
1742         }                                                                           \
1743         break;                                                                      \
1744     case trie_flu8_latin:                                                           \
1745         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1746         goto do_trie_latin_utf8_fold;                                               \
1747     case trie_latin_utf8_exactfa_fold:                                              \
1748         flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1749         /* FALLTHROUGH */                                                           \
1750     case trie_latin_utf8_fold:                                                      \
1751       do_trie_latin_utf8_fold:                                                      \
1752         if ( foldlen>0 ) {                                                          \
1753             uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags );     \
1754             foldlen -= len;                                                         \
1755             uscan += len;                                                           \
1756             len=0;                                                                  \
1757         } else {                                                                    \
1758             len = 1;                                                                \
1759             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags);             \
1760             skiplen = UVCHR_SKIP( uvc );                                            \
1761             foldlen -= skiplen;                                                     \
1762             uscan = foldbuf + skiplen;                                              \
1763         }                                                                           \
1764         break;                                                                      \
1765     case trie_utf8l:                                                                \
1766         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1767         if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) {                             \
1768             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end);                     \
1769         }                                                                           \
1770         /* FALLTHROUGH */                                                           \
1771     case trie_utf8:                                                                 \
1772         uvc = utf8n_to_uvchr( (const U8*) uc, uc_end - uc, &len, uniflags );        \
1773         break;                                                                      \
1774     case trie_plain:                                                                \
1775         uvc = (UV)*uc;                                                              \
1776         len = 1;                                                                    \
1777     }                                                                               \
1778     if (uvc < 256) {                                                                \
1779         charid = trie->charmap[ uvc ];                                              \
1780     }                                                                               \
1781     else {                                                                          \
1782         charid = 0;                                                                 \
1783         if (widecharmap) {                                                          \
1784             SV** const svpp = hv_fetch(widecharmap,                                 \
1785                         (char*)&uvc, sizeof(UV), 0);                                \
1786             if (svpp)                                                               \
1787                 charid = (U16)SvIV(*svpp);                                          \
1788         }                                                                           \
1789     }                                                                               \
1790 } STMT_END
1791 
1792 #define DUMP_EXEC_POS(li,s,doutf8,depth)                    \
1793     dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1794                 startpos, doutf8, depth)
1795 
1796 #define REXEC_FBC_UTF8_SCAN(CODE)                           \
1797     STMT_START {                                            \
1798         while (s < strend) {                                \
1799             CODE                                            \
1800             s += UTF8_SAFE_SKIP(s, reginfo->strend);        \
1801         }                                                   \
1802     } STMT_END
1803 
1804 #define REXEC_FBC_NON_UTF8_SCAN(CODE)                       \
1805     STMT_START {                                            \
1806         while (s < strend) {                                \
1807             CODE                                            \
1808             s++;                                            \
1809         }                                                   \
1810     } STMT_END
1811 
1812 #define REXEC_FBC_UTF8_CLASS_SCAN(COND)                     \
1813     STMT_START {                                            \
1814         while (s < strend) {                                \
1815             REXEC_FBC_UTF8_CLASS_SCAN_GUTS(COND)            \
1816         }                                                   \
1817     } STMT_END
1818 
1819 #define REXEC_FBC_NON_UTF8_CLASS_SCAN(COND)                 \
1820     STMT_START {                                            \
1821         while (s < strend) {                                \
1822             REXEC_FBC_NON_UTF8_CLASS_SCAN_GUTS(COND)        \
1823         }                                                   \
1824     } STMT_END
1825 
1826 #define REXEC_FBC_UTF8_CLASS_SCAN_GUTS(COND)                   \
1827     if (COND) {                                                \
1828         FBC_CHECK_AND_TRY                                      \
1829         s += UTF8_SAFE_SKIP(s, reginfo->strend);               \
1830         previous_occurrence_end = s;                           \
1831     }                                                          \
1832     else {                                                     \
1833         s += UTF8SKIP(s);                                      \
1834     }
1835 
1836 #define REXEC_FBC_NON_UTF8_CLASS_SCAN_GUTS(COND)               \
1837     if (COND) {                                                \
1838         FBC_CHECK_AND_TRY                                      \
1839         s++;                                                   \
1840         previous_occurrence_end = s;                           \
1841     }                                                          \
1842     else {                                                     \
1843         s++;                                                   \
1844     }
1845 
1846 /* We keep track of where the next character should start after an occurrence
1847  * of the one we're looking for.  Knowing that, we can see right away if the
1848  * next occurrence is adjacent to the previous.  When 'doevery' is FALSE, we
1849  * don't accept the 2nd and succeeding adjacent occurrences */
1850 #define FBC_CHECK_AND_TRY                                           \
1851         if (   (   doevery                                          \
1852                 || s != previous_occurrence_end)                    \
1853             && (   reginfo->intuit                                  \
1854                 || (s <= reginfo->strend && regtry(reginfo, &s))))  \
1855         {                                                           \
1856             goto got_it;                                            \
1857         }
1858 
1859 
1860 /* These differ from the above macros in that they call a function which
1861  * returns the next occurrence of the thing being looked for in 's'; and
1862  * 'strend' if there is no such occurrence.  'f' is something like fcn(a,b,c)
1863  * */
1864 #define REXEC_FBC_UTF8_FIND_NEXT_SCAN(f)                    \
1865     while (s < strend) {                                    \
1866         s = (char *) (f);                                   \
1867         if (s >= strend) {                                  \
1868             break;                                          \
1869         }                                                   \
1870                                                             \
1871         FBC_CHECK_AND_TRY                                   \
1872         s += UTF8SKIP(s);                                   \
1873         previous_occurrence_end = s;                        \
1874     }
1875 
1876 #define REXEC_FBC_NON_UTF8_FIND_NEXT_SCAN(f)                \
1877     while (s < strend) {                                    \
1878         s = (char *) (f);                                   \
1879         if (s >= strend) {                                  \
1880             break;                                          \
1881         }                                                   \
1882                                                             \
1883         FBC_CHECK_AND_TRY                                   \
1884         s++;                                                \
1885         previous_occurrence_end = s;                        \
1886     }
1887 
1888 /* This is like the above macro except the function returns NULL if there is no
1889  * occurrence, and there is a further condition that must be matched besides
1890  * the function */
1891 #define REXEC_FBC_FIND_NEXT_UTF8_SCAN_COND(f, COND)         \
1892     while (s < strend) {                                    \
1893         s = (char *) (f);                                     \
1894         if (s == NULL) {                                    \
1895             s = (char *) strend;                            \
1896             break;                                          \
1897         }                                                   \
1898                                                             \
1899         if (COND) {                                         \
1900             FBC_CHECK_AND_TRY                               \
1901             s += UTF8_SAFE_SKIP(s, reginfo->strend);        \
1902             previous_occurrence_end = s;                    \
1903         }                                                   \
1904         else {                                              \
1905             s += UTF8SKIP(s);                               \
1906         }                                                   \
1907     }
1908 
1909 /* This differs from the above macros in that it is passed a single byte that
1910  * is known to begin the next occurrence of the thing being looked for in 's'.
1911  * It does a memchr to find the next occurrence of 'byte', before trying 'COND'
1912  * at that position. */
1913 #define REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(byte, COND)                  \
1914     REXEC_FBC_FIND_NEXT_UTF8_SCAN_COND(memchr(s, byte, strend - s),     \
1915                                               COND)
1916 
1917 /* This is like the function above, but takes an entire string to look for
1918  * instead of a single byte */
1919 #define REXEC_FBC_FIND_NEXT_UTF8_STRING_SCAN(substr, substr_end, COND)      \
1920     REXEC_FBC_FIND_NEXT_UTF8_SCAN_COND(                                     \
1921                                      ninstr(s, strend, substr, substr_end), \
1922                                      COND)
1923 
1924 /* The four macros below are slightly different versions of the same logic.
1925  *
1926  * The first is for /a and /aa when the target string is UTF-8.  This can only
1927  * match ascii, but it must advance based on UTF-8.   The other three handle
1928  * the non-UTF-8 and the more generic UTF-8 cases.   In all four, we are
1929  * looking for the boundary (or non-boundary) between a word and non-word
1930  * character.  The utf8 and non-utf8 cases have the same logic, but the details
1931  * must be different.  Find the "wordness" of the character just prior to this
1932  * one, and compare it with the wordness of this one.  If they differ, we have
1933  * a boundary.  At the beginning of the string, pretend that the previous
1934  * character was a new-line.
1935  *
1936  * All these macros uncleanly have side-effects with each other and outside
1937  * variables.  So far it's been too much trouble to clean-up
1938  *
1939  * TEST_NON_UTF8 is the macro or function to call to test if its byte input is
1940  *               a word character or not.
1941  * IF_SUCCESS    is code to do if it finds that we are at a boundary between
1942  *               word/non-word
1943  * IF_FAIL       is code to do if we aren't at a boundary between word/non-word
1944  *
1945  * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we
1946  * are looking for a boundary or for a non-boundary.  If we are looking for a
1947  * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and
1948  * see if this tentative match actually works, and if so, to quit the loop
1949  * here.  And vice-versa if we are looking for a non-boundary.
1950  *
1951  * 'tmp' below in the next four macros in the REXEC_FBC_UTF8_SCAN and
1952  * REXEC_FBC_UTF8_SCAN loops is a loop invariant, a bool giving the return of
1953  * TEST_NON_UTF8(s-1).  To see this, note that that's what it is defined to be
1954  * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
1955  * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
1956  * complement.  But in that branch we complement tmp, meaning that at the
1957  * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s),
1958  * which means at the top of the loop in the next iteration, it is
1959  * TEST_NON_UTF8(s-1) */
1960 #define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)                         \
1961     tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                      \
1962     tmp = TEST_NON_UTF8(tmp);                                                  \
1963     REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */                     \
1964         if (tmp == ! TEST_NON_UTF8((U8) *s)) {                                 \
1965             tmp = !tmp;                                                        \
1966             IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */     \
1967         }                                                                      \
1968         else {                                                                 \
1969             IF_FAIL;                                                           \
1970         }                                                                      \
1971     );                                                                         \
1972 
1973 /* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
1974  * TEST_UTF8 is a macro that for the same input code points returns identically
1975  * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead (and an
1976  * end pointer as well) */
1977 #define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL)                      \
1978     if (s == reginfo->strbeg) {                                                \
1979         tmp = '\n';                                                            \
1980     }                                                                          \
1981     else { /* Back-up to the start of the previous character */                \
1982         U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg);              \
1983         tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r,                     \
1984                                                        0, UTF8_ALLOW_DEFAULT); \
1985     }                                                                          \
1986     tmp = TEST_UV(tmp);                                                        \
1987     REXEC_FBC_UTF8_SCAN(/* advances s while s < strend */                      \
1988         if (tmp == ! (TEST_UTF8((U8 *) s, (U8 *) reginfo->strend))) {          \
1989             tmp = !tmp;                                                        \
1990             IF_SUCCESS;                                                        \
1991         }                                                                      \
1992         else {                                                                 \
1993             IF_FAIL;                                                           \
1994         }                                                                      \
1995     );
1996 
1997 /* Like the above two macros, for a UTF-8 target string.  UTF8_CODE is the
1998  * complete code for handling UTF-8.  Common to the BOUND and NBOUND cases,
1999  * set-up by the FBC_BOUND, etc macros below */
2000 #define FBC_BOUND_COMMON_UTF8(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)   \
2001     UTF8_CODE;                                                                 \
2002     /* Here, things have been set up by the previous code so that tmp is the   \
2003      * return of TEST_NON_UTF8(s-1).  We also have to check if this matches    \
2004      * against the EOS, which we treat as a \n */                              \
2005     if (tmp == ! TEST_NON_UTF8('\n')) {                                        \
2006         IF_SUCCESS;                                                            \
2007     }                                                                          \
2008     else {                                                                     \
2009         IF_FAIL;                                                               \
2010     }
2011 
2012 /* Same as the macro above, but the target isn't UTF-8 */
2013 #define FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)       \
2014     tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                   \
2015     tmp = TEST_NON_UTF8(tmp);                                               \
2016     REXEC_FBC_NON_UTF8_SCAN(/* advances s while s < strend */               \
2017         if (tmp == ! TEST_NON_UTF8(UCHARAT(s))) {                           \
2018             IF_SUCCESS;                                                     \
2019             tmp = !tmp;                                                     \
2020         }                                                                   \
2021         else {                                                              \
2022             IF_FAIL;                                                        \
2023         }                                                                   \
2024     );                                                                      \
2025     /* Here, things have been set up by the previous code so that tmp is    \
2026      * the return of TEST_NON_UTF8(s-1).   We also have to check if this    \
2027      * matches against the EOS, which we treat as a \n */                   \
2028     if (tmp == ! TEST_NON_UTF8('\n')) {                                     \
2029         IF_SUCCESS;                                                         \
2030     }                                                                       \
2031     else {                                                                  \
2032         IF_FAIL;                                                            \
2033     }
2034 
2035 /* This is the macro to use when we want to see if something that looks like it
2036  * could match, actually does, and if so exits the loop.  It needs to be used
2037  * only for bounds checking macros, as it allows for matching beyond the end of
2038  * string (which should be zero length without having to look at the string
2039  * contents) */
2040 #define REXEC_FBC_TRYIT                                                     \
2041     if (reginfo->intuit || (s <= reginfo->strend && regtry(reginfo, &s)))   \
2042         goto got_it
2043 
2044 /* The only difference between the BOUND and NBOUND cases is that
2045  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
2046  * NBOUND.  This is accomplished by passing it as either the if or else clause,
2047  * with the other one being empty (PLACEHOLDER is defined as empty).
2048  *
2049  * The TEST_FOO parameters are for operating on different forms of input, but
2050  * all should be ones that return identically for the same underlying code
2051  * points */
2052 
2053 #define FBC_BOUND_UTF8(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                   \
2054     FBC_BOUND_COMMON_UTF8(                                                  \
2055           FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),       \
2056           TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2057 
2058 #define FBC_BOUND_NON_UTF8(TEST_NON_UTF8)                                   \
2059     FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2060 
2061 #define FBC_BOUND_A_UTF8(TEST_NON_UTF8)                                     \
2062     FBC_BOUND_COMMON_UTF8(                                                  \
2063                     FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),\
2064                     TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2065 
2066 #define FBC_BOUND_A_NON_UTF8(TEST_NON_UTF8)                                 \
2067     FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2068 
2069 #define FBC_NBOUND_UTF8(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                  \
2070     FBC_BOUND_COMMON_UTF8(                                                  \
2071               FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),   \
2072               TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2073 
2074 #define FBC_NBOUND_NON_UTF8(TEST_NON_UTF8)                                  \
2075     FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2076 
2077 #define FBC_NBOUND_A_UTF8(TEST_NON_UTF8)                                    \
2078     FBC_BOUND_COMMON_UTF8(                                                  \
2079             FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),        \
2080             TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2081 
2082 #define FBC_NBOUND_A_NON_UTF8(TEST_NON_UTF8)                                \
2083     FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2084 
2085 #ifdef DEBUGGING
2086 static IV
2087 S_get_break_val_cp_checked(SV* const invlist, const UV cp_in) {
2088   IV cp_out = _invlist_search(invlist, cp_in);
2089   assert(cp_out >= 0);
2090   return cp_out;
2091 }
2092 #  define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2093         invmap[S_get_break_val_cp_checked(invlist, cp)]
2094 #else
2095 #  define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2096         invmap[_invlist_search(invlist, cp)]
2097 #endif
2098 
2099 /* Takes a pointer to an inversion list, a pointer to its corresponding
2100  * inversion map, and a code point, and returns the code point's value
2101  * according to the two arrays.  It assumes that all code points have a value.
2102  * This is used as the base macro for macros for particular properties */
2103 #define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp)              \
2104         _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp)
2105 
2106 /* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead
2107  * of a code point, returning the value for the first code point in the string.
2108  * And it takes the particular macro name that finds the desired value given a
2109  * code point.  Merely convert the UTF-8 to code point and call the cp macro */
2110 #define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend)                     \
2111              (__ASSERT_(pos < strend)                                          \
2112                  /* Note assumes is valid UTF-8 */                             \
2113              (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL))))
2114 
2115 /* Returns the GCB value for the input code point */
2116 #define getGCB_VAL_CP(cp)                                                      \
2117           _generic_GET_BREAK_VAL_CP(                                           \
2118                                     PL_GCB_invlist,                            \
2119                                     _Perl_GCB_invmap,                          \
2120                                     (cp))
2121 
2122 /* Returns the GCB value for the first code point in the UTF-8 encoded string
2123  * bounded by pos and strend */
2124 #define getGCB_VAL_UTF8(pos, strend)                                           \
2125     _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
2126 
2127 /* Returns the LB value for the input code point */
2128 #define getLB_VAL_CP(cp)                                                       \
2129           _generic_GET_BREAK_VAL_CP(                                           \
2130                                     PL_LB_invlist,                             \
2131                                     _Perl_LB_invmap,                           \
2132                                     (cp))
2133 
2134 /* Returns the LB value for the first code point in the UTF-8 encoded string
2135  * bounded by pos and strend */
2136 #define getLB_VAL_UTF8(pos, strend)                                            \
2137     _generic_GET_BREAK_VAL_UTF8(getLB_VAL_CP, pos, strend)
2138 
2139 
2140 /* Returns the SB value for the input code point */
2141 #define getSB_VAL_CP(cp)                                                       \
2142           _generic_GET_BREAK_VAL_CP(                                           \
2143                                     PL_SB_invlist,                             \
2144                                     _Perl_SB_invmap,                     \
2145                                     (cp))
2146 
2147 /* Returns the SB value for the first code point in the UTF-8 encoded string
2148  * bounded by pos and strend */
2149 #define getSB_VAL_UTF8(pos, strend)                                            \
2150     _generic_GET_BREAK_VAL_UTF8(getSB_VAL_CP, pos, strend)
2151 
2152 /* Returns the WB value for the input code point */
2153 #define getWB_VAL_CP(cp)                                                       \
2154           _generic_GET_BREAK_VAL_CP(                                           \
2155                                     PL_WB_invlist,                             \
2156                                     _Perl_WB_invmap,                         \
2157                                     (cp))
2158 
2159 /* Returns the WB value for the first code point in the UTF-8 encoded string
2160  * bounded by pos and strend */
2161 #define getWB_VAL_UTF8(pos, strend)                                            \
2162     _generic_GET_BREAK_VAL_UTF8(getWB_VAL_CP, pos, strend)
2163 
2164 /* We know what class REx starts with.  Try to find this position... */
2165 /* if reginfo->intuit, its a dryrun */
2166 /* annoyingly all the vars in this routine have different names from their counterparts
2167    in regmatch. /grrr */
2168 STATIC char *
2169 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
2170     const char *strend, regmatch_info *reginfo)
2171 {
2172 
2173     /* TRUE if x+ need not match at just the 1st pos of run of x's */
2174     const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
2175 
2176     char *pat_string;   /* The pattern's exactish string */
2177     char *pat_end;	    /* ptr to end char of pat_string */
2178     re_fold_t folder;	/* Function for computing non-utf8 folds */
2179     const U8 *fold_array;   /* array for folding ords < 256 */
2180     STRLEN ln;
2181     STRLEN lnc;
2182     U8 c1;
2183     U8 c2;
2184     char *e = NULL;
2185 
2186     /* In some cases we accept only the first occurence of 'x' in a sequence of
2187      * them.  This variable points to just beyond the end of the previous
2188      * occurrence of 'x', hence we can tell if we are in a sequence.  (Having
2189      * it point to beyond the 'x' allows us to work for UTF-8 without having to
2190      * hop back.) */
2191     char * previous_occurrence_end = 0;
2192 
2193     I32 tmp;            /* Scratch variable */
2194     const bool utf8_target = reginfo->is_utf8_target;
2195     UV utf8_fold_flags = 0;
2196     const bool is_utf8_pat = reginfo->is_utf8_pat;
2197     bool to_complement = FALSE; /* Invert the result?  Taking the xor of this
2198                                    with a result inverts that result, as 0^1 =
2199                                    1 and 1^1 = 0 */
2200     _char_class_number classnum;
2201 
2202     RXi_GET_DECL(prog,progi);
2203 
2204     PERL_ARGS_ASSERT_FIND_BYCLASS;
2205 
2206     /* We know what class it must start with. The case statements below have
2207      * encoded the OP, and the UTF8ness of the target ('t8' for is UTF-8; 'tb'
2208      * for it isn't; 'b' stands for byte), and the UTF8ness of the pattern
2209      * ('p8' and 'pb'. */
2210     switch (with_tp_UTF8ness(OP(c), utf8_target, is_utf8_pat)) {
2211 
2212       case ANYOFPOSIXL_t8_pb:
2213       case ANYOFPOSIXL_t8_p8:
2214       case ANYOFL_t8_pb:
2215       case ANYOFL_t8_p8:
2216         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2217         CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(c);
2218 
2219         /* FALLTHROUGH */
2220 
2221       case ANYOFD_t8_pb:
2222       case ANYOFD_t8_p8:
2223       case ANYOF_t8_pb:
2224       case ANYOF_t8_p8:
2225         REXEC_FBC_UTF8_CLASS_SCAN(
2226                 reginclass(prog, c, (U8*)s, (U8*) strend, 1 /* is utf8 */));
2227         break;
2228 
2229       case ANYOFPOSIXL_tb_pb:
2230       case ANYOFPOSIXL_tb_p8:
2231       case ANYOFL_tb_pb:
2232       case ANYOFL_tb_p8:
2233         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2234         CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(c);
2235 
2236         /* FALLTHROUGH */
2237 
2238       case ANYOFD_tb_pb:
2239       case ANYOFD_tb_p8:
2240       case ANYOF_tb_pb:
2241       case ANYOF_tb_p8:
2242         if (ANYOF_FLAGS(c) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
2243             /* We know that s is in the bitmap range since the target isn't
2244              * UTF-8, so what happens for out-of-range values is not relevant,
2245              * so exclude that from the flags */
2246             REXEC_FBC_NON_UTF8_CLASS_SCAN(reginclass(prog,c, (U8*)s, (U8*)s+1,
2247                                                      0));
2248         }
2249         else {
2250             REXEC_FBC_NON_UTF8_CLASS_SCAN(ANYOF_BITMAP_TEST(c, *((U8*)s)));
2251         }
2252         break;
2253 
2254       case ANYOFM_tb_pb: /* ARG() is the base byte; FLAGS() the mask byte */
2255       case ANYOFM_tb_p8:
2256         REXEC_FBC_NON_UTF8_FIND_NEXT_SCAN(
2257              find_next_masked((U8 *) s, (U8 *) strend, (U8) ARG(c), FLAGS(c)));
2258         break;
2259 
2260       case ANYOFM_t8_pb:
2261       case ANYOFM_t8_p8:
2262         /* UTF-8ness doesn't matter because only matches UTF-8 invariants.  But
2263          * we do anyway for performance reasons, as otherwise we would have to
2264          * examine all the continuation characters */
2265         REXEC_FBC_UTF8_FIND_NEXT_SCAN(
2266              find_next_masked((U8 *) s, (U8 *) strend, (U8) ARG(c), FLAGS(c)));
2267         break;
2268 
2269       case NANYOFM_tb_pb:
2270       case NANYOFM_tb_p8:
2271         REXEC_FBC_NON_UTF8_FIND_NEXT_SCAN(
2272            find_span_end_mask((U8 *) s, (U8 *) strend, (U8) ARG(c), FLAGS(c)));
2273         break;
2274 
2275       case NANYOFM_t8_pb:
2276       case NANYOFM_t8_p8: /* UTF-8ness does matter because can match UTF-8
2277                                   variants. */
2278         REXEC_FBC_UTF8_FIND_NEXT_SCAN(
2279                         (char *) find_span_end_mask((U8 *) s, (U8 *) strend,
2280                                                     (U8) ARG(c), FLAGS(c)));
2281         break;
2282 
2283       /* These nodes all require at least one code point to be in UTF-8 to
2284        * match */
2285       case ANYOFH_tb_pb:
2286       case ANYOFH_tb_p8:
2287       case ANYOFHb_tb_pb:
2288       case ANYOFHb_tb_p8:
2289       case ANYOFHr_tb_pb:
2290       case ANYOFHr_tb_p8:
2291       case ANYOFHs_tb_pb:
2292       case ANYOFHs_tb_p8:
2293       case EXACTFLU8_tb_pb:
2294       case EXACTFLU8_tb_p8:
2295       case EXACTFU_REQ8_tb_pb:
2296       case EXACTFU_REQ8_tb_p8:
2297         break;
2298 
2299       case ANYOFH_t8_pb:
2300       case ANYOFH_t8_p8:
2301         REXEC_FBC_UTF8_CLASS_SCAN(
2302               (   (U8) NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c)
2303                && reginclass(prog, c, (U8*)s, (U8*) strend, 1 /* is utf8 */)));
2304         break;
2305 
2306       case ANYOFHb_t8_pb:
2307       case ANYOFHb_t8_p8:
2308         {
2309             /* We know what the first byte of any matched string should be. */
2310             U8 first_byte = FLAGS(c);
2311 
2312             REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
2313                     reginclass(prog, c, (U8*)s, (U8*) strend, 1 /* is utf8 */));
2314         }
2315         break;
2316 
2317       case ANYOFHr_t8_pb:
2318       case ANYOFHr_t8_p8:
2319         REXEC_FBC_UTF8_CLASS_SCAN(
2320                     (   inRANGE(NATIVE_UTF8_TO_I8(*s),
2321                                 LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)),
2322                                 HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)))
2323                     && reginclass(prog, c, (U8*)s, (U8*) strend,
2324                                                            1 /* is utf8 */)));
2325         break;
2326 
2327       case ANYOFHs_t8_pb:
2328       case ANYOFHs_t8_p8:
2329         REXEC_FBC_FIND_NEXT_UTF8_STRING_SCAN(
2330                         ((struct regnode_anyofhs *) c)->string,
2331                         /* Note FLAGS is the string length in this regnode */
2332                         ((struct regnode_anyofhs *) c)->string + FLAGS(c),
2333                         reginclass(prog, c, (U8*)s, (U8*) strend,
2334                                    1 /* is utf8 */));
2335         break;
2336 
2337       case ANYOFR_tb_pb:
2338       case ANYOFR_tb_p8:
2339         REXEC_FBC_NON_UTF8_CLASS_SCAN(withinCOUNT((U8) *s,
2340                                             ANYOFRbase(c), ANYOFRdelta(c)));
2341         break;
2342 
2343       case ANYOFR_t8_pb:
2344       case ANYOFR_t8_p8:
2345         REXEC_FBC_UTF8_CLASS_SCAN(
2346                             (   NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c)
2347                              && withinCOUNT(utf8_to_uvchr_buf((U8 *) s,
2348                                                               (U8 *) strend,
2349                                                               NULL),
2350                                             ANYOFRbase(c), ANYOFRdelta(c))));
2351         break;
2352 
2353       case ANYOFRb_tb_pb:
2354       case ANYOFRb_tb_p8:
2355         REXEC_FBC_NON_UTF8_CLASS_SCAN(withinCOUNT((U8) *s,
2356                                             ANYOFRbase(c), ANYOFRdelta(c)));
2357         break;
2358 
2359       case ANYOFRb_t8_pb:
2360       case ANYOFRb_t8_p8:
2361         {   /* We know what the first byte of any matched string should be */
2362             U8 first_byte = FLAGS(c);
2363 
2364             REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
2365                                 withinCOUNT(utf8_to_uvchr_buf((U8 *) s,
2366                                                               (U8 *) strend,
2367                                                               NULL),
2368                                             ANYOFRbase(c), ANYOFRdelta(c)));
2369         }
2370         break;
2371 
2372       case EXACTFAA_tb_pb:
2373 
2374         /* Latin1 folds are not affected by /a, except it excludes the sharp s,
2375          * which these functions don't handle anyway */
2376         fold_array = PL_fold_latin1;
2377         folder = foldEQ_latin1_s2_folded;
2378         goto do_exactf_non_utf8;
2379 
2380       case EXACTF_tb_pb:
2381         fold_array = PL_fold;
2382         folder = foldEQ;
2383         goto do_exactf_non_utf8;
2384 
2385       case EXACTFL_tb_pb:
2386         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2387 
2388         if (IN_UTF8_CTYPE_LOCALE) {
2389             utf8_fold_flags = FOLDEQ_LOCALE;
2390             goto do_exactf_utf8;
2391         }
2392 
2393         fold_array = PL_fold_locale;
2394         folder = foldEQ_locale;
2395         goto do_exactf_non_utf8;
2396 
2397       case EXACTFU_tb_pb:
2398         /* Any 'ss' in the pattern should have been replaced by regcomp, so we
2399          * don't have to worry here about this single special case in the
2400          * Latin1 range */
2401         fold_array = PL_fold_latin1;
2402         folder = foldEQ_latin1_s2_folded;
2403 
2404         /* FALLTHROUGH */
2405 
2406        do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
2407                               are no glitches with fold-length differences
2408                               between the target string and pattern */
2409 
2410         /* The idea in the non-utf8 EXACTF* cases is to first find the first
2411          * character of the EXACTF* node and then, if necessary,
2412          * case-insensitively compare the full text of the node.  c1 is the
2413          * first character.  c2 is its fold.  This logic will not work for
2414          * Unicode semantics and the german sharp ss, which hence should not be
2415          * compiled into a node that gets here. */
2416         pat_string = STRINGs(c);
2417         ln  = STR_LENs(c);	/* length to match in octets/bytes */
2418 
2419         /* We know that we have to match at least 'ln' bytes (which is the same
2420          * as characters, since not utf8).  If we have to match 3 characters,
2421          * and there are only 2 availabe, we know without trying that it will
2422          * fail; so don't start a match past the required minimum number from
2423          * the far end */
2424         e = HOP3c(strend, -((SSize_t)ln), s);
2425         if (e < s)
2426             break;
2427 
2428         c1 = *pat_string;
2429         c2 = fold_array[c1];
2430         if (c1 == c2) { /* If char and fold are the same */
2431             while (s <= e) {
2432                 s = (char *) memchr(s, c1, e + 1 - s);
2433                 if (s == NULL) {
2434                     break;
2435                 }
2436 
2437                 /* Check that the rest of the node matches */
2438                 if (   (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2439                     && (reginfo->intuit || regtry(reginfo, &s)) )
2440                 {
2441                     goto got_it;
2442                 }
2443                 s++;
2444             }
2445         }
2446         else {
2447             U8 bits_differing = c1 ^ c2;
2448 
2449             /* If the folds differ in one bit position only, we can mask to
2450              * match either of them, and can use this faster find method.  Both
2451              * ASCII and EBCDIC tend to have their case folds differ in only
2452              * one position, so this is very likely */
2453             if (LIKELY(PL_bitcount[bits_differing] == 1)) {
2454                 bits_differing = ~ bits_differing;
2455                 while (s <= e) {
2456                     s = (char *) find_next_masked((U8 *) s, (U8 *) e + 1,
2457                                         (c1 & bits_differing), bits_differing);
2458                     if (s > e) {
2459                         break;
2460                     }
2461 
2462                     if (   (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2463                         && (reginfo->intuit || regtry(reginfo, &s)) )
2464                     {
2465                         goto got_it;
2466                     }
2467                     s++;
2468                 }
2469             }
2470             else {  /* Otherwise, stuck with looking byte-at-a-time.  This
2471                        should actually happen only in EXACTFL nodes */
2472                 while (s <= e) {
2473                     if (    (*(U8*)s == c1 || *(U8*)s == c2)
2474                         && (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2475                         && (reginfo->intuit || regtry(reginfo, &s)) )
2476                     {
2477                         goto got_it;
2478                     }
2479                     s++;
2480                 }
2481             }
2482         }
2483         break;
2484 
2485       case EXACTFAA_tb_p8:
2486       case EXACTFAA_t8_p8:
2487         utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII
2488                          |FOLDEQ_S2_ALREADY_FOLDED
2489                          |FOLDEQ_S2_FOLDS_SANE;
2490         goto do_exactf_utf8;
2491 
2492       case EXACTFAA_NO_TRIE_tb_pb:
2493       case EXACTFAA_NO_TRIE_t8_pb:
2494       case EXACTFAA_t8_pb:
2495 
2496         /* Here, and elsewhere in this file, the reason we can't consider a
2497          * non-UTF-8 pattern already folded in the presence of a UTF-8 target
2498          * is because any MICRO SIGN in the pattern won't be folded.  Since the
2499          * fold of the MICRO SIGN requires UTF-8 to represent, we can consider
2500          * a non-UTF-8 pattern folded when matching a non-UTF-8 target */
2501         utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
2502         goto do_exactf_utf8;
2503 
2504       case EXACTFL_tb_p8:
2505       case EXACTFL_t8_pb:
2506       case EXACTFL_t8_p8:
2507         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2508         utf8_fold_flags = FOLDEQ_LOCALE;
2509         goto do_exactf_utf8;
2510 
2511       case EXACTFLU8_t8_pb:
2512       case EXACTFLU8_t8_p8:
2513         utf8_fold_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
2514                                          | FOLDEQ_S2_FOLDS_SANE;
2515         goto do_exactf_utf8;
2516 
2517       case EXACTFU_REQ8_t8_p8:
2518         utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
2519         goto do_exactf_utf8;
2520 
2521       case EXACTFU_tb_p8:
2522       case EXACTFU_t8_pb:
2523       case EXACTFU_t8_p8:
2524         utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
2525         goto do_exactf_utf8;
2526 
2527       /* The following are problematic even though pattern isn't UTF-8.  Use
2528        * full functionality normally not done except for UTF-8. */
2529       case EXACTF_t8_pb:
2530       case EXACTFUP_tb_pb:
2531       case EXACTFUP_t8_pb:
2532 
2533        do_exactf_utf8:
2534         {
2535             unsigned expansion;
2536 
2537             /* If one of the operands is in utf8, we can't use the simpler
2538              * folding above, due to the fact that many different characters
2539              * can have the same fold, or portion of a fold, or different-
2540              * length fold */
2541             pat_string = STRINGs(c);
2542             ln  = STR_LENs(c);	/* length to match in octets/bytes */
2543             pat_end = pat_string + ln;
2544             lnc = is_utf8_pat       /* length to match in characters */
2545                   ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
2546                   : ln;
2547 
2548             /* We have 'lnc' characters to match in the pattern, but because of
2549              * multi-character folding, each character in the target can match
2550              * up to 3 characters (Unicode guarantees it will never exceed
2551              * this) if it is utf8-encoded; and up to 2 if not (based on the
2552              * fact that the Latin 1 folds are already determined, and the only
2553              * multi-char fold in that range is the sharp-s folding to 'ss'.
2554              * Thus, a pattern character can match as little as 1/3 of a string
2555              * character.  Adjust lnc accordingly, rounding up, so that if we
2556              * need to match at least 4+1/3 chars, that really is 5. */
2557             expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
2558             lnc = (lnc + expansion - 1) / expansion;
2559 
2560             /* As in the non-UTF8 case, if we have to match 3 characters, and
2561              * only 2 are left, it's guaranteed to fail, so don't start a match
2562              * that would require us to go beyond the end of the string */
2563             e = HOP3c(strend, -((SSize_t)lnc), s);
2564 
2565             /* XXX Note that we could recalculate e to stop the loop earlier,
2566              * as the worst case expansion above will rarely be met, and as we
2567              * go along we would usually find that e moves further to the left.
2568              * This would happen only after we reached the point in the loop
2569              * where if there were no expansion we should fail.  Unclear if
2570              * worth the expense */
2571 
2572             while (s <= e) {
2573                 char *my_strend= (char *)strend;
2574                 if (   foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
2575                                          pat_string, NULL, ln, is_utf8_pat,
2576                                          utf8_fold_flags)
2577                     && (reginfo->intuit || regtry(reginfo, &s)) )
2578                 {
2579                     goto got_it;
2580                 }
2581                 s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
2582             }
2583         }
2584         break;
2585 
2586       case BOUNDA_tb_pb:
2587       case BOUNDA_tb_p8:
2588       case BOUND_tb_pb:  /* /d without utf8 target is /a */
2589       case BOUND_tb_p8:
2590         /* regcomp.c makes sure that these only have the traditional \b
2591          * meaning. */
2592         assert(FLAGS(c) == TRADITIONAL_BOUND);
2593 
2594         FBC_BOUND_A_NON_UTF8(isWORDCHAR_A);
2595         break;
2596 
2597       case BOUNDA_t8_pb: /* What /a matches is same under UTF-8 */
2598       case BOUNDA_t8_p8:
2599         /* regcomp.c makes sure that these only have the traditional \b
2600          * meaning. */
2601         assert(FLAGS(c) == TRADITIONAL_BOUND);
2602 
2603         FBC_BOUND_A_UTF8(isWORDCHAR_A);
2604         break;
2605 
2606       case NBOUNDA_tb_pb:
2607       case NBOUNDA_tb_p8:
2608       case NBOUND_tb_pb: /* /d without utf8 target is /a */
2609       case NBOUND_tb_p8:
2610         /* regcomp.c makes sure that these only have the traditional \b
2611          * meaning. */
2612         assert(FLAGS(c) == TRADITIONAL_BOUND);
2613 
2614         FBC_NBOUND_A_NON_UTF8(isWORDCHAR_A);
2615         break;
2616 
2617       case NBOUNDA_t8_pb: /* What /a matches is same under UTF-8 */
2618       case NBOUNDA_t8_p8:
2619         /* regcomp.c makes sure that these only have the traditional \b
2620          * meaning. */
2621         assert(FLAGS(c) == TRADITIONAL_BOUND);
2622 
2623         FBC_NBOUND_A_UTF8(isWORDCHAR_A);
2624         break;
2625 
2626       case NBOUNDU_tb_pb:
2627       case NBOUNDU_tb_p8:
2628         if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2629             FBC_NBOUND_NON_UTF8(isWORDCHAR_L1);
2630             break;
2631         }
2632 
2633         to_complement = 1;
2634         goto do_boundu_non_utf8;
2635 
2636       case NBOUNDL_tb_pb:
2637       case NBOUNDL_tb_p8:
2638         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2639         if (FLAGS(c) == TRADITIONAL_BOUND) {
2640             FBC_NBOUND_NON_UTF8(isWORDCHAR_LC);
2641             break;
2642         }
2643 
2644         CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
2645 
2646         to_complement = 1;
2647         goto do_boundu_non_utf8;
2648 
2649       case BOUNDL_tb_pb:
2650       case BOUNDL_tb_p8:
2651         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2652         if (FLAGS(c) == TRADITIONAL_BOUND) {
2653             FBC_BOUND_NON_UTF8(isWORDCHAR_LC);
2654             break;
2655         }
2656 
2657         CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
2658 
2659         goto do_boundu_non_utf8;
2660 
2661       case BOUNDU_tb_pb:
2662       case BOUNDU_tb_p8:
2663         if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2664             FBC_BOUND_NON_UTF8(isWORDCHAR_L1);
2665             break;
2666         }
2667 
2668       do_boundu_non_utf8:
2669         if (s == reginfo->strbeg) {
2670             if (reginfo->intuit || regtry(reginfo, &s))
2671             {
2672                 goto got_it;
2673             }
2674 
2675             /* Didn't match.  Try at the next position (if there is one) */
2676             s++;
2677             if (UNLIKELY(s >= reginfo->strend)) {
2678                 break;
2679             }
2680         }
2681 
2682         switch((bound_type) FLAGS(c)) {
2683           case TRADITIONAL_BOUND: /* Should have already been handled */
2684             assert(0);
2685             break;
2686 
2687           case GCB_BOUND:
2688             /* Not utf8.  Everything is a GCB except between CR and LF */
2689             while (s < strend) {
2690                 if ((to_complement ^ (   UCHARAT(s - 1) != '\r'
2691                                       || UCHARAT(s) != '\n'))
2692                     && (reginfo->intuit || regtry(reginfo, &s)))
2693                 {
2694                     goto got_it;
2695                 }
2696                 s++;
2697             }
2698 
2699             break;
2700 
2701           case LB_BOUND:
2702             {
2703                 LB_enum before = getLB_VAL_CP((U8) *(s -1));
2704                 while (s < strend) {
2705                     LB_enum after = getLB_VAL_CP((U8) *s);
2706                     if (to_complement ^ isLB(before,
2707                                              after,
2708                                              (U8*) reginfo->strbeg,
2709                                              (U8*) s,
2710                                              (U8*) reginfo->strend,
2711                                              0 /* target not utf8 */ )
2712                         && (reginfo->intuit || regtry(reginfo, &s)))
2713                     {
2714                         goto got_it;
2715                     }
2716                     before = after;
2717                     s++;
2718                 }
2719             }
2720 
2721             break;
2722 
2723           case SB_BOUND:
2724             {
2725                 SB_enum before = getSB_VAL_CP((U8) *(s -1));
2726                 while (s < strend) {
2727                     SB_enum after = getSB_VAL_CP((U8) *s);
2728                     if ((to_complement ^ isSB(before,
2729                                               after,
2730                                               (U8*) reginfo->strbeg,
2731                                               (U8*) s,
2732                                               (U8*) reginfo->strend,
2733                                              0 /* target not utf8 */ ))
2734                         && (reginfo->intuit || regtry(reginfo, &s)))
2735                     {
2736                         goto got_it;
2737                     }
2738                     before = after;
2739                     s++;
2740                 }
2741             }
2742 
2743             break;
2744 
2745           case WB_BOUND:
2746             {
2747                 WB_enum previous = WB_UNKNOWN;
2748                 WB_enum before = getWB_VAL_CP((U8) *(s -1));
2749                 while (s < strend) {
2750                     WB_enum after = getWB_VAL_CP((U8) *s);
2751                     if ((to_complement ^ isWB(previous,
2752                                               before,
2753                                               after,
2754                                               (U8*) reginfo->strbeg,
2755                                               (U8*) s,
2756                                               (U8*) reginfo->strend,
2757                                                0 /* target not utf8 */ ))
2758                         && (reginfo->intuit || regtry(reginfo, &s)))
2759                     {
2760                         goto got_it;
2761                     }
2762                     previous = before;
2763                     before = after;
2764                     s++;
2765                 }
2766             }
2767         }
2768 
2769         /* Here are at the final position in the target string, which is a
2770          * boundary by definition, so matches, depending on other constraints.
2771          * */
2772         if (   reginfo->intuit
2773             || (s <= reginfo->strend && regtry(reginfo, &s)))
2774         {
2775             goto got_it;
2776         }
2777 
2778         break;
2779 
2780       case BOUNDL_t8_pb:
2781       case BOUNDL_t8_p8:
2782         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2783         if (FLAGS(c) == TRADITIONAL_BOUND) {
2784             FBC_BOUND_UTF8(isWORDCHAR_LC, isWORDCHAR_LC_uvchr,
2785                            isWORDCHAR_LC_utf8_safe);
2786             break;
2787         }
2788 
2789         CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
2790 
2791         to_complement = 1;
2792         goto do_boundu_utf8;
2793 
2794       case NBOUNDL_t8_pb:
2795       case NBOUNDL_t8_p8:
2796         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2797         if (FLAGS(c) == TRADITIONAL_BOUND) {
2798             FBC_NBOUND_UTF8(isWORDCHAR_LC, isWORDCHAR_LC_uvchr,
2799                             isWORDCHAR_LC_utf8_safe);
2800             break;
2801         }
2802 
2803         CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
2804 
2805         to_complement = 1;
2806         goto do_boundu_utf8;
2807 
2808       case NBOUND_t8_pb:
2809       case NBOUND_t8_p8:
2810         /* regcomp.c makes sure that these only have the traditional \b
2811          * meaning. */
2812         assert(FLAGS(c) == TRADITIONAL_BOUND);
2813 
2814         /* FALLTHROUGH */
2815 
2816       case NBOUNDU_t8_pb:
2817       case NBOUNDU_t8_p8:
2818         if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2819             FBC_NBOUND_UTF8(isWORDCHAR_L1, isWORDCHAR_uni,
2820                             isWORDCHAR_utf8_safe);
2821             break;
2822         }
2823 
2824         to_complement = 1;
2825         goto do_boundu_utf8;
2826 
2827       case BOUND_t8_pb:
2828       case BOUND_t8_p8:
2829         /* regcomp.c makes sure that these only have the traditional \b
2830          * meaning. */
2831         assert(FLAGS(c) == TRADITIONAL_BOUND);
2832 
2833         /* FALLTHROUGH */
2834 
2835       case BOUNDU_t8_pb:
2836       case BOUNDU_t8_p8:
2837         if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2838             FBC_BOUND_UTF8(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2839             break;
2840         }
2841 
2842       do_boundu_utf8:
2843         if (s == reginfo->strbeg) {
2844             if (reginfo->intuit || regtry(reginfo, &s))
2845             {
2846                 goto got_it;
2847             }
2848 
2849             /* Didn't match.  Try at the next position (if there is one) */
2850             s += UTF8_SAFE_SKIP(s, reginfo->strend);
2851             if (UNLIKELY(s >= reginfo->strend)) {
2852                 break;
2853             }
2854         }
2855 
2856         switch((bound_type) FLAGS(c)) {
2857           case TRADITIONAL_BOUND: /* Should have already been handled */
2858             assert(0);
2859             break;
2860 
2861           case GCB_BOUND:
2862             {
2863                 GCB_enum before = getGCB_VAL_UTF8(
2864                                            reghop3((U8*)s, -1,
2865                                                    (U8*)(reginfo->strbeg)),
2866                                            (U8*) reginfo->strend);
2867                 while (s < strend) {
2868                     GCB_enum after = getGCB_VAL_UTF8((U8*) s,
2869                                                     (U8*) reginfo->strend);
2870                     if (   (to_complement ^ isGCB(before,
2871                                                   after,
2872                                                   (U8*) reginfo->strbeg,
2873                                                   (U8*) s,
2874                                                   1 /* target is utf8 */ ))
2875                         && (reginfo->intuit || regtry(reginfo, &s)))
2876                     {
2877                         goto got_it;
2878                     }
2879                     before = after;
2880                     s += UTF8_SAFE_SKIP(s, reginfo->strend);
2881                 }
2882             }
2883             break;
2884 
2885           case LB_BOUND:
2886             {
2887                 LB_enum before = getLB_VAL_UTF8(reghop3((U8*)s,
2888                                                         -1,
2889                                                         (U8*)(reginfo->strbeg)),
2890                                                    (U8*) reginfo->strend);
2891                 while (s < strend) {
2892                     LB_enum after = getLB_VAL_UTF8((U8*) s,
2893                                                    (U8*) reginfo->strend);
2894                     if (to_complement ^ isLB(before,
2895                                              after,
2896                                              (U8*) reginfo->strbeg,
2897                                              (U8*) s,
2898                                              (U8*) reginfo->strend,
2899                                              1 /* target is utf8 */ )
2900                         && (reginfo->intuit || regtry(reginfo, &s)))
2901                     {
2902                         goto got_it;
2903                     }
2904                     before = after;
2905                     s += UTF8_SAFE_SKIP(s, reginfo->strend);
2906                 }
2907             }
2908 
2909             break;
2910 
2911           case SB_BOUND:
2912             {
2913                 SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s,
2914                                                     -1,
2915                                                     (U8*)(reginfo->strbeg)),
2916                                                   (U8*) reginfo->strend);
2917                 while (s < strend) {
2918                     SB_enum after = getSB_VAL_UTF8((U8*) s,
2919                                                      (U8*) reginfo->strend);
2920                     if ((to_complement ^ isSB(before,
2921                                               after,
2922                                               (U8*) reginfo->strbeg,
2923                                               (U8*) s,
2924                                               (U8*) reginfo->strend,
2925                                               1 /* target is utf8 */ ))
2926                         && (reginfo->intuit || regtry(reginfo, &s)))
2927                     {
2928                         goto got_it;
2929                     }
2930                     before = after;
2931                     s += UTF8_SAFE_SKIP(s, reginfo->strend);
2932                 }
2933             }
2934 
2935             break;
2936 
2937           case WB_BOUND:
2938             {
2939                 /* We are at a boundary between char_sub_0 and char_sub_1.
2940                  * We also keep track of the value for char_sub_-1 as we
2941                  * loop through the line.   Context may be needed to make a
2942                  * determination, and if so, this can save having to
2943                  * recalculate it */
2944                 WB_enum previous = WB_UNKNOWN;
2945                 WB_enum before = getWB_VAL_UTF8(
2946                                           reghop3((U8*)s,
2947                                                   -1,
2948                                                   (U8*)(reginfo->strbeg)),
2949                                           (U8*) reginfo->strend);
2950                 while (s < strend) {
2951                     WB_enum after = getWB_VAL_UTF8((U8*) s,
2952                                                     (U8*) reginfo->strend);
2953                     if ((to_complement ^ isWB(previous,
2954                                               before,
2955                                               after,
2956                                               (U8*) reginfo->strbeg,
2957                                               (U8*) s,
2958                                               (U8*) reginfo->strend,
2959                                               1 /* target is utf8 */ ))
2960                         && (reginfo->intuit || regtry(reginfo, &s)))
2961                     {
2962                         goto got_it;
2963                     }
2964                     previous = before;
2965                     before = after;
2966                     s += UTF8_SAFE_SKIP(s, reginfo->strend);
2967                 }
2968             }
2969         }
2970 
2971         /* Here are at the final position in the target string, which is a
2972          * boundary by definition, so matches, depending on other constraints.
2973          * */
2974 
2975         if (   reginfo->intuit
2976             || (s <= reginfo->strend && regtry(reginfo, &s)))
2977         {
2978             goto got_it;
2979         }
2980         break;
2981 
2982       case LNBREAK_t8_pb:
2983       case LNBREAK_t8_p8:
2984         REXEC_FBC_UTF8_CLASS_SCAN(is_LNBREAK_utf8_safe(s, strend));
2985         break;
2986 
2987       case LNBREAK_tb_pb:
2988       case LNBREAK_tb_p8:
2989         REXEC_FBC_NON_UTF8_CLASS_SCAN(is_LNBREAK_latin1_safe(s, strend));
2990         break;
2991 
2992       /* The argument to all the POSIX node types is the class number to pass
2993        * to _generic_isCC() to build a mask for searching in PL_charclass[] */
2994 
2995       case NPOSIXL_t8_pb:
2996       case NPOSIXL_t8_p8:
2997         to_complement = 1;
2998         /* FALLTHROUGH */
2999 
3000       case POSIXL_t8_pb:
3001       case POSIXL_t8_p8:
3002         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
3003         REXEC_FBC_UTF8_CLASS_SCAN(
3004             to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s,
3005                                                           (U8 *) strend)));
3006         break;
3007 
3008       case NPOSIXL_tb_pb:
3009       case NPOSIXL_tb_p8:
3010         to_complement = 1;
3011         /* FALLTHROUGH */
3012 
3013       case POSIXL_tb_pb:
3014       case POSIXL_tb_p8:
3015         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
3016         REXEC_FBC_NON_UTF8_CLASS_SCAN(
3017                                 to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
3018         break;
3019 
3020       case NPOSIXA_t8_pb:
3021       case NPOSIXA_t8_p8:
3022         /* The complement of something that matches only ASCII matches all
3023          * non-ASCII, plus everything in ASCII that isn't in the class. */
3024         REXEC_FBC_UTF8_CLASS_SCAN(   ! isASCII_utf8_safe(s, strend)
3025                                   || ! _generic_isCC_A(*s, FLAGS(c)));
3026         break;
3027 
3028       case POSIXA_t8_pb:
3029       case POSIXA_t8_p8:
3030         /* Don't need to worry about utf8, as it can match only a single
3031          * byte invariant character.  But we do anyway for performance reasons,
3032          * as otherwise we would have to examine all the continuation
3033          * characters */
3034         REXEC_FBC_UTF8_CLASS_SCAN(_generic_isCC_A(*s, FLAGS(c)));
3035         break;
3036 
3037       case NPOSIXD_tb_pb:
3038       case NPOSIXD_tb_p8:
3039       case NPOSIXA_tb_pb:
3040       case NPOSIXA_tb_p8:
3041         to_complement = 1;
3042         /* FALLTHROUGH */
3043 
3044       case POSIXD_tb_pb:
3045       case POSIXD_tb_p8:
3046       case POSIXA_tb_pb:
3047       case POSIXA_tb_p8:
3048         REXEC_FBC_NON_UTF8_CLASS_SCAN(
3049                         to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
3050         break;
3051 
3052       case NPOSIXU_tb_pb:
3053       case NPOSIXU_tb_p8:
3054         to_complement = 1;
3055         /* FALLTHROUGH */
3056 
3057       case POSIXU_tb_pb:
3058       case POSIXU_tb_p8:
3059             REXEC_FBC_NON_UTF8_CLASS_SCAN(
3060                                  to_complement ^ cBOOL(_generic_isCC(*s,
3061                                                                     FLAGS(c))));
3062         break;
3063 
3064       case NPOSIXD_t8_pb:
3065       case NPOSIXD_t8_p8:
3066       case NPOSIXU_t8_pb:
3067       case NPOSIXU_t8_p8:
3068         to_complement = 1;
3069         /* FALLTHROUGH */
3070 
3071       case POSIXD_t8_pb:
3072       case POSIXD_t8_p8:
3073       case POSIXU_t8_pb:
3074       case POSIXU_t8_p8:
3075         classnum = (_char_class_number) FLAGS(c);
3076         switch (classnum) {
3077           default:
3078             REXEC_FBC_UTF8_CLASS_SCAN(
3079                         to_complement ^ cBOOL(_invlist_contains_cp(
3080                                                 PL_XPosix_ptrs[classnum],
3081                                                 utf8_to_uvchr_buf((U8 *) s,
3082                                                                 (U8 *) strend,
3083                                                                 NULL))));
3084             break;
3085 
3086           case _CC_ENUM_SPACE:
3087             REXEC_FBC_UTF8_CLASS_SCAN(
3088                         to_complement ^ cBOOL(isSPACE_utf8_safe(s, strend)));
3089             break;
3090 
3091           case _CC_ENUM_BLANK:
3092             REXEC_FBC_UTF8_CLASS_SCAN(
3093                         to_complement ^ cBOOL(isBLANK_utf8_safe(s, strend)));
3094             break;
3095 
3096           case _CC_ENUM_XDIGIT:
3097             REXEC_FBC_UTF8_CLASS_SCAN(
3098                         to_complement ^ cBOOL(isXDIGIT_utf8_safe(s, strend)));
3099             break;
3100 
3101           case _CC_ENUM_VERTSPACE:
3102             REXEC_FBC_UTF8_CLASS_SCAN(
3103                         to_complement ^ cBOOL(isVERTWS_utf8_safe(s, strend)));
3104             break;
3105 
3106           case _CC_ENUM_CNTRL:
3107             REXEC_FBC_UTF8_CLASS_SCAN(
3108                         to_complement ^ cBOOL(isCNTRL_utf8_safe(s, strend)));
3109             break;
3110         }
3111         break;
3112 
3113       case AHOCORASICKC_tb_pb:
3114       case AHOCORASICKC_tb_p8:
3115       case AHOCORASICKC_t8_pb:
3116       case AHOCORASICKC_t8_p8:
3117       case AHOCORASICK_tb_pb:
3118       case AHOCORASICK_tb_p8:
3119       case AHOCORASICK_t8_pb:
3120       case AHOCORASICK_t8_p8:
3121         {
3122             DECL_TRIE_TYPE(c);
3123             /* what trie are we using right now */
3124             reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
3125             reg_trie_data *trie = (reg_trie_data*)progi->data->data[aho->trie];
3126             HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
3127 
3128             const char *last_start = strend - trie->minlen;
3129 #ifdef DEBUGGING
3130             const char *real_start = s;
3131 #endif
3132             STRLEN maxlen = trie->maxlen;
3133             SV *sv_points;
3134             U8 **points; /* map of where we were in the input string
3135                             when reading a given char. For ASCII this
3136                             is unnecessary overhead as the relationship
3137                             is always 1:1, but for Unicode, especially
3138                             case folded Unicode this is not true. */
3139             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3140             U8 *bitmap=NULL;
3141 
3142 
3143             DECLARE_AND_GET_RE_DEBUG_FLAGS;
3144 
3145             /* We can't just allocate points here. We need to wrap it in
3146              * an SV so it gets freed properly if there is a croak while
3147              * running the match */
3148             ENTER;
3149             SAVETMPS;
3150             sv_points=newSV(maxlen * sizeof(U8 *));
3151             SvCUR_set(sv_points,
3152                 maxlen * sizeof(U8 *));
3153             SvPOK_on(sv_points);
3154             sv_2mortal(sv_points);
3155             points=(U8**)SvPV_nolen(sv_points );
3156             if ( trie_type != trie_utf8_fold
3157                  && (trie->bitmap || OP(c)==AHOCORASICKC) )
3158             {
3159                 if (trie->bitmap)
3160                     bitmap=(U8*)trie->bitmap;
3161                 else
3162                     bitmap=(U8*)ANYOF_BITMAP(c);
3163             }
3164             /* this is the Aho-Corasick algorithm modified a touch
3165                to include special handling for long "unknown char" sequences.
3166                The basic idea being that we use AC as long as we are dealing
3167                with a possible matching char, when we encounter an unknown char
3168                (and we have not encountered an accepting state) we scan forward
3169                until we find a legal starting char.
3170                AC matching is basically that of trie matching, except that when
3171                we encounter a failing transition, we fall back to the current
3172                states "fail state", and try the current char again, a process
3173                we repeat until we reach the root state, state 1, or a legal
3174                transition. If we fail on the root state then we can either
3175                terminate if we have reached an accepting state previously, or
3176                restart the entire process from the beginning if we have not.
3177 
3178              */
3179             while (s <= last_start) {
3180                 const U32 uniflags = UTF8_ALLOW_DEFAULT;
3181                 U8 *uc = (U8*)s;
3182                 U16 charid = 0;
3183                 U32 base = 1;
3184                 U32 state = 1;
3185                 UV uvc = 0;
3186                 STRLEN len = 0;
3187                 STRLEN foldlen = 0;
3188                 U8 *uscan = (U8*)NULL;
3189                 U8 *leftmost = NULL;
3190 #ifdef DEBUGGING
3191                 U32 accepted_word= 0;
3192 #endif
3193                 U32 pointpos = 0;
3194 
3195                 while ( state && uc <= (U8*)strend ) {
3196                     int failed=0;
3197                     U32 word = aho->states[ state ].wordnum;
3198 
3199                     if( state==1 ) {
3200                         if ( bitmap ) {
3201                             DEBUG_TRIE_EXECUTE_r(
3202                                 if (  uc <= (U8*)last_start
3203                                     && !BITMAP_TEST(bitmap,*uc) )
3204                                 {
3205                                     dump_exec_pos( (char *)uc, c, strend,
3206                                         real_start,
3207                                         (char *)uc, utf8_target, 0 );
3208                                     Perl_re_printf( aTHX_
3209                                         " Scanning for legal start char...\n");
3210                                 }
3211                             );
3212                             if (utf8_target) {
3213                                 while (  uc <= (U8*)last_start
3214                                        && !BITMAP_TEST(bitmap,*uc) )
3215                                 {
3216                                     uc += UTF8SKIP(uc);
3217                                 }
3218                             } else {
3219                                 while (  uc <= (U8*)last_start
3220                                        && ! BITMAP_TEST(bitmap,*uc) )
3221                                 {
3222                                     uc++;
3223                                 }
3224                             }
3225                             s= (char *)uc;
3226                         }
3227                         if (uc >(U8*)last_start) break;
3228                     }
3229 
3230                     if ( word ) {
3231                         U8 *lpos= points[ (pointpos - trie->wordinfo[word].len)
3232                                                                     % maxlen ];
3233                         if (!leftmost || lpos < leftmost) {
3234                             DEBUG_r(accepted_word=word);
3235                             leftmost= lpos;
3236                         }
3237                         if (base==0) break;
3238 
3239                     }
3240                     points[pointpos++ % maxlen]= uc;
3241                     if (foldlen || uc < (U8*)strend) {
3242                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3243                                              (U8 *) strend, uscan, len, uvc,
3244                                              charid, foldlen, foldbuf,
3245                                              uniflags);
3246                         DEBUG_TRIE_EXECUTE_r({
3247                             dump_exec_pos( (char *)uc, c, strend,
3248                                         real_start, s, utf8_target, 0);
3249                             Perl_re_printf( aTHX_
3250                                 " Charid:%3u CP:%4" UVxf " ",
3251                                  charid, uvc);
3252                         });
3253                     }
3254                     else {
3255                         len = 0;
3256                         charid = 0;
3257                     }
3258 
3259 
3260                     do {
3261 #ifdef DEBUGGING
3262                         word = aho->states[ state ].wordnum;
3263 #endif
3264                         base = aho->states[ state ].trans.base;
3265 
3266                         DEBUG_TRIE_EXECUTE_r({
3267                             if (failed)
3268                                 dump_exec_pos((char *)uc, c, strend, real_start,
3269                                     s,   utf8_target, 0 );
3270                             Perl_re_printf( aTHX_
3271                                 "%sState: %4" UVxf ", word=%" UVxf,
3272                                 failed ? " Fail transition to " : "",
3273                                 (UV)state, (UV)word);
3274                         });
3275                         if ( base ) {
3276                             U32 tmp;
3277                             I32 offset;
3278                             if (charid &&
3279                                  ( ((offset = base + charid
3280                                     - 1 - trie->uniquecharcount)) >= 0)
3281                                  && ((U32)offset < trie->lasttrans)
3282                                  && trie->trans[offset].check == state
3283                                  && (tmp=trie->trans[offset].next))
3284                             {
3285                                 DEBUG_TRIE_EXECUTE_r(
3286                                     Perl_re_printf( aTHX_ " - legal\n"));
3287                                 state = tmp;
3288                                 break;
3289                             }
3290                             else {
3291                                 DEBUG_TRIE_EXECUTE_r(
3292                                     Perl_re_printf( aTHX_ " - fail\n"));
3293                                 failed = 1;
3294                                 state = aho->fail[state];
3295                             }
3296                         }
3297                         else {
3298                             /* we must be accepting here */
3299                             DEBUG_TRIE_EXECUTE_r(
3300                                     Perl_re_printf( aTHX_ " - accepting\n"));
3301                             failed = 1;
3302                             break;
3303                         }
3304                     } while(state);
3305                     uc += len;
3306                     if (failed) {
3307                         if (leftmost)
3308                             break;
3309                         if (!state) state = 1;
3310                     }
3311                 }
3312                 if ( aho->states[ state ].wordnum ) {
3313                     U8 *lpos = points[ (pointpos
3314                                       - trie->wordinfo[aho->states[ state ]
3315                                                     .wordnum].len) % maxlen ];
3316                     if (!leftmost || lpos < leftmost) {
3317                         DEBUG_r(accepted_word=aho->states[ state ].wordnum);
3318                         leftmost = lpos;
3319                     }
3320                 }
3321                 if (leftmost) {
3322                     s = (char*)leftmost;
3323                     DEBUG_TRIE_EXECUTE_r({
3324                         Perl_re_printf( aTHX_  "Matches word #%" UVxf
3325                                         " at position %" IVdf ". Trying full"
3326                                         " pattern...\n",
3327                             (UV)accepted_word, (IV)(s - real_start)
3328                         );
3329                     });
3330                     if (reginfo->intuit || regtry(reginfo, &s)) {
3331                         FREETMPS;
3332                         LEAVE;
3333                         goto got_it;
3334                     }
3335                     if (s < reginfo->strend) {
3336                         s = HOPc(s,1);
3337                     }
3338                     DEBUG_TRIE_EXECUTE_r({
3339                         Perl_re_printf( aTHX_
3340                                        "Pattern failed. Looking for new start"
3341                                        " point...\n");
3342                     });
3343                 } else {
3344                     DEBUG_TRIE_EXECUTE_r(
3345                         Perl_re_printf( aTHX_ "No match.\n"));
3346                     break;
3347                 }
3348             }
3349             FREETMPS;
3350             LEAVE;
3351         }
3352         break;
3353 
3354       case EXACTFU_REQ8_t8_pb:
3355       case EXACTFUP_tb_p8:
3356       case EXACTFUP_t8_p8:
3357       case EXACTF_tb_p8:
3358       case EXACTF_t8_p8:   /* This node only generated for non-utf8 patterns */
3359       case EXACTFAA_NO_TRIE_tb_p8:
3360       case EXACTFAA_NO_TRIE_t8_p8: /* This node only generated for non-utf8
3361                                       patterns */
3362         assert(0);
3363 
3364       default:
3365         Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
3366     } /* End of switch on node type */
3367 
3368     return 0;
3369 
3370   got_it:
3371     return s;
3372 }
3373 
3374 /* set RX_SAVED_COPY, RX_SUBBEG etc.
3375  * flags have same meanings as with regexec_flags() */
3376 
3377 static void
3378 S_reg_set_capture_string(pTHX_ REGEXP * const rx,
3379                             char *strbeg,
3380                             char *strend,
3381                             SV *sv,
3382                             U32 flags,
3383                             bool utf8_target)
3384 {
3385     struct regexp *const prog = ReANY(rx);
3386 
3387     if (flags & REXEC_COPY_STR) {
3388 #ifdef PERL_ANY_COW
3389         if (SvCANCOW(sv)) {
3390             DEBUG_C(Perl_re_printf( aTHX_
3391                               "Copy on write: regexp capture, type %d\n",
3392                                     (int) SvTYPE(sv)));
3393             /* Create a new COW SV to share the match string and store
3394              * in saved_copy, unless the current COW SV in saved_copy
3395              * is valid and suitable for our purpose */
3396             if ((   prog->saved_copy
3397                  && SvIsCOW(prog->saved_copy)
3398                  && SvPOKp(prog->saved_copy)
3399                  && SvIsCOW(sv)
3400                  && SvPOKp(sv)
3401                  && SvPVX(sv) == SvPVX(prog->saved_copy)))
3402             {
3403                 /* just reuse saved_copy SV */
3404                 if (RXp_MATCH_COPIED(prog)) {
3405                     Safefree(prog->subbeg);
3406                     RXp_MATCH_COPIED_off(prog);
3407                 }
3408             }
3409             else {
3410                 /* create new COW SV to share string */
3411                 RXp_MATCH_COPY_FREE(prog);
3412                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
3413             }
3414             prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
3415             assert (SvPOKp(prog->saved_copy));
3416             prog->sublen  = strend - strbeg;
3417             prog->suboffset = 0;
3418             prog->subcoffset = 0;
3419         } else
3420 #endif
3421         {
3422             SSize_t min = 0;
3423             SSize_t max = strend - strbeg;
3424             SSize_t sublen;
3425 
3426             if (    (flags & REXEC_COPY_SKIP_POST)
3427                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
3428                 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
3429             ) { /* don't copy $' part of string */
3430                 U32 n = 0;
3431                 max = -1;
3432                 /* calculate the right-most part of the string covered
3433                  * by a capture. Due to lookahead, this may be to
3434                  * the right of $&, so we have to scan all captures */
3435                 while (n <= prog->lastparen) {
3436                     if (prog->offs[n].end > max)
3437                         max = prog->offs[n].end;
3438                     n++;
3439                 }
3440                 if (max == -1)
3441                     max = (PL_sawampersand & SAWAMPERSAND_LEFT)
3442                             ? prog->offs[0].start
3443                             : 0;
3444                 assert(max >= 0 && max <= strend - strbeg);
3445             }
3446 
3447             if (    (flags & REXEC_COPY_SKIP_PRE)
3448                 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
3449                 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
3450             ) { /* don't copy $` part of string */
3451                 U32 n = 0;
3452                 min = max;
3453                 /* calculate the left-most part of the string covered
3454                  * by a capture. Due to lookbehind, this may be to
3455                  * the left of $&, so we have to scan all captures */
3456                 while (min && n <= prog->lastparen) {
3457                     if (   prog->offs[n].start != -1
3458                         && prog->offs[n].start < min)
3459                     {
3460                         min = prog->offs[n].start;
3461                     }
3462                     n++;
3463                 }
3464                 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
3465                     && min >  prog->offs[0].end
3466                 )
3467                     min = prog->offs[0].end;
3468 
3469             }
3470 
3471             assert(min >= 0 && min <= max && min <= strend - strbeg);
3472             sublen = max - min;
3473 
3474             if (RXp_MATCH_COPIED(prog)) {
3475                 if (sublen > prog->sublen)
3476                     prog->subbeg =
3477                             (char*)saferealloc(prog->subbeg, sublen+1);
3478             }
3479             else
3480                 prog->subbeg = (char*)safemalloc(sublen+1);
3481             Copy(strbeg + min, prog->subbeg, sublen, char);
3482             prog->subbeg[sublen] = '\0';
3483             prog->suboffset = min;
3484             prog->sublen = sublen;
3485             RXp_MATCH_COPIED_on(prog);
3486         }
3487         prog->subcoffset = prog->suboffset;
3488         if (prog->suboffset && utf8_target) {
3489             /* Convert byte offset to chars.
3490              * XXX ideally should only compute this if @-/@+
3491              * has been seen, a la PL_sawampersand ??? */
3492 
3493             /* If there's a direct correspondence between the
3494              * string which we're matching and the original SV,
3495              * then we can use the utf8 len cache associated with
3496              * the SV. In particular, it means that under //g,
3497              * sv_pos_b2u() will use the previously cached
3498              * position to speed up working out the new length of
3499              * subcoffset, rather than counting from the start of
3500              * the string each time. This stops
3501              *   $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
3502              * from going quadratic */
3503             if (SvPOKp(sv) && SvPVX(sv) == strbeg)
3504                 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
3505                                                 SV_GMAGIC|SV_CONST_RETURN);
3506             else
3507                 prog->subcoffset = utf8_length((U8*)strbeg,
3508                                     (U8*)(strbeg+prog->suboffset));
3509         }
3510     }
3511     else {
3512         RXp_MATCH_COPY_FREE(prog);
3513         prog->subbeg = strbeg;
3514         prog->suboffset = 0;
3515         prog->subcoffset = 0;
3516         prog->sublen = strend - strbeg;
3517     }
3518 }
3519 
3520 
3521 
3522 
3523 /*
3524  - regexec_flags - match a regexp against a string
3525  */
3526 I32
3527 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
3528               char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
3529 /* stringarg: the point in the string at which to begin matching */
3530 /* strend:    pointer to null at end of string */
3531 /* strbeg:    real beginning of string */
3532 /* minend:    end of match must be >= minend bytes after stringarg. */
3533 /* sv:        SV being matched: only used for utf8 flag, pos() etc; string
3534  *            itself is accessed via the pointers above */
3535 /* data:      May be used for some additional optimizations.
3536               Currently unused. */
3537 /* flags:     For optimizations. See REXEC_* in regexp.h */
3538 
3539 {
3540     struct regexp *const prog = ReANY(rx);
3541     char *s;
3542     regnode *c;
3543     char *startpos;
3544     SSize_t minlen;		/* must match at least this many chars */
3545     SSize_t dontbother = 0;	/* how many characters not to try at end */
3546     const bool utf8_target = cBOOL(DO_UTF8(sv));
3547     I32 multiline;
3548     RXi_GET_DECL(prog,progi);
3549     regmatch_info reginfo_buf;  /* create some info to pass to regtry etc */
3550     regmatch_info *const reginfo = &reginfo_buf;
3551     regexp_paren_pair *swap = NULL;
3552     I32 oldsave;
3553     DECLARE_AND_GET_RE_DEBUG_FLAGS;
3554 
3555     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
3556     PERL_UNUSED_ARG(data);
3557 
3558     /* Be paranoid... */
3559     if (prog == NULL) {
3560         Perl_croak(aTHX_ "NULL regexp parameter");
3561     }
3562 
3563     DEBUG_EXECUTE_r(
3564         debug_start_match(rx, utf8_target, stringarg, strend,
3565         "Matching");
3566     );
3567 
3568     startpos = stringarg;
3569 
3570     /* set these early as they may be used by the HOP macros below */
3571     reginfo->strbeg = strbeg;
3572     reginfo->strend = strend;
3573     reginfo->is_utf8_target = cBOOL(utf8_target);
3574 
3575     if (prog->intflags & PREGf_GPOS_SEEN) {
3576         MAGIC *mg;
3577 
3578         /* set reginfo->ganch, the position where \G can match */
3579 
3580         reginfo->ganch =
3581             (flags & REXEC_IGNOREPOS)
3582             ? stringarg /* use start pos rather than pos() */
3583             : ((mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
3584               /* Defined pos(): */
3585             ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
3586             : strbeg; /* pos() not defined; use start of string */
3587 
3588         DEBUG_GPOS_r(Perl_re_printf( aTHX_
3589             "GPOS ganch set to strbeg[%" IVdf "]\n", (IV)(reginfo->ganch - strbeg)));
3590 
3591         /* in the presence of \G, we may need to start looking earlier in
3592          * the string than the suggested start point of stringarg:
3593          * if prog->gofs is set, then that's a known, fixed minimum
3594          * offset, such as
3595          * /..\G/:   gofs = 2
3596          * /ab|c\G/: gofs = 1
3597          * or if the minimum offset isn't known, then we have to go back
3598          * to the start of the string, e.g. /w+\G/
3599          */
3600 
3601         if (prog->intflags & PREGf_ANCH_GPOS) {
3602             if (prog->gofs) {
3603                 startpos = HOPBACKc(reginfo->ganch, prog->gofs);
3604                 if (!startpos ||
3605                     ((flags & REXEC_FAIL_ON_UNDERFLOW) && startpos < stringarg))
3606                 {
3607                     DEBUG_GPOS_r(Perl_re_printf( aTHX_
3608                             "fail: ganch-gofs before earliest possible start\n"));
3609                     return 0;
3610                 }
3611             }
3612             else
3613                 startpos = reginfo->ganch;
3614         }
3615         else if (prog->gofs) {
3616             startpos = HOPBACKc(startpos, prog->gofs);
3617             if (!startpos)
3618                 startpos = strbeg;
3619         }
3620         else if (prog->intflags & PREGf_GPOS_FLOAT)
3621             startpos = strbeg;
3622     }
3623 
3624     minlen = prog->minlen;
3625     if ((startpos + minlen) > strend || startpos < strbeg) {
3626         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3627                         "Regex match can't succeed, so not even tried\n"));
3628         return 0;
3629     }
3630 
3631     /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
3632      * which will call destuctors to reset PL_regmatch_state, free higher
3633      * PL_regmatch_slabs, and clean up regmatch_info_aux and
3634      * regmatch_info_aux_eval */
3635 
3636     oldsave = PL_savestack_ix;
3637 
3638     s = startpos;
3639 
3640     if ((prog->extflags & RXf_USE_INTUIT)
3641         && !(flags & REXEC_CHECKED))
3642     {
3643         s = re_intuit_start(rx, sv, strbeg, startpos, strend,
3644                                     flags, NULL);
3645         if (!s)
3646             return 0;
3647 
3648         if (prog->extflags & RXf_CHECK_ALL) {
3649             /* we can match based purely on the result of INTUIT.
3650              * Set up captures etc just for $& and $-[0]
3651              * (an intuit-only match wont have $1,$2,..) */
3652             assert(!prog->nparens);
3653 
3654             /* s/// doesn't like it if $& is earlier than where we asked it to
3655              * start searching (which can happen on something like /.\G/) */
3656             if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
3657                     && (s < stringarg))
3658             {
3659                 /* this should only be possible under \G */
3660                 assert(prog->intflags & PREGf_GPOS_SEEN);
3661                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3662                     "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3663                 goto phooey;
3664             }
3665 
3666             /* match via INTUIT shouldn't have any captures.
3667              * Let @-, @+, $^N know */
3668             prog->lastparen = prog->lastcloseparen = 0;
3669             RXp_MATCH_UTF8_set(prog, utf8_target);
3670             prog->offs[0].start = s - strbeg;
3671             prog->offs[0].end = utf8_target
3672                 ? (char*)utf8_hop_forward((U8*)s, prog->minlenret, (U8 *) strend) - strbeg
3673                 : s - strbeg + prog->minlenret;
3674             if ( !(flags & REXEC_NOT_FIRST) )
3675                 S_reg_set_capture_string(aTHX_ rx,
3676                                         strbeg, strend,
3677                                         sv, flags, utf8_target);
3678 
3679             return 1;
3680         }
3681     }
3682 
3683     multiline = prog->extflags & RXf_PMf_MULTILINE;
3684 
3685     if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
3686         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3687                               "String too short [regexec_flags]...\n"));
3688         goto phooey;
3689     }
3690 
3691     /* Check validity of program. */
3692     if (UCHARAT(progi->program) != REG_MAGIC) {
3693         Perl_croak(aTHX_ "corrupted regexp program");
3694     }
3695 
3696     RXp_MATCH_TAINTED_off(prog);
3697     RXp_MATCH_UTF8_set(prog, utf8_target);
3698 
3699     reginfo->prog = rx;	 /* Yes, sorry that this is confusing.  */
3700     reginfo->intuit = 0;
3701     reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
3702     reginfo->warned = FALSE;
3703     reginfo->sv = sv;
3704     reginfo->poscache_maxiter = 0; /* not yet started a countdown */
3705     /* see how far we have to get to not match where we matched before */
3706     reginfo->till = stringarg + minend;
3707 
3708     if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
3709         /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
3710            S_cleanup_regmatch_info_aux has executed (registered by
3711            SAVEDESTRUCTOR_X below).  S_cleanup_regmatch_info_aux modifies
3712            magic belonging to this SV.
3713            Not newSVsv, either, as it does not COW.
3714         */
3715         reginfo->sv = newSV_type(SVt_NULL);
3716         SvSetSV_nosteal(reginfo->sv, sv);
3717         SAVEFREESV(reginfo->sv);
3718     }
3719 
3720     /* reserve next 2 or 3 slots in PL_regmatch_state:
3721      * slot N+0: may currently be in use: skip it
3722      * slot N+1: use for regmatch_info_aux struct
3723      * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
3724      * slot N+3: ready for use by regmatch()
3725      */
3726 
3727     {
3728         regmatch_state *old_regmatch_state;
3729         regmatch_slab  *old_regmatch_slab;
3730         int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
3731 
3732         /* on first ever match, allocate first slab */
3733         if (!PL_regmatch_slab) {
3734             Newx(PL_regmatch_slab, 1, regmatch_slab);
3735             PL_regmatch_slab->prev = NULL;
3736             PL_regmatch_slab->next = NULL;
3737             PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3738         }
3739 
3740         old_regmatch_state = PL_regmatch_state;
3741         old_regmatch_slab  = PL_regmatch_slab;
3742 
3743         for (i=0; i <= max; i++) {
3744             if (i == 1)
3745                 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
3746             else if (i ==2)
3747                 reginfo->info_aux_eval =
3748                 reginfo->info_aux->info_aux_eval =
3749                             &(PL_regmatch_state->u.info_aux_eval);
3750 
3751             if (++PL_regmatch_state >  SLAB_LAST(PL_regmatch_slab))
3752                 PL_regmatch_state = S_push_slab(aTHX);
3753         }
3754 
3755         /* note initial PL_regmatch_state position; at end of match we'll
3756          * pop back to there and free any higher slabs */
3757 
3758         reginfo->info_aux->old_regmatch_state = old_regmatch_state;
3759         reginfo->info_aux->old_regmatch_slab  = old_regmatch_slab;
3760         reginfo->info_aux->poscache = NULL;
3761 
3762         SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
3763 
3764         if ((prog->extflags & RXf_EVAL_SEEN))
3765             S_setup_eval_state(aTHX_ reginfo);
3766         else
3767             reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
3768     }
3769 
3770     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
3771         /* We have to be careful. If the previous successful match
3772            was from this regex we don't want a subsequent partially
3773            successful match to clobber the old results.
3774            So when we detect this possibility we add a swap buffer
3775            to the re, and switch the buffer each match. If we fail,
3776            we switch it back; otherwise we leave it swapped.
3777         */
3778         swap = prog->offs;
3779         /* avoid leak if we die, or clean up anyway if match completes */
3780         SAVEFREEPV(swap);
3781         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
3782         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
3783             "rex=0x%" UVxf " saving  offs: orig=0x%" UVxf " new=0x%" UVxf "\n",
3784             0,
3785             PTR2UV(prog),
3786             PTR2UV(swap),
3787             PTR2UV(prog->offs)
3788         ));
3789     }
3790 
3791     if (prog->recurse_locinput)
3792         Zero(prog->recurse_locinput,prog->nparens + 1, char *);
3793 
3794     /* Simplest case: anchored match (but not \G) need be tried only once,
3795      * or with MBOL, only at the beginning of each line.
3796      *
3797      * Note that /.*.../ sets PREGf_IMPLICIT|MBOL, while /.*.../s sets
3798      * PREGf_IMPLICIT|SBOL. The idea is that with /.*.../s, if it doesn't
3799      * match at the start of the string then it won't match anywhere else
3800      * either; while with /.*.../, if it doesn't match at the beginning,
3801      * the earliest it could match is at the start of the next line */
3802 
3803     if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
3804         char *end;
3805 
3806         if (regtry(reginfo, &s))
3807             goto got_it;
3808 
3809         if (!(prog->intflags & PREGf_ANCH_MBOL))
3810             goto phooey;
3811 
3812         /* didn't match at start, try at other newline positions */
3813 
3814         if (minlen)
3815             dontbother = minlen - 1;
3816         end = HOP3c(strend, -dontbother, strbeg) - 1;
3817 
3818         /* skip to next newline */
3819 
3820         while (s <= end) { /* note it could be possible to match at the end of the string */
3821             /* NB: newlines are the same in unicode as they are in latin */
3822             if (*s++ != '\n')
3823                 continue;
3824             if (prog->check_substr || prog->check_utf8) {
3825             /* note that with PREGf_IMPLICIT, intuit can only fail
3826              * or return the start position, so it's of limited utility.
3827              * Nevertheless, I made the decision that the potential for
3828              * quick fail was still worth it - DAPM */
3829                 s = re_intuit_start(rx, sv, strbeg, s, strend, flags, NULL);
3830                 if (!s)
3831                     goto phooey;
3832             }
3833             if (regtry(reginfo, &s))
3834                 goto got_it;
3835         }
3836         goto phooey;
3837     } /* end anchored search */
3838 
3839     /* anchored \G match */
3840     if (prog->intflags & PREGf_ANCH_GPOS)
3841     {
3842         /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
3843         assert(prog->intflags & PREGf_GPOS_SEEN);
3844         /* For anchored \G, the only position it can match from is
3845          * (ganch-gofs); we already set startpos to this above; if intuit
3846          * moved us on from there, we can't possibly succeed */
3847         assert(startpos == HOPBACKc(reginfo->ganch, prog->gofs));
3848         if (s == startpos && regtry(reginfo, &s))
3849             goto got_it;
3850         goto phooey;
3851     }
3852 
3853     /* Messy cases:  unanchored match. */
3854 
3855     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
3856         /* we have /x+whatever/ */
3857         /* it must be a one character string (XXXX Except is_utf8_pat?) */
3858         char ch;
3859 #ifdef DEBUGGING
3860         int did_match = 0;
3861 #endif
3862         if (utf8_target) {
3863             if (! prog->anchored_utf8) {
3864                 to_utf8_substr(prog);
3865             }
3866             ch = SvPVX_const(prog->anchored_utf8)[0];
3867             REXEC_FBC_UTF8_SCAN(
3868                 if (*s == ch) {
3869                     DEBUG_EXECUTE_r( did_match = 1 );
3870                     if (regtry(reginfo, &s)) goto got_it;
3871                     s += UTF8_SAFE_SKIP(s, strend);
3872                     while (s < strend && *s == ch)
3873                         s += UTF8SKIP(s);
3874                 }
3875             );
3876 
3877         }
3878         else {
3879             if (! prog->anchored_substr) {
3880                 if (! to_byte_substr(prog)) {
3881                     NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3882                 }
3883             }
3884             ch = SvPVX_const(prog->anchored_substr)[0];
3885             REXEC_FBC_NON_UTF8_SCAN(
3886                 if (*s == ch) {
3887                     DEBUG_EXECUTE_r( did_match = 1 );
3888                     if (regtry(reginfo, &s)) goto got_it;
3889                     s++;
3890                     while (s < strend && *s == ch)
3891                         s++;
3892                 }
3893             );
3894         }
3895         DEBUG_EXECUTE_r(if (!did_match)
3896                 Perl_re_printf( aTHX_
3897                                   "Did not find anchored character...\n")
3898                );
3899     }
3900     else if (prog->anchored_substr != NULL
3901               || prog->anchored_utf8 != NULL
3902               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
3903                   && prog->float_max_offset < strend - s)) {
3904         SV *must;
3905         SSize_t back_max;
3906         SSize_t back_min;
3907         char *last;
3908         char *last1;		/* Last position checked before */
3909 #ifdef DEBUGGING
3910         int did_match = 0;
3911 #endif
3912         if (prog->anchored_substr || prog->anchored_utf8) {
3913             if (utf8_target) {
3914                 if (! prog->anchored_utf8) {
3915                     to_utf8_substr(prog);
3916                 }
3917                 must = prog->anchored_utf8;
3918             }
3919             else {
3920                 if (! prog->anchored_substr) {
3921                     if (! to_byte_substr(prog)) {
3922                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3923                     }
3924                 }
3925                 must = prog->anchored_substr;
3926             }
3927             back_max = back_min = prog->anchored_offset;
3928         } else {
3929             if (utf8_target) {
3930                 if (! prog->float_utf8) {
3931                     to_utf8_substr(prog);
3932                 }
3933                 must = prog->float_utf8;
3934             }
3935             else {
3936                 if (! prog->float_substr) {
3937                     if (! to_byte_substr(prog)) {
3938                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3939                     }
3940                 }
3941                 must = prog->float_substr;
3942             }
3943             back_max = prog->float_max_offset;
3944             back_min = prog->float_min_offset;
3945         }
3946 
3947         if (back_min<0) {
3948             last = strend;
3949         } else {
3950             last = HOP3c(strend,	/* Cannot start after this */
3951                   -(SSize_t)(CHR_SVLEN(must)
3952                          - (SvTAIL(must) != 0) + back_min), strbeg);
3953         }
3954         if (s > reginfo->strbeg)
3955             last1 = HOPc(s, -1);
3956         else
3957             last1 = s - 1;	/* bogus */
3958 
3959         /* XXXX check_substr already used to find "s", can optimize if
3960            check_substr==must. */
3961         dontbother = 0;
3962         strend = HOPc(strend, -dontbother);
3963         while ( (s <= last) &&
3964                 (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg,  strend),
3965                                   (unsigned char*)strend, must,
3966                                   multiline ? FBMrf_MULTILINE : 0)) ) {
3967             DEBUG_EXECUTE_r( did_match = 1 );
3968             if (HOPc(s, -back_max) > last1) {
3969                 last1 = HOPc(s, -back_min);
3970                 s = HOPc(s, -back_max);
3971             }
3972             else {
3973                 char * const t = (last1 >= reginfo->strbeg)
3974                                     ? HOPc(last1, 1) : last1 + 1;
3975 
3976                 last1 = HOPc(s, -back_min);
3977                 s = t;
3978             }
3979             if (utf8_target) {
3980                 while (s <= last1) {
3981                     if (regtry(reginfo, &s))
3982                         goto got_it;
3983                     if (s >= last1) {
3984                         s++; /* to break out of outer loop */
3985                         break;
3986                     }
3987                     s += UTF8SKIP(s);
3988                 }
3989             }
3990             else {
3991                 while (s <= last1) {
3992                     if (regtry(reginfo, &s))
3993                         goto got_it;
3994                     s++;
3995                 }
3996             }
3997         }
3998         DEBUG_EXECUTE_r(if (!did_match) {
3999             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
4000                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
4001             Perl_re_printf( aTHX_  "Did not find %s substr %s%s...\n",
4002                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
4003                                ? "anchored" : "floating"),
4004                 quoted, RE_SV_TAIL(must));
4005         });
4006         goto phooey;
4007     }
4008     else if ( (c = progi->regstclass) ) {
4009         if (minlen) {
4010             const OPCODE op = OP(progi->regstclass);
4011             /* don't bother with what can't match */
4012             if (PL_regkind[op] != EXACT && PL_regkind[op] != TRIE)
4013                 strend = HOPc(strend, -(minlen - 1));
4014         }
4015         DEBUG_EXECUTE_r({
4016             SV * const prop = sv_newmortal();
4017             regprop(prog, prop, c, reginfo, NULL);
4018             {
4019                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
4020                     s,strend-s,PL_dump_re_max_len);
4021                 Perl_re_printf( aTHX_
4022                     "Matching stclass %.*s against %s (%d bytes)\n",
4023                     (int)SvCUR(prop), SvPVX_const(prop),
4024                      quoted, (int)(strend - s));
4025             }
4026         });
4027         if (find_byclass(prog, c, s, strend, reginfo))
4028             goto got_it;
4029         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "Contradicts stclass... [regexec_flags]\n"));
4030     }
4031     else {
4032         dontbother = 0;
4033         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
4034             /* Trim the end. */
4035             char *last= NULL;
4036             SV* float_real;
4037             STRLEN len;
4038             const char *little;
4039 
4040             if (utf8_target) {
4041                 if (! prog->float_utf8) {
4042                     to_utf8_substr(prog);
4043                 }
4044                 float_real = prog->float_utf8;
4045             }
4046             else {
4047                 if (! prog->float_substr) {
4048                     if (! to_byte_substr(prog)) {
4049                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
4050                     }
4051                 }
4052                 float_real = prog->float_substr;
4053             }
4054 
4055             little = SvPV_const(float_real, len);
4056             if (SvTAIL(float_real)) {
4057                     /* This means that float_real contains an artificial \n on
4058                      * the end due to the presence of something like this:
4059                      * /foo$/ where we can match both "foo" and "foo\n" at the
4060                      * end of the string.  So we have to compare the end of the
4061                      * string first against the float_real without the \n and
4062                      * then against the full float_real with the string.  We
4063                      * have to watch out for cases where the string might be
4064                      * smaller than the float_real or the float_real without
4065                      * the \n. */
4066                     char *checkpos= strend - len;
4067                     DEBUG_OPTIMISE_r(
4068                         Perl_re_printf( aTHX_
4069                             "%sChecking for float_real.%s\n",
4070                             PL_colors[4], PL_colors[5]));
4071                     if (checkpos + 1 < strbeg) {
4072                         /* can't match, even if we remove the trailing \n
4073                          * string is too short to match */
4074                         DEBUG_EXECUTE_r(
4075                             Perl_re_printf( aTHX_
4076                                 "%sString shorter than required trailing substring, cannot match.%s\n",
4077                                 PL_colors[4], PL_colors[5]));
4078                         goto phooey;
4079                     } else if (memEQ(checkpos + 1, little, len - 1)) {
4080                         /* can match, the end of the string matches without the
4081                          * "\n" */
4082                         last = checkpos + 1;
4083                     } else if (checkpos < strbeg) {
4084                         /* cant match, string is too short when the "\n" is
4085                          * included */
4086                         DEBUG_EXECUTE_r(
4087                             Perl_re_printf( aTHX_
4088                                 "%sString does not contain required trailing substring, cannot match.%s\n",
4089                                 PL_colors[4], PL_colors[5]));
4090                         goto phooey;
4091                     } else if (!multiline) {
4092                         /* non multiline match, so compare with the "\n" at the
4093                          * end of the string */
4094                         if (memEQ(checkpos, little, len)) {
4095                             last= checkpos;
4096                         } else {
4097                             DEBUG_EXECUTE_r(
4098                                 Perl_re_printf( aTHX_
4099                                     "%sString does not contain required trailing substring, cannot match.%s\n",
4100                                     PL_colors[4], PL_colors[5]));
4101                             goto phooey;
4102                         }
4103                     } else {
4104                         /* multiline match, so we have to search for a place
4105                          * where the full string is located */
4106                         goto find_last;
4107                     }
4108             } else {
4109                   find_last:
4110                     if (len)
4111                         last = rninstr(s, strend, little, little + len);
4112                     else
4113                         last = strend;	/* matching "$" */
4114             }
4115             if (!last) {
4116                 /* at one point this block contained a comment which was
4117                  * probably incorrect, which said that this was a "should not
4118                  * happen" case.  Even if it was true when it was written I am
4119                  * pretty sure it is not anymore, so I have removed the comment
4120                  * and replaced it with this one. Yves */
4121                 DEBUG_EXECUTE_r(
4122                     Perl_re_printf( aTHX_
4123                         "%sString does not contain required substring, cannot match.%s\n",
4124                         PL_colors[4], PL_colors[5]
4125                     ));
4126                 goto phooey;
4127             }
4128             dontbother = strend - last + prog->float_min_offset;
4129         }
4130         if (minlen && (dontbother < minlen))
4131             dontbother = minlen - 1;
4132         strend -= dontbother; 		   /* this one's always in bytes! */
4133         /* We don't know much -- general case. */
4134         if (utf8_target) {
4135             for (;;) {
4136                 if (regtry(reginfo, &s))
4137                     goto got_it;
4138                 if (s >= strend)
4139                     break;
4140                 s += UTF8SKIP(s);
4141             };
4142         }
4143         else {
4144             do {
4145                 if (regtry(reginfo, &s))
4146                     goto got_it;
4147             } while (s++ < strend);
4148         }
4149     }
4150 
4151     /* Failure. */
4152     goto phooey;
4153 
4154   got_it:
4155     /* s/// doesn't like it if $& is earlier than where we asked it to
4156      * start searching (which can happen on something like /.\G/) */
4157     if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
4158             && (prog->offs[0].start < stringarg - strbeg))
4159     {
4160         /* this should only be possible under \G */
4161         assert(prog->intflags & PREGf_GPOS_SEEN);
4162         DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
4163             "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
4164         goto phooey;
4165     }
4166 
4167     /* clean up; this will trigger destructors that will free all slabs
4168      * above the current one, and cleanup the regmatch_info_aux
4169      * and regmatch_info_aux_eval sructs */
4170 
4171     LEAVE_SCOPE(oldsave);
4172 
4173     if (RXp_PAREN_NAMES(prog))
4174         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
4175 
4176     /* make sure $`, $&, $', and $digit will work later */
4177     if ( !(flags & REXEC_NOT_FIRST) )
4178         S_reg_set_capture_string(aTHX_ rx,
4179                                     strbeg, reginfo->strend,
4180                                     sv, flags, utf8_target);
4181 
4182     return 1;
4183 
4184   phooey:
4185     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch failed%s\n",
4186                           PL_colors[4], PL_colors[5]));
4187 
4188     if (swap) {
4189         /* we failed :-( roll it back.
4190          * Since the swap buffer will be freed on scope exit which follows
4191          * shortly, restore the old captures by copying 'swap's original
4192          * data to the new offs buffer
4193          */
4194         DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
4195             "rex=0x%" UVxf " rolling back offs: 0x%" UVxf " will be freed; restoring data to =0x%" UVxf "\n",
4196             0,
4197             PTR2UV(prog),
4198             PTR2UV(prog->offs),
4199             PTR2UV(swap)
4200         ));
4201 
4202         Copy(swap, prog->offs, prog->nparens + 1, regexp_paren_pair);
4203     }
4204 
4205     /* clean up; this will trigger destructors that will free all slabs
4206      * above the current one, and cleanup the regmatch_info_aux
4207      * and regmatch_info_aux_eval sructs */
4208 
4209     LEAVE_SCOPE(oldsave);
4210 
4211     return 0;
4212 }
4213 
4214 
4215 /* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
4216  * Do inc before dec, in case old and new rex are the same */
4217 #define SET_reg_curpm(Re2)                          \
4218     if (reginfo->info_aux_eval) {                   \
4219         (void)ReREFCNT_inc(Re2);		    \
4220         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));	    \
4221         PM_SETRE((PL_reg_curpm), (Re2));	    \
4222     }
4223 
4224 
4225 /*
4226  - regtry - try match at specific point
4227  */
4228 STATIC bool			/* 0 failure, 1 success */
4229 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
4230 {
4231     CHECKPOINT lastcp;
4232     REGEXP *const rx = reginfo->prog;
4233     regexp *const prog = ReANY(rx);
4234     SSize_t result;
4235 #ifdef DEBUGGING
4236     U32 depth = 0; /* used by REGCP_SET */
4237 #endif
4238     RXi_GET_DECL(prog,progi);
4239     DECLARE_AND_GET_RE_DEBUG_FLAGS;
4240 
4241     PERL_ARGS_ASSERT_REGTRY;
4242 
4243     reginfo->cutpoint=NULL;
4244 
4245     prog->offs[0].start = *startposp - reginfo->strbeg;
4246     prog->lastparen = 0;
4247     prog->lastcloseparen = 0;
4248 
4249     /* XXXX What this code is doing here?!!!  There should be no need
4250        to do this again and again, prog->lastparen should take care of
4251        this!  --ilya*/
4252 
4253     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
4254      * Actually, the code in regcppop() (which Ilya may be meaning by
4255      * prog->lastparen), is not needed at all by the test suite
4256      * (op/regexp, op/pat, op/split), but that code is needed otherwise
4257      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
4258      * Meanwhile, this code *is* needed for the
4259      * above-mentioned test suite tests to succeed.  The common theme
4260      * on those tests seems to be returning null fields from matches.
4261      * --jhi updated by dapm */
4262 
4263     /* After encountering a variant of the issue mentioned above I think
4264      * the point Ilya was making is that if we properly unwind whenever
4265      * we set lastparen to a smaller value then we should not need to do
4266      * this every time, only when needed. So if we have tests that fail if
4267      * we remove this, then it suggests somewhere else we are improperly
4268      * unwinding the lastparen/paren buffers. See UNWIND_PARENS() and
4269      * places it is called, and related regcp() routines. - Yves */
4270 #if 1
4271     if (prog->nparens) {
4272         regexp_paren_pair *pp = prog->offs;
4273         I32 i;
4274         for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
4275             ++pp;
4276             pp->start = -1;
4277             pp->end = -1;
4278         }
4279     }
4280 #endif
4281     REGCP_SET(lastcp);
4282     result = regmatch(reginfo, *startposp, progi->program + 1);
4283     if (result != -1) {
4284         prog->offs[0].end = result;
4285         return 1;
4286     }
4287     if (reginfo->cutpoint)
4288         *startposp= reginfo->cutpoint;
4289     REGCP_UNWIND(lastcp);
4290     return 0;
4291 }
4292 
4293 /* this is used to determine how far from the left messages like
4294    'failed...' are printed in regexec.c. It should be set such that
4295    messages are inline with the regop output that created them.
4296 */
4297 #define REPORT_CODE_OFF 29
4298 #define INDENT_CHARS(depth) ((int)(depth) % 20)
4299 #ifdef DEBUGGING
4300 int
4301 Perl_re_exec_indentf(pTHX_ const char *fmt, U32 depth, ...)
4302 {
4303     va_list ap;
4304     int result;
4305     PerlIO *f= Perl_debug_log;
4306     PERL_ARGS_ASSERT_RE_EXEC_INDENTF;
4307     va_start(ap, depth);
4308     PerlIO_printf(f, "%*s|%4" UVuf "| %*s", REPORT_CODE_OFF, "", (UV)depth, INDENT_CHARS(depth), "" );
4309     result = PerlIO_vprintf(f, fmt, ap);
4310     va_end(ap);
4311     return result;
4312 }
4313 #endif /* DEBUGGING */
4314 
4315 /* grab a new slab and return the first slot in it */
4316 
4317 STATIC regmatch_state *
4318 S_push_slab(pTHX)
4319 {
4320     regmatch_slab *s = PL_regmatch_slab->next;
4321     if (!s) {
4322         Newx(s, 1, regmatch_slab);
4323         s->prev = PL_regmatch_slab;
4324         s->next = NULL;
4325         PL_regmatch_slab->next = s;
4326     }
4327     PL_regmatch_slab = s;
4328     return SLAB_FIRST(s);
4329 }
4330 
4331 #ifdef DEBUGGING
4332 
4333 STATIC void
4334 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
4335     const char *start, const char *end, const char *blurb)
4336 {
4337     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
4338 
4339     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
4340 
4341     if (!PL_colorset)
4342             reginitcolors();
4343     {
4344         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
4345             RX_PRECOMP_const(prog), RX_PRELEN(prog), PL_dump_re_max_len);
4346 
4347         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
4348             start, end - start, PL_dump_re_max_len);
4349 
4350         Perl_re_printf( aTHX_
4351             "%s%s REx%s %s against %s\n",
4352                        PL_colors[4], blurb, PL_colors[5], s0, s1);
4353 
4354         if (utf8_target||utf8_pat)
4355             Perl_re_printf( aTHX_  "UTF-8 %s%s%s...\n",
4356                 utf8_pat ? "pattern" : "",
4357                 utf8_pat && utf8_target ? " and " : "",
4358                 utf8_target ? "string" : ""
4359             );
4360     }
4361 }
4362 
4363 STATIC void
4364 S_dump_exec_pos(pTHX_ const char *locinput,
4365                       const regnode *scan,
4366                       const char *loc_regeol,
4367                       const char *loc_bostr,
4368                       const char *loc_reg_starttry,
4369                       const bool utf8_target,
4370                       const U32 depth
4371                 )
4372 {
4373     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
4374     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
4375     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
4376     /* The part of the string before starttry has one color
4377        (pref0_len chars), between starttry and current
4378        position another one (pref_len - pref0_len chars),
4379        after the current position the third one.
4380        We assume that pref0_len <= pref_len, otherwise we
4381        decrease pref0_len.  */
4382     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
4383         ? (5 + taill) - l : locinput - loc_bostr;
4384     int pref0_len;
4385 
4386     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
4387 
4388     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
4389         pref_len++;
4390     pref0_len = pref_len  - (locinput - loc_reg_starttry);
4391     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
4392         l = ( loc_regeol - locinput > (5 + taill) - pref_len
4393               ? (5 + taill) - pref_len : loc_regeol - locinput);
4394     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
4395         l--;
4396     if (pref0_len < 0)
4397         pref0_len = 0;
4398     if (pref0_len > pref_len)
4399         pref0_len = pref_len;
4400     {
4401         const int is_uni = utf8_target ? 1 : 0;
4402 
4403         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
4404             (locinput - pref_len),pref0_len, PL_dump_re_max_len, 4, 5);
4405 
4406         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
4407                     (locinput - pref_len + pref0_len),
4408                     pref_len - pref0_len, PL_dump_re_max_len, 2, 3);
4409 
4410         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
4411                     locinput, loc_regeol - locinput, 10, 0, 1);
4412 
4413         const STRLEN tlen=len0+len1+len2;
4414         Perl_re_printf( aTHX_
4415                     "%4" IVdf " <%.*s%.*s%s%.*s>%*s|%4" UVuf "| ",
4416                     (IV)(locinput - loc_bostr),
4417                     len0, s0,
4418                     len1, s1,
4419                     (docolor ? "" : "> <"),
4420                     len2, s2,
4421                     (int)(tlen > 19 ? 0 :  19 - tlen),
4422                     "",
4423                     (UV)depth);
4424     }
4425 }
4426 
4427 #endif
4428 
4429 /* reg_check_named_buff_matched()
4430  * Checks to see if a named buffer has matched. The data array of
4431  * buffer numbers corresponding to the buffer is expected to reside
4432  * in the regexp->data->data array in the slot stored in the ARG() of
4433  * node involved. Note that this routine doesn't actually care about the
4434  * name, that information is not preserved from compilation to execution.
4435  * Returns the index of the leftmost defined buffer with the given name
4436  * or 0 if non of the buffers matched.
4437  */
4438 STATIC I32
4439 S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan)
4440 {
4441     I32 n;
4442     RXi_GET_DECL(rex,rexi);
4443     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4444     I32 *nums=(I32*)SvPVX(sv_dat);
4445 
4446     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
4447 
4448     for ( n=0; n<SvIVX(sv_dat); n++ ) {
4449         if ((I32)rex->lastparen >= nums[n] &&
4450             rex->offs[nums[n]].end != -1)
4451         {
4452             return nums[n];
4453         }
4454     }
4455     return 0;
4456 }
4457 
4458 static bool
4459 S_setup_EXACTISH_ST(pTHX_ const regnode * const text_node,
4460                           struct next_matchable_info * m,
4461                           regmatch_info *reginfo)
4462 {
4463     /* This function determines various characteristics about every possible
4464      * initial match of the passed-in EXACTish <text_node>, and stores them in
4465      * <*m>.
4466      *
4467      * That includes a match string and a parallel mask, such that if you AND
4468      * the target string with the mask and compare with the match string,
4469      * you'll have a pretty good idea, perhaps even perfect, if that portion of
4470      * the target matches or not.
4471      *
4472      * The motivation behind this function is to allow the caller to set up
4473      * tight loops for matching.  Consider patterns like '.*B' or '.*?B' where
4474      * B is an arbitrary EXACTish node.  To find the end of .*, we look for the
4475      * beginning oF B, which is the passed in <text_node>  That's where this
4476      * function comes in.  The values it returns can quickly be used to rule
4477      * out many, or all, cases of possible matches not actually being the
4478      * beginning of B, <text_node>.  It is also used in regrepeat() where we
4479      * have 'A*', for arbitrary 'A'.  This sets up criteria to more efficiently
4480      * determine where the span of 'A's stop.
4481      *
4482      * If <text_node> is of type EXACT, there is only one possible character
4483      * that can match its first character, and so the situation is quite
4484      * simple.  But things can get much more complicated if folding is
4485      * involved.  It may be that the first character of an EXACTFish node
4486      * doesn't participate in any possible fold, e.g., punctuation, so it can
4487      * be matched only by itself.  The vast majority of characters that are in
4488      * folds match just two things, their lower and upper-case equivalents.
4489      * But not all are like that; some have multiple possible matches, or match
4490      * sequences of more than one character.  This function sorts all that out.
4491      *
4492      * It returns information about all possibilities of what the first
4493      * character(s) of <text_node> could look like.  Again, if <text_node> is a
4494      * plain EXACT node, that's just the actual first bytes of the first
4495      * character; but otherwise it is the bytes, that when masked, match all
4496      * possible combinations of all the initial bytes of all the characters
4497      * that could match, folded.  (Actually, this is a slight over promise.  It
4498      * handles only up to the initial 5 bytes, which is enough for all Unicode
4499      * characters, but not for all non-Unicode ones.)
4500      *
4501      * Here's an example to clarify.  Suppose the first character of
4502      * <text_node> is the letter 'C', and we are under /i matching.  That means
4503      * 'c' also matches.  The representations of these two characters differ in
4504      * just one bit, so the mask would be a zero in that position and ones in
4505      * the other 7.  And the returned string would be the AND of these two
4506      * characters, and would be one byte long, since these characters are each
4507      * a single byte.  ANDing the target <text_node> with this mask will yield
4508      * the returned string if and only if <text_node> begins with one of these
4509      * two characters.  So, the function would also return that the definitive
4510      * length matched is 1 byte.
4511      *
4512      * Now, suppose instead of the letter 'C',  <text_node> begins with the
4513      * letter 'F'.  The situation is much more complicated because there are
4514      * various ligatures such as LATIN SMALL LIGATURE FF, whose fold also
4515      * begins with 'f', and hence could match.  We add these into the returned
4516      * string and mask, but the result isn't definitive; the caller has to
4517      * check further if its AND and compare pass.  But the failure of that
4518      * compare will quickly rule out most possible inputs.
4519      *
4520      * Much of this could be done in regcomp.c at compile time, except for
4521      * locale-dependent, and UTF-8 target dependent data.  Extra data fields
4522      * could be used for one or the other eventualities.
4523      *
4524      * If this function determines that no possible character in the target
4525      * string can match, it returns FALSE; otherwise TRUE.  (The FALSE
4526      * situation occurs if the first character in <text_node> requires UTF-8 to
4527      * represent, and the target string isn't in UTF-8.)
4528      *
4529      * Some analysis is in GH #18414, located at the time of this writing at:
4530      * https://github.com/Perl/perl5/issues/18414
4531      */
4532 
4533     const bool utf8_target = reginfo->is_utf8_target;
4534     bool utf8_pat = reginfo->is_utf8_pat;
4535 
4536     PERL_UINT_FAST8_T i;
4537 
4538     /* Here and below, '15' is the value of UTF8_MAXBYTES_CASE, which requires at least :e
4539      */
4540     U8 matches[MAX_MATCHES][UTF8_MAXBYTES_CASE + 1] = { { 0 } };
4541     U8 lengths[MAX_MATCHES] = { 0 };
4542 
4543     U8 index_of_longest = 0;
4544 
4545     U8 *pat = (U8*)STRING(text_node);
4546     Size_t pat_len = STR_LEN(text_node);
4547     U8 op = OP(text_node);
4548 
4549     U8 byte_mask[5]  = {0};
4550     U8 byte_anded[5] = {0};
4551 
4552     /* There are some folds in Unicode to multiple characters.  This will hold
4553      * such characters that could fold to the beginning of 'text_node' */
4554     UV multi_fold_from = 0;
4555 
4556     /* We may have to create a modified copy of the pattern */
4557     U8 mod_pat[UTF8_MAXBYTES_CASE + 1] = { '\0' };
4558 
4559     m->max_length = 0;
4560     m->min_length = 255;
4561     m->count = 0;
4562 
4563     /* Even if the first character in the node can match something in Latin1,
4564      * if there is anything in the node that can't, the match must fail */
4565     if (! utf8_target && isEXACT_REQ8(op)) {
4566         return FALSE;
4567     }
4568 
4569 /* Define a temporary op for use in this function, using an existing one that
4570  * should never be a real op during execution */
4571 #define TURKISH  PSEUDO
4572 
4573     /* What to do about these two nodes had to be deferred to runtime (which is
4574      * now).  If the extra information we now have so indicates, turn them into
4575      * EXACTFU nodes */
4576     if (   (op == EXACTF && utf8_target)
4577         || (op == EXACTFL && IN_UTF8_CTYPE_LOCALE))
4578     {
4579         if (op == EXACTFL && PL_in_utf8_turkic_locale) {
4580             op = TURKISH;
4581         }
4582         else {
4583             op = EXACTFU;
4584         }
4585 
4586         /* And certain situations are better handled if we create a modified
4587          * version of the pattern */
4588         if (utf8_pat) { /* Here, must have been EXACTFL, so look at the
4589                            specific problematic characters */
4590             if (is_PROBLEMATIC_LOCALE_FOLD_utf8(pat)) {
4591 
4592                 /* The node could start with characters that are the first ones
4593                  * of a multi-character fold. */
4594                 multi_fold_from
4595                           = what_MULTI_CHAR_FOLD_utf8_safe(pat, pat + pat_len);
4596                 if (multi_fold_from) {
4597 
4598                     /* Here, they do form a sequence that matches the fold of a
4599                      * single character.  That single character then is a
4600                      * possible match.  Below we will look again at this, but
4601                      * the code below is expecting every character in the
4602                      * pattern to be folded, which the input isn't required to
4603                      * be in this case.  So, just fold the single character,
4604                      * and the result will be in the expected form. */
4605                     _to_uni_fold_flags(multi_fold_from, mod_pat, &pat_len,
4606                                        FOLD_FLAGS_FULL);
4607                     pat = mod_pat;
4608                 }
4609                          /* Turkish has a couple extra possibilities. */
4610                 else if (   UNLIKELY(op == TURKISH)
4611                          &&  pat_len >= 3
4612                          &&  isALPHA_FOLD_EQ(pat[0], 'f')
4613                          && (   memBEGINs(pat + 1, pat_len - 1,
4614                                     LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8)
4615                              || (   pat_len >= 4
4616                                  && isALPHA_FOLD_EQ(pat[1], 'f')
4617                                  && memBEGINs(pat + 2, pat_len - 2,
4618                                     LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8)
4619                 ))) {
4620                     /* The macros for finding a multi-char fold don't include
4621                      * the Turkish possibilities, in which U+130 folds to 'i'.
4622                      * Hard-code these.  It's very unlikely that Unicode will
4623                      * ever add any others.  */
4624                     if (pat[1] == 'f') {
4625                         pat_len = 3;
4626                         Copy("ffi", mod_pat, pat_len, U8);
4627                     }
4628                     else {
4629                         pat_len = 2;
4630                         Copy("fi", mod_pat, pat_len, U8);
4631                     }
4632                     pat = mod_pat;
4633                 }
4634                 else if (    UTF8_IS_DOWNGRADEABLE_START(*pat)
4635                          &&  LIKELY(memNEs(pat, pat_len, MICRO_SIGN_UTF8))
4636                          &&  LIKELY(memNEs(pat, pat_len,
4637                                            LATIN_SMALL_LETTER_SHARP_S_UTF8))
4638                          && (LIKELY(op != TURKISH || *pat != 'I')))
4639                 {
4640                     /* For all cases of things between 0-255, except the ones
4641                      * in the conditional above, the fold is just the lower
4642                      * case, which is faster than the more general case. */
4643                     mod_pat[0] = toLOWER_L1(EIGHT_BIT_UTF8_TO_NATIVE(pat[0],
4644                                                                      pat[1]));
4645                     pat_len = 1;
4646                     pat = mod_pat;
4647                     utf8_pat = FALSE;
4648                 }
4649                 else {  /* Code point above 255, or needs special handling */
4650                     _to_utf8_fold_flags(pat, pat + pat_len,
4651                                         mod_pat, &pat_len,
4652                                         FOLD_FLAGS_FULL|FOLD_FLAGS_LOCALE);
4653                     pat = mod_pat;
4654                 }
4655             }
4656         }
4657         else if /* Below is not a UTF-8 pattern; there's a somewhat different
4658                    set of problematic characters */
4659                 ((multi_fold_from
4660                           = what_MULTI_CHAR_FOLD_latin1_safe(pat, pat + pat_len)))
4661         {
4662             /* We may have to canonicalize a multi-char fold, as in the UTF-8
4663              * case */
4664             _to_uni_fold_flags(multi_fold_from, mod_pat, &pat_len,
4665                                FOLD_FLAGS_FULL);
4666             pat = mod_pat;
4667         }
4668         else if (UNLIKELY(*pat == LATIN_SMALL_LETTER_SHARP_S)) {
4669             mod_pat[0] = mod_pat[1] = 's';
4670             pat_len = 2;
4671             utf8_pat = utf8_target; /* UTF-8ness immaterial for invariant
4672                                        chars, and speeds copying */
4673             pat = mod_pat;
4674         }
4675         else if (LIKELY(op != TURKISH || *pat != 'I')) {
4676             mod_pat[0] = toLOWER_L1(*pat);
4677             pat_len = 1;
4678             pat = mod_pat;
4679         }
4680     }
4681     else if /* Below isn't a node that we convert to UTF-8 */
4682             (     utf8_target
4683              && ! utf8_pat
4684              &&   op == EXACTFAA_NO_TRIE
4685              &&  *pat == LATIN_SMALL_LETTER_SHARP_S)
4686     {
4687         /* A very special case.  Folding U+DF goes to U+17F under /iaa.  We
4688          * did this at compile time when the pattern was UTF-8 , but otherwise
4689          * we couldn't do it earlier, because it requires a UTF-8 target for
4690          * this match to be legal. */
4691         pat_len = 2 * (sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 1);
4692         Copy(LATIN_SMALL_LETTER_LONG_S_UTF8
4693              LATIN_SMALL_LETTER_LONG_S_UTF8, mod_pat, pat_len, U8);
4694         pat = mod_pat;
4695         utf8_pat = TRUE;
4696     }
4697 
4698     /* Here, we have taken care of the initial work for a few very problematic
4699      * situations, possibly creating a modified pattern.
4700      *
4701      * Now ready for the general case.  We build up all the possible things
4702      * that could match the first character of the pattern into the elements of
4703      * 'matches[]'
4704      *
4705      * Everything generally matches at least itself.  But if there is a
4706      * UTF8ness mismatch, we have to convert to that of the target string. */
4707     if (UTF8_IS_INVARIANT(*pat)) {  /* Immaterial if either is in UTF-8 */
4708         matches[0][0] = pat[0];
4709         lengths[0] = 1;
4710         m->count++;
4711     }
4712     else if (utf8_target) {
4713         if (utf8_pat) {
4714             lengths[0] = UTF8SKIP(pat);
4715             Copy(pat, matches[0], lengths[0], U8);
4716             m->count++;
4717         }
4718         else {  /* target is UTF-8, pattern isn't */
4719             matches[0][0] = UTF8_EIGHT_BIT_HI(pat[0]);
4720             matches[0][1] = UTF8_EIGHT_BIT_LO(pat[0]);
4721             lengths[0] = 2;
4722             m->count++;
4723         }
4724     }
4725     else if (! utf8_pat) {  /* Neither is UTF-8 */
4726         matches[0][0] = pat[0];
4727         lengths[0] = 1;
4728         m->count++;
4729     }
4730     else     /* target isn't UTF-8; pattern is.  No match possible unless the
4731                 pattern's first character can fit in a byte */
4732          if (UTF8_IS_DOWNGRADEABLE_START(*pat))
4733     {
4734         matches[0][0] = EIGHT_BIT_UTF8_TO_NATIVE(pat[0], pat[1]);
4735         lengths[0] = 1;
4736         m->count++;
4737     }
4738 
4739     /* Here we have taken care of any necessary node-type changes */
4740 
4741     if (m->count) {
4742         m->max_length = lengths[0];
4743         m->min_length = lengths[0];
4744     }
4745 
4746     /* For non-folding nodes, there are no other possible candidate matches,
4747      * but for foldable ones, we have to look further. */
4748     if (UNLIKELY(op == TURKISH) || isEXACTFish(op)) { /* A folding node */
4749         UV folded;  /* The first character in the pattern, folded */
4750         U32 first_fold_from;    /* A character that folds to it */
4751         const U32 * remaining_fold_froms;   /* The remaining characters that
4752                                                fold to it, if any */
4753         Size_t folds_to_count;  /* The total number of characters that fold to
4754                                    'folded' */
4755 
4756         /* If the node begins with a sequence of more than one character that
4757          * together form the fold of a single character, it is called a
4758          * 'multi-character fold', and the normal functions don't handle this
4759          * case.  We set 'multi_fold_from' to the single folded-from character,
4760          * which is handled in an extra iteration below */
4761         if (utf8_pat) {
4762             folded = valid_utf8_to_uvchr(pat, NULL);
4763             multi_fold_from
4764                           = what_MULTI_CHAR_FOLD_utf8_safe(pat, pat + pat_len);
4765         }
4766         else {
4767             folded = *pat;
4768 
4769             /* This may generate illegal combinations for things like EXACTF,
4770              * but rather than repeat the logic and exclude them here, all such
4771              * illegalities are checked for and skipped below in the loop */
4772             multi_fold_from
4773                         = what_MULTI_CHAR_FOLD_latin1_safe(pat, pat + pat_len);
4774         }
4775 
4776         /* Everything matches at least itself; initialize to that because the
4777          * only the branches below that set it are the ones where the number
4778          * isn't 1. */
4779         folds_to_count = 1;
4780 
4781         /* There are a few special cases for locale-dependent nodes, where the
4782          * run-time context was needed before we could know what matched */
4783         if (UNLIKELY(op == EXACTFL) && folded < 256)  {
4784             first_fold_from = PL_fold_locale[folded];
4785         }
4786         else if (   op == EXACTFL && utf8_target && utf8_pat
4787                  && memBEGINs(pat, pat_len, LATIN_SMALL_LETTER_LONG_S_UTF8
4788                                             LATIN_SMALL_LETTER_LONG_S_UTF8))
4789         {
4790             first_fold_from = LATIN_CAPITAL_LETTER_SHARP_S;
4791         }
4792         else if (UNLIKELY(    op == TURKISH
4793                           && (   isALPHA_FOLD_EQ(folded, 'i')
4794                               || inRANGE(folded,
4795                                          LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE,
4796                                          LATIN_SMALL_LETTER_DOTLESS_I))))
4797         {   /* Turkish folding requires special handling */
4798             if (folded == 'i')
4799                 first_fold_from = LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE;
4800             else if (folded == 'I')
4801                 first_fold_from = LATIN_SMALL_LETTER_DOTLESS_I;
4802             else if (folded == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)
4803                 first_fold_from = 'i';
4804             else first_fold_from = 'I';
4805         }
4806         else {
4807             /* Here, isn't a special case: use the generic function to
4808              * calculate what folds to this */
4809           redo_multi:
4810             /* Look up what code points (besides itself) fold to 'folded';
4811              * e.g., [ 'K', KELVIN_SIGN ] both fold to 'k'. */
4812             folds_to_count = _inverse_folds(folded, &first_fold_from,
4813                                                        &remaining_fold_froms);
4814         }
4815 
4816         /* Add each character that folds to 'folded' to the list of them,
4817          * subject to limitations based on the node type and target UTF8ness.
4818          * If there was a character that folded to multiple characters, do an
4819          * extra iteration for it.  (Note the extra iteration if there is a
4820          * multi-character fold) */
4821         for (i = 0; i < folds_to_count
4822                       + UNLIKELY(multi_fold_from != 0); i++)
4823         {
4824             UV fold_from = 0;
4825 
4826             if (i >= folds_to_count) {  /* Final iteration: handle the
4827                                            multi-char */
4828                 fold_from = multi_fold_from;
4829             }
4830             else if (i == 0) {
4831                 fold_from = first_fold_from;
4832             }
4833             else if (i < folds_to_count) {
4834                 fold_from = remaining_fold_froms[i-1];
4835             }
4836 
4837             if (folded == fold_from) {  /* We already added the character
4838                                            itself */
4839                 continue;
4840             }
4841 
4842             /* EXACTF doesn't have any non-ascii folds */
4843             if (op == EXACTF && (! isASCII(folded) || ! isASCII(fold_from))) {
4844                 continue;
4845             }
4846 
4847             /* In /iaa nodes, neither or both must be ASCII to be a legal fold
4848              * */
4849             if (    isASCII(folded) != isASCII(fold_from)
4850                 &&  inRANGE(op, EXACTFAA, EXACTFAA_NO_TRIE))
4851 
4852             {
4853                 continue;
4854             }
4855 
4856             /* In /il nodes, can't cross 255/256 boundary (unless in a UTF-8
4857              * locale, but those have been converted to EXACTFU above) */
4858             if (   op == EXACTFL
4859                 && (folded < 256) != (fold_from < 256))
4860             {
4861                 continue;
4862             }
4863 
4864             /* If this triggers, it likely is because of the unlikely case
4865              * where a new Unicode standard has changed what MAX_MATCHES should
4866              * be set to */
4867             assert(m->count < MAX_MATCHES);
4868 
4869             /* Add this character to the list of possible matches */
4870             if (utf8_target) {
4871                 uvchr_to_utf8(matches[m->count], fold_from);
4872                 lengths[m->count] = UVCHR_SKIP(fold_from);
4873                 m->count++;
4874             }
4875             else { /* Non-UTF8 target: no code point above 255 can appear in it
4876                     */
4877                 if (fold_from > 255) {
4878                     continue;
4879                 }
4880 
4881                 matches[m->count][0] = fold_from;
4882                 lengths[m->count] = 1;
4883                 m->count++;
4884             }
4885 
4886             /* Update min and mlengths */
4887             if (m->min_length > lengths[m->count-1]) {
4888                 m->min_length = lengths[m->count-1];
4889             }
4890 
4891             if (m->max_length < lengths[m->count-1]) {
4892                 index_of_longest = m->count - 1;
4893                 m->max_length = lengths[index_of_longest];
4894             }
4895         } /* looped through each potential fold */
4896 
4897         /* If there is something that folded to an initial multi-character
4898          * fold, repeat, using it.  This catches some edge cases.  An example
4899          * of one is /ss/i when UTF-8 encoded.  The function
4900          * what_MULTI_CHAR_FOLD_utf8_safe('ss') gets called and returns U+DF
4901          * (LATIN SMALL SHARP S).  If it returned a list of characters, this
4902          * code wouldn't be needed.  But since it doesn't, we have to look what
4903          * folds to the U+DF.  In this case, U+1E9E does, and has to be added.
4904          * */
4905         if (multi_fold_from) {
4906             folded = multi_fold_from;
4907             multi_fold_from = 0;
4908             goto redo_multi;
4909         }
4910     } /* End of finding things that participate in this fold */
4911 
4912     if (m->count == 0) {    /* If nothing found, can't match */
4913         m->min_length = 0;
4914         return FALSE;
4915     }
4916 
4917     /* Have calculated all possible matches.  Now calculate the mask and AND
4918      * values */
4919     m->initial_exact = 0;
4920     m->initial_definitive = 0;
4921 
4922     {
4923         unsigned int mask_ones = 0;
4924         unsigned int possible_ones = 0;
4925         U8 j;
4926 
4927         /* For each byte that is in all possible matches ... */
4928         for (j = 0; j < MIN(m->min_length, 5); j++) {
4929 
4930             /* Initialize the accumulator for this byte */
4931             byte_mask[j] = 0xFF;
4932             byte_anded[j] = matches[0][j];
4933 
4934             /* Then the rest of the rows (folds).  The mask is based on, like,
4935              * ~('A' ^ 'a') is a 1 in all bits where these are the same, and 0
4936              * where they differ. */
4937             for (i = 1; i < (PERL_UINT_FAST8_T) m->count; i++) {
4938                 byte_mask[j]  &= ~ (byte_anded[j] ^ matches[i][j]);
4939                 byte_anded[j] &= matches[i][j];
4940             }
4941 
4942             /* Keep track of the number of initial mask bytes that are all one
4943              * bits.  The code calling this can use this number to know that
4944              * a string that matches this number of bytes in the pattern is an
4945              * exact match of that pattern for this number of bytes.  But also
4946              * counted are the number of initial bytes that in total have a
4947              * single zero bit.  If a string matches those, masked, it must be
4948              * one of two possibilites, both of which this function has
4949              * determined are legal.  (But if that single 0 is one of the
4950              * initial bits for masking a UTF-8 start byte, that could
4951              * incorrectly lead to different length strings appearing to be
4952              * equivalent, so only do this optimization when the matchables are
4953              * all the same length.  This was uncovered by testing
4954              * /\x{029E}/i.) */
4955             if (m->min_length == m->max_length) {
4956                 mask_ones += PL_bitcount[byte_mask[j]];
4957                 possible_ones += 8;
4958                 if (mask_ones + 1 >= possible_ones) {
4959                     m->initial_definitive++;
4960                     if (mask_ones >= possible_ones) {
4961                         m->initial_exact++;
4962                     }
4963                 }
4964             }
4965         }
4966     }
4967 
4968     /* The first byte is separate for speed */
4969     m->first_byte_mask = byte_mask[0];
4970     m->first_byte_anded = byte_anded[0];
4971 
4972     /* Then pack up to the next 4 bytes into a word */
4973     m->mask32 = m->anded32 = 0;
4974     for (i = 1; i < MIN(m->min_length, 5); i++) {
4975         U8 which = i;
4976         U8 shift = (which - 1) * 8;
4977         m->mask32  |= (U32) byte_mask[i]  << shift;
4978         m->anded32 |= (U32) byte_anded[i] << shift;
4979     }
4980 
4981     /* Finally, take the match strings and place them sequentially into a
4982      * one-dimensional array.  (This is done to save significant space in the
4983      * structure.) Sort so the longest (presumably the least likely) is last.
4984      * XXX When this gets moved to regcomp, may want to fully sort shortest
4985      * first, but above we generally used the folded code point first, and
4986      * those tend to be no longer than their upper case values, so this is
4987      * already pretty well sorted by size.
4988      *
4989      * If the asserts fail, it's most likely because a new version of the
4990      * Unicode standard requires more space; simply increase the declaration
4991      * size. */
4992     {
4993         U8 cur_pos = 0;
4994         U8 output_index = 0;
4995 
4996         if (m->count > 1) { /* No need to sort a single entry */
4997             for (i = 0; i < (PERL_UINT_FAST8_T) m->count; i++) {
4998 
4999                 /* Keep the same order for all but the longest.  (If the
5000                  * asserts fail, it could be because m->matches is declared too
5001                  * short, either because of a new Unicode release, or an
5002                  * overlooked test case, or it could be a bug.) */
5003                 if (i != index_of_longest) {
5004                     assert(cur_pos + lengths[i] <= C_ARRAY_LENGTH(m->matches));
5005                     Copy(matches[i], m->matches + cur_pos, lengths[i], U8);
5006                     cur_pos += lengths[i];
5007                     m->lengths[output_index++] = lengths[i];
5008                 }
5009             }
5010         }
5011 
5012         assert(cur_pos + lengths[index_of_longest] <= C_ARRAY_LENGTH(m->matches));
5013         Copy(matches[index_of_longest], m->matches + cur_pos,
5014              lengths[index_of_longest], U8);
5015 
5016         /* Place the longest match last */
5017         m->lengths[output_index] = lengths[index_of_longest];
5018     }
5019 
5020 
5021     return TRUE;
5022 }
5023 
5024 PERL_STATIC_FORCE_INLINE    /* We want speed at the expense of size */
5025 bool
5026 S_test_EXACTISH_ST(const char * loc,
5027                    struct next_matchable_info info)
5028 {
5029     /* This function uses the data set up in setup_EXACTISH_ST() to see if the
5030      * bytes starting at 'loc' can match based on 'next_matchable_info' */
5031 
5032     U32 input32 = 0;
5033 
5034     /* Check the first byte */
5035     if (((U8) loc[0] & info.first_byte_mask) != info.first_byte_anded)
5036         return FALSE;
5037 
5038     /* Pack the next up-to-4 bytes into a 32 bit word */
5039     switch (info.min_length) {
5040         default:
5041             input32 |= (U32) ((U8) loc[4]) << 3 * 8;
5042             /* FALLTHROUGH */
5043         case 4:
5044             input32 |= (U8) loc[3] << 2 * 8;
5045             /* FALLTHROUGH */
5046         case 3:
5047             input32 |= (U8) loc[2] << 1 * 8;
5048             /* FALLTHROUGH */
5049         case 2:
5050             input32 |= (U8) loc[1];
5051             break;
5052         case 1:
5053             return TRUE;    /* We already tested and passed the 0th byte */
5054         case 0:
5055             ASSUME(0);
5056     }
5057 
5058     /* And AND that with the mask and compare that with the assembled ANDED
5059      * values */
5060     return (input32 & info.mask32) == info.anded32;
5061 }
5062 
5063 STATIC bool
5064 S_isGCB(pTHX_ const GCB_enum before, const GCB_enum after, const U8 * const strbeg, const U8 * const curpos, const bool utf8_target)
5065 {
5066     /* returns a boolean indicating if there is a Grapheme Cluster Boundary
5067      * between the inputs.  See https://www.unicode.org/reports/tr29/. */
5068 
5069     PERL_ARGS_ASSERT_ISGCB;
5070 
5071     switch (GCB_table[before][after]) {
5072         case GCB_BREAKABLE:
5073             return TRUE;
5074 
5075         case GCB_NOBREAK:
5076             return FALSE;
5077 
5078         case GCB_RI_then_RI:
5079             {
5080                 int RI_count = 1;
5081                 U8 * temp_pos = (U8 *) curpos;
5082 
5083                 /* Do not break within emoji flag sequences. That is, do not
5084                  * break between regional indicator (RI) symbols if there is an
5085                  * odd number of RI characters before the break point.
5086                  *  GB12   sot (RI RI)* RI × RI
5087                  *  GB13 [^RI] (RI RI)* RI × RI */
5088 
5089                 while (backup_one_GCB(strbeg,
5090                                     &temp_pos,
5091                                     utf8_target) == GCB_Regional_Indicator)
5092                 {
5093                     RI_count++;
5094                 }
5095 
5096                 return RI_count % 2 != 1;
5097             }
5098 
5099         case GCB_EX_then_EM:
5100 
5101             /* GB10  ( E_Base | E_Base_GAZ ) Extend* ×  E_Modifier */
5102             {
5103                 U8 * temp_pos = (U8 *) curpos;
5104                 GCB_enum prev;
5105 
5106                 do {
5107                     prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
5108                 }
5109                 while (prev == GCB_Extend);
5110 
5111                 return prev != GCB_E_Base && prev != GCB_E_Base_GAZ;
5112             }
5113 
5114         case GCB_Maybe_Emoji_NonBreak:
5115 
5116             {
5117 
5118             /* Do not break within emoji modifier sequences or emoji zwj sequences.
5119               GB11 \p{Extended_Pictographic} Extend* ZWJ × \p{Extended_Pictographic}
5120               */
5121                 U8 * temp_pos = (U8 *) curpos;
5122                 GCB_enum prev;
5123 
5124                 do {
5125                     prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
5126                 }
5127                 while (prev == GCB_Extend);
5128 
5129                 return prev != GCB_ExtPict_XX;
5130             }
5131 
5132         default:
5133             break;
5134     }
5135 
5136 #ifdef DEBUGGING
5137     Perl_re_printf( aTHX_  "Unhandled GCB pair: GCB_table[%d, %d] = %d\n",
5138                                   before, after, GCB_table[before][after]);
5139     assert(0);
5140 #endif
5141     return TRUE;
5142 }
5143 
5144 STATIC GCB_enum
5145 S_backup_one_GCB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5146 {
5147     GCB_enum gcb;
5148 
5149     PERL_ARGS_ASSERT_BACKUP_ONE_GCB;
5150 
5151     if (*curpos < strbeg) {
5152         return GCB_EDGE;
5153     }
5154 
5155     if (utf8_target) {
5156         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5157         U8 * prev_prev_char_pos;
5158 
5159         if (! prev_char_pos) {
5160             return GCB_EDGE;
5161         }
5162 
5163         if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
5164             gcb = getGCB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5165             *curpos = prev_char_pos;
5166             prev_char_pos = prev_prev_char_pos;
5167         }
5168         else {
5169             *curpos = (U8 *) strbeg;
5170             return GCB_EDGE;
5171         }
5172     }
5173     else {
5174         if (*curpos - 2 < strbeg) {
5175             *curpos = (U8 *) strbeg;
5176             return GCB_EDGE;
5177         }
5178         (*curpos)--;
5179         gcb = getGCB_VAL_CP(*(*curpos - 1));
5180     }
5181 
5182     return gcb;
5183 }
5184 
5185 /* Combining marks attach to most classes that precede them, but this defines
5186  * the exceptions (from TR14) */
5187 #define LB_CM_ATTACHES_TO(prev) ( ! (   prev == LB_EDGE                 \
5188                                      || prev == LB_Mandatory_Break      \
5189                                      || prev == LB_Carriage_Return      \
5190                                      || prev == LB_Line_Feed            \
5191                                      || prev == LB_Next_Line            \
5192                                      || prev == LB_Space                \
5193                                      || prev == LB_ZWSpace))
5194 
5195 STATIC bool
5196 S_isLB(pTHX_ LB_enum before,
5197              LB_enum after,
5198              const U8 * const strbeg,
5199              const U8 * const curpos,
5200              const U8 * const strend,
5201              const bool utf8_target)
5202 {
5203     U8 * temp_pos = (U8 *) curpos;
5204     LB_enum prev = before;
5205 
5206     /* Is the boundary between 'before' and 'after' line-breakable?
5207      * Most of this is just a table lookup of a generated table from Unicode
5208      * rules.  But some rules require context to decide, and so have to be
5209      * implemented in code */
5210 
5211     PERL_ARGS_ASSERT_ISLB;
5212 
5213     /* Rule numbers in the comments below are as of Unicode 9.0 */
5214 
5215   redo:
5216     before = prev;
5217     switch (LB_table[before][after]) {
5218         case LB_BREAKABLE:
5219             return TRUE;
5220 
5221         case LB_NOBREAK:
5222         case LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
5223             return FALSE;
5224 
5225         case LB_SP_foo + LB_BREAKABLE:
5226         case LB_SP_foo + LB_NOBREAK:
5227         case LB_SP_foo + LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
5228 
5229             /* When we have something following a SP, we have to look at the
5230              * context in order to know what to do.
5231              *
5232              * SP SP should not reach here because LB7: Do not break before
5233              * spaces.  (For two spaces in a row there is nothing that
5234              * overrides that) */
5235             assert(after != LB_Space);
5236 
5237             /* Here we have a space followed by a non-space.  Mostly this is a
5238              * case of LB18: "Break after spaces".  But there are complications
5239              * as the handling of spaces is somewhat tricky.  They are in a
5240              * number of rules, which have to be applied in priority order, but
5241              * something earlier in the string can cause a rule to be skipped
5242              * and a lower priority rule invoked.  A prime example is LB7 which
5243              * says don't break before a space.  But rule LB8 (lower priority)
5244              * says that the first break opportunity after a ZW is after any
5245              * span of spaces immediately after it.  If a ZW comes before a SP
5246              * in the input, rule LB8 applies, and not LB7.  Other such rules
5247              * involve combining marks which are rules 9 and 10, but they may
5248              * override higher priority rules if they come earlier in the
5249              * string.  Since we're doing random access into the middle of the
5250              * string, we have to look for rules that should get applied based
5251              * on both string position and priority.  Combining marks do not
5252              * attach to either ZW nor SP, so we don't have to consider them
5253              * until later.
5254              *
5255              * To check for LB8, we have to find the first non-space character
5256              * before this span of spaces */
5257             do {
5258                 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
5259             }
5260             while (prev == LB_Space);
5261 
5262             /* LB8 Break before any character following a zero-width space,
5263              * even if one or more spaces intervene.
5264              *      ZW SP* ÷
5265              * So if we have a ZW just before this span, and to get here this
5266              * is the final space in the span. */
5267             if (prev == LB_ZWSpace) {
5268                 return TRUE;
5269             }
5270 
5271             /* Here, not ZW SP+.  There are several rules that have higher
5272              * priority than LB18 and can be resolved now, as they don't depend
5273              * on anything earlier in the string (except ZW, which we have
5274              * already handled).  One of these rules is LB11 Do not break
5275              * before Word joiner, but we have specially encoded that in the
5276              * lookup table so it is caught by the single test below which
5277              * catches the other ones. */
5278             if (LB_table[LB_Space][after] - LB_SP_foo
5279                                             == LB_NOBREAK_EVEN_WITH_SP_BETWEEN)
5280             {
5281                 return FALSE;
5282             }
5283 
5284             /* If we get here, we have to XXX consider combining marks. */
5285             if (prev == LB_Combining_Mark) {
5286 
5287                 /* What happens with these depends on the character they
5288                  * follow.  */
5289                 do {
5290                     prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
5291                 }
5292                 while (prev == LB_Combining_Mark);
5293 
5294                 /* Most times these attach to and inherit the characteristics
5295                  * of that character, but not always, and when not, they are to
5296                  * be treated as AL by rule LB10. */
5297                 if (! LB_CM_ATTACHES_TO(prev)) {
5298                     prev = LB_Alphabetic;
5299                 }
5300             }
5301 
5302             /* Here, we have the character preceding the span of spaces all set
5303              * up.  We follow LB18: "Break after spaces" unless the table shows
5304              * that is overriden */
5305             return LB_table[prev][after] != LB_NOBREAK_EVEN_WITH_SP_BETWEEN;
5306 
5307         case LB_CM_ZWJ_foo:
5308 
5309             /* We don't know how to treat the CM except by looking at the first
5310              * non-CM character preceding it.  ZWJ is treated as CM */
5311             do {
5312                 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
5313             }
5314             while (prev == LB_Combining_Mark || prev == LB_ZWJ);
5315 
5316             /* Here, 'prev' is that first earlier non-CM character.  If the CM
5317              * attatches to it, then it inherits the behavior of 'prev'.  If it
5318              * doesn't attach, it is to be treated as an AL */
5319             if (! LB_CM_ATTACHES_TO(prev)) {
5320                 prev = LB_Alphabetic;
5321             }
5322 
5323             goto redo;
5324 
5325         case LB_HY_or_BA_then_foo + LB_BREAKABLE:
5326         case LB_HY_or_BA_then_foo + LB_NOBREAK:
5327 
5328             /* LB21a Don't break after Hebrew + Hyphen.
5329              * HL (HY | BA) × */
5330 
5331             if (backup_one_LB(strbeg, &temp_pos, utf8_target)
5332                                                           == LB_Hebrew_Letter)
5333             {
5334                 return FALSE;
5335             }
5336 
5337             return LB_table[prev][after] - LB_HY_or_BA_then_foo == LB_BREAKABLE;
5338 
5339         case LB_PR_or_PO_then_OP_or_HY + LB_BREAKABLE:
5340         case LB_PR_or_PO_then_OP_or_HY + LB_NOBREAK:
5341 
5342             /* LB25a (PR | PO) × ( OP | HY )? NU */
5343             if (advance_one_LB(&temp_pos, strend, utf8_target) == LB_Numeric) {
5344                 return FALSE;
5345             }
5346 
5347             return LB_table[prev][after] - LB_PR_or_PO_then_OP_or_HY
5348                                                                 == LB_BREAKABLE;
5349 
5350         case LB_SY_or_IS_then_various + LB_BREAKABLE:
5351         case LB_SY_or_IS_then_various + LB_NOBREAK:
5352         {
5353             /* LB25d NU (SY | IS)* × (NU | SY | IS | CL | CP ) */
5354 
5355             LB_enum temp = prev;
5356             do {
5357                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
5358             }
5359             while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric);
5360             if (temp == LB_Numeric) {
5361                 return FALSE;
5362             }
5363 
5364             return LB_table[prev][after] - LB_SY_or_IS_then_various
5365                                                                == LB_BREAKABLE;
5366         }
5367 
5368         case LB_various_then_PO_or_PR + LB_BREAKABLE:
5369         case LB_various_then_PO_or_PR + LB_NOBREAK:
5370         {
5371             /* LB25e NU (SY | IS)* (CL | CP)? × (PO | PR) */
5372 
5373             LB_enum temp = prev;
5374             if (temp == LB_Close_Punctuation || temp == LB_Close_Parenthesis)
5375             {
5376                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
5377             }
5378             while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric) {
5379                 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
5380             }
5381             if (temp == LB_Numeric) {
5382                 return FALSE;
5383             }
5384             return LB_various_then_PO_or_PR;
5385         }
5386 
5387         case LB_RI_then_RI + LB_NOBREAK:
5388         case LB_RI_then_RI + LB_BREAKABLE:
5389             {
5390                 int RI_count = 1;
5391 
5392                 /* LB30a Break between two regional indicator symbols if and
5393                  * only if there are an even number of regional indicators
5394                  * preceding the position of the break.
5395                  *
5396                  *    sot (RI RI)* RI × RI
5397                  *  [^RI] (RI RI)* RI × RI */
5398 
5399                 while (backup_one_LB(strbeg,
5400                                      &temp_pos,
5401                                      utf8_target) == LB_Regional_Indicator)
5402                 {
5403                     RI_count++;
5404                 }
5405 
5406                 return RI_count % 2 == 0;
5407             }
5408 
5409         default:
5410             break;
5411     }
5412 
5413 #ifdef DEBUGGING
5414     Perl_re_printf( aTHX_  "Unhandled LB pair: LB_table[%d, %d] = %d\n",
5415                                   before, after, LB_table[before][after]);
5416     assert(0);
5417 #endif
5418     return TRUE;
5419 }
5420 
5421 STATIC LB_enum
5422 S_advance_one_LB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
5423 {
5424 
5425     LB_enum lb;
5426 
5427     PERL_ARGS_ASSERT_ADVANCE_ONE_LB;
5428 
5429     if (*curpos >= strend) {
5430         return LB_EDGE;
5431     }
5432 
5433     if (utf8_target) {
5434         *curpos += UTF8SKIP(*curpos);
5435         if (*curpos >= strend) {
5436             return LB_EDGE;
5437         }
5438         lb = getLB_VAL_UTF8(*curpos, strend);
5439     }
5440     else {
5441         (*curpos)++;
5442         if (*curpos >= strend) {
5443             return LB_EDGE;
5444         }
5445         lb = getLB_VAL_CP(**curpos);
5446     }
5447 
5448     return lb;
5449 }
5450 
5451 STATIC LB_enum
5452 S_backup_one_LB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5453 {
5454     LB_enum lb;
5455 
5456     PERL_ARGS_ASSERT_BACKUP_ONE_LB;
5457 
5458     if (*curpos < strbeg) {
5459         return LB_EDGE;
5460     }
5461 
5462     if (utf8_target) {
5463         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5464         U8 * prev_prev_char_pos;
5465 
5466         if (! prev_char_pos) {
5467             return LB_EDGE;
5468         }
5469 
5470         if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
5471             lb = getLB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5472             *curpos = prev_char_pos;
5473             prev_char_pos = prev_prev_char_pos;
5474         }
5475         else {
5476             *curpos = (U8 *) strbeg;
5477             return LB_EDGE;
5478         }
5479     }
5480     else {
5481         if (*curpos - 2 < strbeg) {
5482             *curpos = (U8 *) strbeg;
5483             return LB_EDGE;
5484         }
5485         (*curpos)--;
5486         lb = getLB_VAL_CP(*(*curpos - 1));
5487     }
5488 
5489     return lb;
5490 }
5491 
5492 STATIC bool
5493 S_isSB(pTHX_ SB_enum before,
5494              SB_enum after,
5495              const U8 * const strbeg,
5496              const U8 * const curpos,
5497              const U8 * const strend,
5498              const bool utf8_target)
5499 {
5500     /* returns a boolean indicating if there is a Sentence Boundary Break
5501      * between the inputs.  See https://www.unicode.org/reports/tr29/ */
5502 
5503     U8 * lpos = (U8 *) curpos;
5504     bool has_para_sep = FALSE;
5505     bool has_sp = FALSE;
5506 
5507     PERL_ARGS_ASSERT_ISSB;
5508 
5509     /* Break at the start and end of text.
5510         SB1.  sot  ÷
5511         SB2.  ÷  eot
5512       But unstated in Unicode is don't break if the text is empty */
5513     if (before == SB_EDGE || after == SB_EDGE) {
5514         return before != after;
5515     }
5516 
5517     /* SB 3: Do not break within CRLF. */
5518     if (before == SB_CR && after == SB_LF) {
5519         return FALSE;
5520     }
5521 
5522     /* Break after paragraph separators.  CR and LF are considered
5523      * so because Unicode views text as like word processing text where there
5524      * are no newlines except between paragraphs, and the word processor takes
5525      * care of wrapping without there being hard line-breaks in the text *./
5526        SB4.  Sep | CR | LF  ÷ */
5527     if (before == SB_Sep || before == SB_CR || before == SB_LF) {
5528         return TRUE;
5529     }
5530 
5531     /* Ignore Format and Extend characters, except after sot, Sep, CR, or LF.
5532      * (See Section 6.2, Replacing Ignore Rules.)
5533         SB5.  X (Extend | Format)*  →  X */
5534     if (after == SB_Extend || after == SB_Format) {
5535 
5536         /* Implied is that the these characters attach to everything
5537          * immediately prior to them except for those separator-type
5538          * characters.  And the rules earlier have already handled the case
5539          * when one of those immediately precedes the extend char */
5540         return FALSE;
5541     }
5542 
5543     if (before == SB_Extend || before == SB_Format) {
5544         U8 * temp_pos = lpos;
5545         const SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
5546         if (   backup != SB_EDGE
5547             && backup != SB_Sep
5548             && backup != SB_CR
5549             && backup != SB_LF)
5550         {
5551             before = backup;
5552             lpos = temp_pos;
5553         }
5554 
5555         /* Here, both 'before' and 'backup' are these types; implied is that we
5556          * don't break between them */
5557         if (backup == SB_Extend || backup == SB_Format) {
5558             return FALSE;
5559         }
5560     }
5561 
5562     /* Do not break after ambiguous terminators like period, if they are
5563      * immediately followed by a number or lowercase letter, if they are
5564      * between uppercase letters, if the first following letter (optionally
5565      * after certain punctuation) is lowercase, or if they are followed by
5566      * "continuation" punctuation such as comma, colon, or semicolon. For
5567      * example, a period may be an abbreviation or numeric period, and thus may
5568      * not mark the end of a sentence.
5569 
5570      * SB6. ATerm  ×  Numeric */
5571     if (before == SB_ATerm && after == SB_Numeric) {
5572         return FALSE;
5573     }
5574 
5575     /* SB7.  (Upper | Lower) ATerm  ×  Upper */
5576     if (before == SB_ATerm && after == SB_Upper) {
5577         U8 * temp_pos = lpos;
5578         SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
5579         if (backup == SB_Upper || backup == SB_Lower) {
5580             return FALSE;
5581         }
5582     }
5583 
5584     /* The remaining rules that aren't the final one, all require an STerm or
5585      * an ATerm after having backed up over some Close* Sp*, and in one case an
5586      * optional Paragraph separator, although one rule doesn't have any Sp's in it.
5587      * So do that backup now, setting flags if either Sp or a paragraph
5588      * separator are found */
5589 
5590     if (before == SB_Sep || before == SB_CR || before == SB_LF) {
5591         has_para_sep = TRUE;
5592         before = backup_one_SB(strbeg, &lpos, utf8_target);
5593     }
5594 
5595     if (before == SB_Sp) {
5596         has_sp = TRUE;
5597         do {
5598             before = backup_one_SB(strbeg, &lpos, utf8_target);
5599         }
5600         while (before == SB_Sp);
5601     }
5602 
5603     while (before == SB_Close) {
5604         before = backup_one_SB(strbeg, &lpos, utf8_target);
5605     }
5606 
5607     /* The next few rules apply only when the backed-up-to is an ATerm, and in
5608      * most cases an STerm */
5609     if (before == SB_STerm || before == SB_ATerm) {
5610 
5611         /* So, here the lhs matches
5612          *      (STerm | ATerm) Close* Sp* (Sep | CR | LF)?
5613          * and we have set flags if we found an Sp, or the optional Sep,CR,LF.
5614          * The rules that apply here are:
5615          *
5616          * SB8    ATerm Close* Sp*  ×  ( ¬(OLetter | Upper | Lower | Sep | CR
5617                                            | LF | STerm | ATerm) )* Lower
5618            SB8a  (STerm | ATerm) Close* Sp*  ×  (SContinue | STerm | ATerm)
5619            SB9   (STerm | ATerm) Close*  ×  (Close | Sp | Sep | CR | LF)
5620            SB10  (STerm | ATerm) Close* Sp*  ×  (Sp | Sep | CR | LF)
5621            SB11  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  ÷
5622          */
5623 
5624         /* And all but SB11 forbid having seen a paragraph separator */
5625         if (! has_para_sep) {
5626             if (before == SB_ATerm) {          /* SB8 */
5627                 U8 * rpos = (U8 *) curpos;
5628                 SB_enum later = after;
5629 
5630                 while (    later != SB_OLetter
5631                         && later != SB_Upper
5632                         && later != SB_Lower
5633                         && later != SB_Sep
5634                         && later != SB_CR
5635                         && later != SB_LF
5636                         && later != SB_STerm
5637                         && later != SB_ATerm
5638                         && later != SB_EDGE)
5639                 {
5640                     later = advance_one_SB(&rpos, strend, utf8_target);
5641                 }
5642                 if (later == SB_Lower) {
5643                     return FALSE;
5644                 }
5645             }
5646 
5647             if (   after == SB_SContinue    /* SB8a */
5648                 || after == SB_STerm
5649                 || after == SB_ATerm)
5650             {
5651                 return FALSE;
5652             }
5653 
5654             if (! has_sp) {     /* SB9 applies only if there was no Sp* */
5655                 if (   after == SB_Close
5656                     || after == SB_Sp
5657                     || after == SB_Sep
5658                     || after == SB_CR
5659                     || after == SB_LF)
5660                 {
5661                     return FALSE;
5662                 }
5663             }
5664 
5665             /* SB10.  This and SB9 could probably be combined some way, but khw
5666              * has decided to follow the Unicode rule book precisely for
5667              * simplified maintenance */
5668             if (   after == SB_Sp
5669                 || after == SB_Sep
5670                 || after == SB_CR
5671                 || after == SB_LF)
5672             {
5673                 return FALSE;
5674             }
5675         }
5676 
5677         /* SB11.  */
5678         return TRUE;
5679     }
5680 
5681     /* Otherwise, do not break.
5682     SB12.  Any  ×  Any */
5683 
5684     return FALSE;
5685 }
5686 
5687 STATIC SB_enum
5688 S_advance_one_SB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
5689 {
5690     SB_enum sb;
5691 
5692     PERL_ARGS_ASSERT_ADVANCE_ONE_SB;
5693 
5694     if (*curpos >= strend) {
5695         return SB_EDGE;
5696     }
5697 
5698     if (utf8_target) {
5699         do {
5700             *curpos += UTF8SKIP(*curpos);
5701             if (*curpos >= strend) {
5702                 return SB_EDGE;
5703             }
5704             sb = getSB_VAL_UTF8(*curpos, strend);
5705         } while (sb == SB_Extend || sb == SB_Format);
5706     }
5707     else {
5708         do {
5709             (*curpos)++;
5710             if (*curpos >= strend) {
5711                 return SB_EDGE;
5712             }
5713             sb = getSB_VAL_CP(**curpos);
5714         } while (sb == SB_Extend || sb == SB_Format);
5715     }
5716 
5717     return sb;
5718 }
5719 
5720 STATIC SB_enum
5721 S_backup_one_SB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5722 {
5723     SB_enum sb;
5724 
5725     PERL_ARGS_ASSERT_BACKUP_ONE_SB;
5726 
5727     if (*curpos < strbeg) {
5728         return SB_EDGE;
5729     }
5730 
5731     if (utf8_target) {
5732         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5733         if (! prev_char_pos) {
5734             return SB_EDGE;
5735         }
5736 
5737         /* Back up over Extend and Format.  curpos is always just to the right
5738          * of the characater whose value we are getting */
5739         do {
5740             U8 * prev_prev_char_pos;
5741             if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1,
5742                                                                       strbeg)))
5743             {
5744                 sb = getSB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5745                 *curpos = prev_char_pos;
5746                 prev_char_pos = prev_prev_char_pos;
5747             }
5748             else {
5749                 *curpos = (U8 *) strbeg;
5750                 return SB_EDGE;
5751             }
5752         } while (sb == SB_Extend || sb == SB_Format);
5753     }
5754     else {
5755         do {
5756             if (*curpos - 2 < strbeg) {
5757                 *curpos = (U8 *) strbeg;
5758                 return SB_EDGE;
5759             }
5760             (*curpos)--;
5761             sb = getSB_VAL_CP(*(*curpos - 1));
5762         } while (sb == SB_Extend || sb == SB_Format);
5763     }
5764 
5765     return sb;
5766 }
5767 
5768 STATIC bool
5769 S_isWB(pTHX_ WB_enum previous,
5770              WB_enum before,
5771              WB_enum after,
5772              const U8 * const strbeg,
5773              const U8 * const curpos,
5774              const U8 * const strend,
5775              const bool utf8_target)
5776 {
5777     /*  Return a boolean as to if the boundary between 'before' and 'after' is
5778      *  a Unicode word break, using their published algorithm, but tailored for
5779      *  Perl by treating spans of white space as one unit.  Context may be
5780      *  needed to make this determination.  If the value for the character
5781      *  before 'before' is known, it is passed as 'previous'; otherwise that
5782      *  should be set to WB_UNKNOWN.  The other input parameters give the
5783      *  boundaries and current position in the matching of the string.  That
5784      *  is, 'curpos' marks the position where the character whose wb value is
5785      *  'after' begins.  See http://www.unicode.org/reports/tr29/ */
5786 
5787     U8 * before_pos = (U8 *) curpos;
5788     U8 * after_pos = (U8 *) curpos;
5789     WB_enum prev = before;
5790     WB_enum next;
5791 
5792     PERL_ARGS_ASSERT_ISWB;
5793 
5794     /* Rule numbers in the comments below are as of Unicode 9.0 */
5795 
5796   redo:
5797     before = prev;
5798     switch (WB_table[before][after]) {
5799         case WB_BREAKABLE:
5800             return TRUE;
5801 
5802         case WB_NOBREAK:
5803             return FALSE;
5804 
5805         case WB_hs_then_hs:     /* 2 horizontal spaces in a row */
5806             next = advance_one_WB(&after_pos, strend, utf8_target,
5807                                  FALSE /* Don't skip Extend nor Format */ );
5808             /* A space immediately preceding an Extend or Format is attached
5809              * to by them, and hence gets separated from previous spaces.
5810              * Otherwise don't break between horizontal white space */
5811             return next == WB_Extend || next == WB_Format;
5812 
5813         /* WB4 Ignore Format and Extend characters, except when they appear at
5814          * the beginning of a region of text.  This code currently isn't
5815          * general purpose, but it works as the rules are currently and likely
5816          * to be laid out.  The reason it works is that when 'they appear at
5817          * the beginning of a region of text', the rule is to break before
5818          * them, just like any other character.  Therefore, the default rule
5819          * applies and we don't have to look in more depth.  Should this ever
5820          * change, we would have to have 2 'case' statements, like in the rules
5821          * below, and backup a single character (not spacing over the extend
5822          * ones) and then see if that is one of the region-end characters and
5823          * go from there */
5824         case WB_Ex_or_FO_or_ZWJ_then_foo:
5825             prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5826             goto redo;
5827 
5828         case WB_DQ_then_HL + WB_BREAKABLE:
5829         case WB_DQ_then_HL + WB_NOBREAK:
5830 
5831             /* WB7c  Hebrew_Letter Double_Quote  ×  Hebrew_Letter */
5832 
5833             if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5834                                                             == WB_Hebrew_Letter)
5835             {
5836                 return FALSE;
5837             }
5838 
5839              return WB_table[before][after] - WB_DQ_then_HL == WB_BREAKABLE;
5840 
5841         case WB_HL_then_DQ + WB_BREAKABLE:
5842         case WB_HL_then_DQ + WB_NOBREAK:
5843 
5844             /* WB7b  Hebrew_Letter  ×  Double_Quote Hebrew_Letter */
5845 
5846             if (advance_one_WB(&after_pos, strend, utf8_target,
5847                                        TRUE /* Do skip Extend and Format */ )
5848                                                             == WB_Hebrew_Letter)
5849             {
5850                 return FALSE;
5851             }
5852 
5853             return WB_table[before][after] - WB_HL_then_DQ == WB_BREAKABLE;
5854 
5855         case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_NOBREAK:
5856         case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_BREAKABLE:
5857 
5858             /* WB6  (ALetter | Hebrew_Letter)  ×  (MidLetter | MidNumLet
5859              *       | Single_Quote) (ALetter | Hebrew_Letter) */
5860 
5861             next = advance_one_WB(&after_pos, strend, utf8_target,
5862                                        TRUE /* Do skip Extend and Format */ );
5863 
5864             if (next == WB_ALetter || next == WB_Hebrew_Letter)
5865             {
5866                 return FALSE;
5867             }
5868 
5869             return WB_table[before][after]
5870                             - WB_LE_or_HL_then_MB_or_ML_or_SQ == WB_BREAKABLE;
5871 
5872         case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_NOBREAK:
5873         case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_BREAKABLE:
5874 
5875             /* WB7  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet
5876              *       | Single_Quote)  ×  (ALetter | Hebrew_Letter) */
5877 
5878             prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5879             if (prev == WB_ALetter || prev == WB_Hebrew_Letter)
5880             {
5881                 return FALSE;
5882             }
5883 
5884             return WB_table[before][after]
5885                             - WB_MB_or_ML_or_SQ_then_LE_or_HL == WB_BREAKABLE;
5886 
5887         case WB_MB_or_MN_or_SQ_then_NU + WB_NOBREAK:
5888         case WB_MB_or_MN_or_SQ_then_NU + WB_BREAKABLE:
5889 
5890             /* WB11  Numeric (MidNum | (MidNumLet | Single_Quote))  ×  Numeric
5891              * */
5892 
5893             if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5894                                                             == WB_Numeric)
5895             {
5896                 return FALSE;
5897             }
5898 
5899             return WB_table[before][after]
5900                                 - WB_MB_or_MN_or_SQ_then_NU == WB_BREAKABLE;
5901 
5902         case WB_NU_then_MB_or_MN_or_SQ + WB_NOBREAK:
5903         case WB_NU_then_MB_or_MN_or_SQ + WB_BREAKABLE:
5904 
5905             /* WB12  Numeric  ×  (MidNum | MidNumLet | Single_Quote) Numeric */
5906 
5907             if (advance_one_WB(&after_pos, strend, utf8_target,
5908                                        TRUE /* Do skip Extend and Format */ )
5909                                                             == WB_Numeric)
5910             {
5911                 return FALSE;
5912             }
5913 
5914             return WB_table[before][after]
5915                                 - WB_NU_then_MB_or_MN_or_SQ == WB_BREAKABLE;
5916 
5917         case WB_RI_then_RI + WB_NOBREAK:
5918         case WB_RI_then_RI + WB_BREAKABLE:
5919             {
5920                 int RI_count = 1;
5921 
5922                 /* Do not break within emoji flag sequences. That is, do not
5923                  * break between regional indicator (RI) symbols if there is an
5924                  * odd number of RI characters before the potential break
5925                  * point.
5926                  *
5927                  * WB15   sot (RI RI)* RI × RI
5928                  * WB16 [^RI] (RI RI)* RI × RI */
5929 
5930                 while (backup_one_WB(&previous,
5931                                      strbeg,
5932                                      &before_pos,
5933                                      utf8_target) == WB_Regional_Indicator)
5934                 {
5935                     RI_count++;
5936                 }
5937 
5938                 return RI_count % 2 != 1;
5939             }
5940 
5941         default:
5942             break;
5943     }
5944 
5945 #ifdef DEBUGGING
5946     Perl_re_printf( aTHX_  "Unhandled WB pair: WB_table[%d, %d] = %d\n",
5947                                   before, after, WB_table[before][after]);
5948     assert(0);
5949 #endif
5950     return TRUE;
5951 }
5952 
5953 STATIC WB_enum
5954 S_advance_one_WB(pTHX_ U8 ** curpos,
5955                        const U8 * const strend,
5956                        const bool utf8_target,
5957                        const bool skip_Extend_Format)
5958 {
5959     WB_enum wb;
5960 
5961     PERL_ARGS_ASSERT_ADVANCE_ONE_WB;
5962 
5963     if (*curpos >= strend) {
5964         return WB_EDGE;
5965     }
5966 
5967     if (utf8_target) {
5968 
5969         /* Advance over Extend and Format */
5970         do {
5971             *curpos += UTF8SKIP(*curpos);
5972             if (*curpos >= strend) {
5973                 return WB_EDGE;
5974             }
5975             wb = getWB_VAL_UTF8(*curpos, strend);
5976         } while (    skip_Extend_Format
5977                  && (wb == WB_Extend || wb == WB_Format));
5978     }
5979     else {
5980         do {
5981             (*curpos)++;
5982             if (*curpos >= strend) {
5983                 return WB_EDGE;
5984             }
5985             wb = getWB_VAL_CP(**curpos);
5986         } while (    skip_Extend_Format
5987                  && (wb == WB_Extend || wb == WB_Format));
5988     }
5989 
5990     return wb;
5991 }
5992 
5993 STATIC WB_enum
5994 S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5995 {
5996     WB_enum wb;
5997 
5998     PERL_ARGS_ASSERT_BACKUP_ONE_WB;
5999 
6000     /* If we know what the previous character's break value is, don't have
6001         * to look it up */
6002     if (*previous != WB_UNKNOWN) {
6003         wb = *previous;
6004 
6005         /* But we need to move backwards by one */
6006         if (utf8_target) {
6007             *curpos = reghopmaybe3(*curpos, -1, strbeg);
6008             if (! *curpos) {
6009                 *previous = WB_EDGE;
6010                 *curpos = (U8 *) strbeg;
6011             }
6012             else {
6013                 *previous = WB_UNKNOWN;
6014             }
6015         }
6016         else {
6017             (*curpos)--;
6018             *previous = (*curpos <= strbeg) ? WB_EDGE : WB_UNKNOWN;
6019         }
6020 
6021         /* And we always back up over these three types */
6022         if (wb != WB_Extend && wb != WB_Format && wb != WB_ZWJ) {
6023             return wb;
6024         }
6025     }
6026 
6027     if (*curpos < strbeg) {
6028         return WB_EDGE;
6029     }
6030 
6031     if (utf8_target) {
6032         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
6033         if (! prev_char_pos) {
6034             return WB_EDGE;
6035         }
6036 
6037         /* Back up over Extend and Format.  curpos is always just to the right
6038          * of the characater whose value we are getting */
6039         do {
6040             U8 * prev_prev_char_pos;
6041             if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos,
6042                                                    -1,
6043                                                    strbeg)))
6044             {
6045                 wb = getWB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
6046                 *curpos = prev_char_pos;
6047                 prev_char_pos = prev_prev_char_pos;
6048             }
6049             else {
6050                 *curpos = (U8 *) strbeg;
6051                 return WB_EDGE;
6052             }
6053         } while (wb == WB_Extend || wb == WB_Format || wb == WB_ZWJ);
6054     }
6055     else {
6056         do {
6057             if (*curpos - 2 < strbeg) {
6058                 *curpos = (U8 *) strbeg;
6059                 return WB_EDGE;
6060             }
6061             (*curpos)--;
6062             wb = getWB_VAL_CP(*(*curpos - 1));
6063         } while (wb == WB_Extend || wb == WB_Format);
6064     }
6065 
6066     return wb;
6067 }
6068 
6069 /* Macros for regmatch(), using its internal variables */
6070 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
6071 #define NEXTCHR_IS_EOS (nextbyte < 0)
6072 
6073 #define SET_nextchr \
6074     nextbyte = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
6075 
6076 #define SET_locinput(p) \
6077     locinput = (p);  \
6078     SET_nextchr
6079 
6080 #define sayYES goto yes
6081 #define sayNO goto no
6082 #define sayNO_SILENT goto no_silent
6083 
6084 /* we dont use STMT_START/END here because it leads to
6085    "unreachable code" warnings, which are bogus, but distracting. */
6086 #define CACHEsayNO \
6087     if (ST.cache_mask) \
6088        reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
6089     sayNO
6090 
6091 #define EVAL_CLOSE_PAREN_IS(st,expr)                        \
6092 (                                                           \
6093     (   ( st )                                         ) && \
6094     (   ( st )->u.eval.close_paren                     ) && \
6095     ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) )    \
6096 )
6097 
6098 #define EVAL_CLOSE_PAREN_IS_TRUE(st,expr)                   \
6099 (                                                           \
6100     (   ( st )                                         ) && \
6101     (   ( st )->u.eval.close_paren                     ) && \
6102     (   ( expr )                                       ) && \
6103     ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) )    \
6104 )
6105 
6106 
6107 #define EVAL_CLOSE_PAREN_SET(st,expr) \
6108     (st)->u.eval.close_paren = ( (expr) + 1 )
6109 
6110 #define EVAL_CLOSE_PAREN_CLEAR(st) \
6111     (st)->u.eval.close_paren = 0
6112 
6113 /* push a new state then goto it */
6114 
6115 #define PUSH_STATE_GOTO(state, node, input, eol, sr0)       \
6116     pushinput = input; \
6117     pusheol = eol; \
6118     pushsr0 = sr0; \
6119     scan = node; \
6120     st->resume_state = state; \
6121     goto push_state;
6122 
6123 /* push a new state with success backtracking, then goto it */
6124 
6125 #define PUSH_YES_STATE_GOTO(state, node, input, eol, sr0)   \
6126     pushinput = input; \
6127     pusheol = eol;     \
6128     pushsr0 = sr0; \
6129     scan = node; \
6130     st->resume_state = state; \
6131     goto push_yes_state;
6132 
6133 #define DEBUG_STATE_pp(pp)                                  \
6134     DEBUG_STATE_r({                                         \
6135         DUMP_EXEC_POS(locinput, scan, utf8_target,depth);   \
6136         Perl_re_printf( aTHX_                               \
6137             "%*s" pp " %s%s%s%s%s\n",                       \
6138             INDENT_CHARS(depth), "",                        \
6139             PL_reg_name[st->resume_state],                  \
6140             ((st==yes_state||st==mark_state) ? "[" : ""),   \
6141             ((st==yes_state) ? "Y" : ""),                   \
6142             ((st==mark_state) ? "M" : ""),                  \
6143             ((st==yes_state||st==mark_state) ? "]" : "")    \
6144         );                                                  \
6145     });
6146 
6147 /*
6148 
6149 regmatch() - main matching routine
6150 
6151 This is basically one big switch statement in a loop. We execute an op,
6152 set 'next' to point the next op, and continue. If we come to a point which
6153 we may need to backtrack to on failure such as (A|B|C), we push a
6154 backtrack state onto the backtrack stack. On failure, we pop the top
6155 state, and re-enter the loop at the state indicated. If there are no more
6156 states to pop, we return failure.
6157 
6158 Sometimes we also need to backtrack on success; for example /A+/, where
6159 after successfully matching one A, we need to go back and try to
6160 match another one; similarly for lookahead assertions: if the assertion
6161 completes successfully, we backtrack to the state just before the assertion
6162 and then carry on.  In these cases, the pushed state is marked as
6163 'backtrack on success too'. This marking is in fact done by a chain of
6164 pointers, each pointing to the previous 'yes' state. On success, we pop to
6165 the nearest yes state, discarding any intermediate failure-only states.
6166 Sometimes a yes state is pushed just to force some cleanup code to be
6167 called at the end of a successful match or submatch; e.g. (??{$re}) uses
6168 it to free the inner regex.
6169 
6170 Note that failure backtracking rewinds the cursor position, while
6171 success backtracking leaves it alone.
6172 
6173 A pattern is complete when the END op is executed, while a subpattern
6174 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
6175 ops trigger the "pop to last yes state if any, otherwise return true"
6176 behaviour.
6177 
6178 A common convention in this function is to use A and B to refer to the two
6179 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
6180 the subpattern to be matched possibly multiple times, while B is the entire
6181 rest of the pattern. Variable and state names reflect this convention.
6182 
6183 The states in the main switch are the union of ops and failure/success of
6184 substates associated with that op.  For example, IFMATCH is the op
6185 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
6186 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
6187 successfully matched A and IFMATCH_A_fail is a state saying that we have
6188 just failed to match A. Resume states always come in pairs. The backtrack
6189 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
6190 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
6191 on success or failure.
6192 
6193 The struct that holds a backtracking state is actually a big union, with
6194 one variant for each major type of op. The variable st points to the
6195 top-most backtrack struct. To make the code clearer, within each
6196 block of code we #define ST to alias the relevant union.
6197 
6198 Here's a concrete example of a (vastly oversimplified) IFMATCH
6199 implementation:
6200 
6201     switch (state) {
6202     ....
6203 
6204 #define ST st->u.ifmatch
6205 
6206     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
6207         ST.foo = ...; // some state we wish to save
6208         ...
6209         // push a yes backtrack state with a resume value of
6210         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
6211         // first node of A:
6212         PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
6213         // NOTREACHED
6214 
6215     case IFMATCH_A: // we have successfully executed A; now continue with B
6216         next = B;
6217         bar = ST.foo; // do something with the preserved value
6218         break;
6219 
6220     case IFMATCH_A_fail: // A failed, so the assertion failed
6221         ...;   // do some housekeeping, then ...
6222         sayNO; // propagate the failure
6223 
6224 #undef ST
6225 
6226     ...
6227     }
6228 
6229 For any old-timers reading this who are familiar with the old recursive
6230 approach, the code above is equivalent to:
6231 
6232     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
6233     {
6234         int foo = ...
6235         ...
6236         if (regmatch(A)) {
6237             next = B;
6238             bar = foo;
6239             break;
6240         }
6241         ...;   // do some housekeeping, then ...
6242         sayNO; // propagate the failure
6243     }
6244 
6245 The topmost backtrack state, pointed to by st, is usually free. If you
6246 want to claim it, populate any ST.foo fields in it with values you wish to
6247 save, then do one of
6248 
6249         PUSH_STATE_GOTO(resume_state, node, newinput, new_eol);
6250         PUSH_YES_STATE_GOTO(resume_state, node, newinput, new_eol);
6251 
6252 which sets that backtrack state's resume value to 'resume_state', pushes a
6253 new free entry to the top of the backtrack stack, then goes to 'node'.
6254 On backtracking, the free slot is popped, and the saved state becomes the
6255 new free state. An ST.foo field in this new top state can be temporarily
6256 accessed to retrieve values, but once the main loop is re-entered, it
6257 becomes available for reuse.
6258 
6259 Note that the depth of the backtrack stack constantly increases during the
6260 left-to-right execution of the pattern, rather than going up and down with
6261 the pattern nesting. For example the stack is at its maximum at Z at the
6262 end of the pattern, rather than at X in the following:
6263 
6264     /(((X)+)+)+....(Y)+....Z/
6265 
6266 The only exceptions to this are lookahead/behind assertions and the cut,
6267 (?>A), which pop all the backtrack states associated with A before
6268 continuing.
6269 
6270 Backtrack state structs are allocated in slabs of about 4K in size.
6271 PL_regmatch_state and st always point to the currently active state,
6272 and PL_regmatch_slab points to the slab currently containing
6273 PL_regmatch_state.  The first time regmatch() is called, the first slab is
6274 allocated, and is never freed until interpreter destruction. When the slab
6275 is full, a new one is allocated and chained to the end. At exit from
6276 regmatch(), slabs allocated since entry are freed.
6277 
6278 In order to work with variable length lookbehinds, an upper limit is placed on
6279 lookbehinds which is set to where the match position is at the end of where the
6280 lookbehind would get to.  Nothing in the lookbehind should match above that,
6281 except we should be able to look beyond if for things like \b, which need the
6282 next character in the string to be able to determine if this is a boundary or
6283 not.  We also can't match the end of string/line unless we are also at the end
6284 of the entire string, so NEXTCHR_IS_EOS remains the same, and for those OPs
6285 that match a width, we have to add a condition that they are within the legal
6286 bounds of our window into the string.
6287 
6288 */
6289 
6290 /* returns -1 on failure, $+[0] on success */
6291 STATIC SSize_t
6292 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
6293 {
6294     const bool utf8_target = reginfo->is_utf8_target;
6295     const U32 uniflags = UTF8_ALLOW_DEFAULT;
6296     REGEXP *rex_sv = reginfo->prog;
6297     regexp *rex = ReANY(rex_sv);
6298     RXi_GET_DECL(rex,rexi);
6299     /* the current state. This is a cached copy of PL_regmatch_state */
6300     regmatch_state *st;
6301     /* cache heavy used fields of st in registers */
6302     regnode *scan;
6303     regnode *next;
6304     U32 n = 0;	/* general value; init to avoid compiler warning */
6305     U32 utmp = 0;  /* tmp variable - valid for at most one opcode */
6306     SSize_t ln = 0; /* len or last;  init to avoid compiler warning */
6307     SSize_t endref = 0; /* offset of end of backref when ln is start */
6308     char *locinput = startpos;
6309     char *loceol = reginfo->strend;
6310     char *pushinput; /* where to continue after a PUSH */
6311     char *pusheol;   /* where to stop matching (loceol) after a PUSH */
6312     U8   *pushsr0;   /* save starting pos of script run */
6313     PERL_INT_FAST16_T nextbyte;   /* is always set to UCHARAT(locinput), or -1
6314                                      at EOS */
6315 
6316     bool result = 0;	    /* return value of S_regmatch */
6317     U32 depth = 0;            /* depth of backtrack stack */
6318     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
6319     const U32 max_nochange_depth =
6320         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
6321         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
6322     regmatch_state *yes_state = NULL; /* state to pop to on success of
6323                                                             subpattern */
6324     /* mark_state piggy backs on the yes_state logic so that when we unwind
6325        the stack on success we can update the mark_state as we go */
6326     regmatch_state *mark_state = NULL; /* last mark state we have seen */
6327     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
6328     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
6329     U32 state_num;
6330     bool no_final = 0;      /* prevent failure from backtracking? */
6331     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
6332     char *startpoint = locinput;
6333     SV *popmark = NULL;     /* are we looking for a mark? */
6334     SV *sv_commit = NULL;   /* last mark name seen in failure */
6335     SV *sv_yes_mark = NULL; /* last mark name we have seen
6336                                during a successful match */
6337     U32 lastopen = 0;       /* last open we saw */
6338     bool has_cutgroup = RXp_HAS_CUTGROUP(rex) ? 1 : 0;
6339     SV* const oreplsv = GvSVn(PL_replgv);
6340     /* these three flags are set by various ops to signal information to
6341      * the very next op. They have a useful lifetime of exactly one loop
6342      * iteration, and are not preserved or restored by state pushes/pops
6343      */
6344     bool sw = 0;	    /* the condition value in (?(cond)a|b) */
6345     bool minmod = 0;	    /* the next "{n,m}" is a "{n,m}?" */
6346     int logical = 0;	    /* the following EVAL is:
6347                                 0: (?{...})
6348                                 1: (?(?{...})X|Y)
6349                                 2: (??{...})
6350                                or the following IFMATCH/UNLESSM is:
6351                                 false: plain (?=foo)
6352                                 true:  used as a condition: (?(?=foo))
6353                             */
6354     PAD* last_pad = NULL;
6355     dMULTICALL;
6356     U8 gimme = G_SCALAR;
6357     CV *caller_cv = NULL;	/* who called us */
6358     CV *last_pushed_cv = NULL;	/* most recently called (?{}) CV */
6359     U32 maxopenparen = 0;       /* max '(' index seen so far */
6360     int to_complement;  /* Invert the result? */
6361     _char_class_number classnum;
6362     bool is_utf8_pat = reginfo->is_utf8_pat;
6363     bool match = FALSE;
6364     I32 orig_savestack_ix = PL_savestack_ix;
6365     U8 * script_run_begin = NULL;
6366     char *match_end= NULL; /* where a match MUST end to be considered successful */
6367     bool is_accepted = FALSE; /* have we hit an ACCEPT opcode? */
6368 
6369 /* Solaris Studio 12.3 messes up fetching PL_charclass['\n'] */
6370 #if (defined(__SUNPRO_C) && (__SUNPRO_C == 0x5120) && defined(__x86_64) && defined(USE_64_BIT_ALL))
6371 #  define SOLARIS_BAD_OPTIMIZER
6372     const U32 *pl_charclass_dup = PL_charclass;
6373 #  define PL_charclass pl_charclass_dup
6374 #endif
6375 
6376 #ifdef DEBUGGING
6377     DECLARE_AND_GET_RE_DEBUG_FLAGS;
6378 #endif
6379 
6380     /* protect against undef(*^R) */
6381     SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv));
6382 
6383     /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
6384     multicall_oldcatch = 0;
6385     PERL_UNUSED_VAR(multicall_cop);
6386 
6387     PERL_ARGS_ASSERT_REGMATCH;
6388 
6389     st = PL_regmatch_state;
6390 
6391     /* Note that nextbyte is a byte even in UTF */
6392     SET_nextchr;
6393     scan = prog;
6394 
6395     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
6396             DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
6397             Perl_re_printf( aTHX_ "regmatch start\n" );
6398     }));
6399 
6400     while (scan != NULL) {
6401         next = scan + NEXT_OFF(scan);
6402         if (next == scan)
6403             next = NULL;
6404         state_num = OP(scan);
6405 
6406       reenter_switch:
6407         DEBUG_EXECUTE_r(
6408             if (state_num <= REGNODE_MAX) {
6409                 SV * const prop = sv_newmortal();
6410                 regnode *rnext = regnext(scan);
6411 
6412                 DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
6413                 regprop(rex, prop, scan, reginfo, NULL);
6414                 Perl_re_printf( aTHX_
6415                     "%*s%" IVdf ":%s(%" IVdf ")\n",
6416                     INDENT_CHARS(depth), "",
6417                     (IV)(scan - rexi->program),
6418                     SvPVX_const(prop),
6419                     (PL_regkind[OP(scan)] == END || !rnext) ?
6420                         0 : (IV)(rnext - rexi->program));
6421             }
6422         );
6423 
6424         to_complement = 0;
6425 
6426         SET_nextchr;
6427         assert(nextbyte < 256 && (nextbyte >= 0 || nextbyte == NEXTCHR_EOS));
6428 
6429         switch (state_num) {
6430         case SBOL: /*  /^../ and /\A../  */
6431             if (locinput == reginfo->strbeg)
6432                 break;
6433             sayNO;
6434 
6435         case MBOL: /*  /^../m  */
6436             if (locinput == reginfo->strbeg ||
6437                 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
6438             {
6439                 break;
6440             }
6441             sayNO;
6442 
6443         case GPOS: /*  \G  */
6444             if (locinput == reginfo->ganch)
6445                 break;
6446             sayNO;
6447 
6448         case KEEPS: /*   \K  */
6449             /* update the startpoint */
6450             st->u.keeper.val = rex->offs[0].start;
6451             rex->offs[0].start = locinput - reginfo->strbeg;
6452             PUSH_STATE_GOTO(KEEPS_next, next, locinput, loceol,
6453                             script_run_begin);
6454             NOT_REACHED; /* NOTREACHED */
6455 
6456         case KEEPS_next_fail:
6457             /* rollback the start point change */
6458             rex->offs[0].start = st->u.keeper.val;
6459             sayNO_SILENT;
6460             NOT_REACHED; /* NOTREACHED */
6461 
6462         case MEOL: /* /..$/m  */
6463             if (!NEXTCHR_IS_EOS && nextbyte != '\n')
6464                 sayNO;
6465             break;
6466 
6467         case SEOL: /* /..$/  */
6468             if (!NEXTCHR_IS_EOS && nextbyte != '\n')
6469                 sayNO;
6470             if (reginfo->strend - locinput > 1)
6471                 sayNO;
6472             break;
6473 
6474         case EOS: /*  \z  */
6475             if (!NEXTCHR_IS_EOS)
6476                 sayNO;
6477             break;
6478 
6479         case SANY: /*  /./s  */
6480             if (NEXTCHR_IS_EOS || locinput >= loceol)
6481                 sayNO;
6482             goto increment_locinput;
6483 
6484         case REG_ANY: /*  /./  */
6485             if (   NEXTCHR_IS_EOS
6486                 || locinput >= loceol
6487                 || nextbyte == '\n')
6488             {
6489                 sayNO;
6490             }
6491             goto increment_locinput;
6492 
6493 
6494 #undef  ST
6495 #define ST st->u.trie
6496         case TRIEC: /* (ab|cd) with known charclass */
6497             /* In this case the charclass data is available inline so
6498                we can fail fast without a lot of extra overhead.
6499              */
6500             if ( !   NEXTCHR_IS_EOS
6501                 &&   locinput < loceol
6502                 && ! ANYOF_BITMAP_TEST(scan, nextbyte))
6503             {
6504                 DEBUG_EXECUTE_r(
6505                     Perl_re_exec_indentf( aTHX_  "%sTRIE: failed to match trie start class...%s\n",
6506                               depth, PL_colors[4], PL_colors[5])
6507                 );
6508                 sayNO_SILENT;
6509                 NOT_REACHED; /* NOTREACHED */
6510             }
6511             /* FALLTHROUGH */
6512         case TRIE:  /* (ab|cd)  */
6513             /* the basic plan of execution of the trie is:
6514              * At the beginning, run though all the states, and
6515              * find the longest-matching word. Also remember the position
6516              * of the shortest matching word. For example, this pattern:
6517              *    1  2 3 4    5
6518              *    ab|a|x|abcd|abc
6519              * when matched against the string "abcde", will generate
6520              * accept states for all words except 3, with the longest
6521              * matching word being 4, and the shortest being 2 (with
6522              * the position being after char 1 of the string).
6523              *
6524              * Then for each matching word, in word order (i.e. 1,2,4,5),
6525              * we run the remainder of the pattern; on each try setting
6526              * the current position to the character following the word,
6527              * returning to try the next word on failure.
6528              *
6529              * We avoid having to build a list of words at runtime by
6530              * using a compile-time structure, wordinfo[].prev, which
6531              * gives, for each word, the previous accepting word (if any).
6532              * In the case above it would contain the mappings 1->2, 2->0,
6533              * 3->0, 4->5, 5->1.  We can use this table to generate, from
6534              * the longest word (4 above), a list of all words, by
6535              * following the list of prev pointers; this gives us the
6536              * unordered list 4,5,1,2. Then given the current word we have
6537              * just tried, we can go through the list and find the
6538              * next-biggest word to try (so if we just failed on word 2,
6539              * the next in the list is 4).
6540              *
6541              * Since at runtime we don't record the matching position in
6542              * the string for each word, we have to work that out for
6543              * each word we're about to process. The wordinfo table holds
6544              * the character length of each word; given that we recorded
6545              * at the start: the position of the shortest word and its
6546              * length in chars, we just need to move the pointer the
6547              * difference between the two char lengths. Depending on
6548              * Unicode status and folding, that's cheap or expensive.
6549              *
6550              * This algorithm is optimised for the case where are only a
6551              * small number of accept states, i.e. 0,1, or maybe 2.
6552              * With lots of accepts states, and having to try all of them,
6553              * it becomes quadratic on number of accept states to find all
6554              * the next words.
6555              */
6556 
6557             {
6558                 /* what type of TRIE am I? (utf8 makes this contextual) */
6559                 DECL_TRIE_TYPE(scan);
6560 
6561                 /* what trie are we using right now */
6562                 reg_trie_data * const trie
6563                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
6564                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
6565                 U32 state = trie->startstate;
6566 
6567                 if (scan->flags == EXACTL || scan->flags == EXACTFLU8) {
6568                     _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6569                     if (utf8_target
6570                         && ! NEXTCHR_IS_EOS
6571                         && UTF8_IS_ABOVE_LATIN1(nextbyte)
6572                         && scan->flags == EXACTL)
6573                     {
6574                         /* We only output for EXACTL, as we let the folder
6575                          * output this message for EXACTFLU8 to avoid
6576                          * duplication */
6577                         _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
6578                                                                reginfo->strend);
6579                     }
6580                 }
6581                 if (   trie->bitmap
6582                     && (     NEXTCHR_IS_EOS
6583                         ||   locinput >= loceol
6584                         || ! TRIE_BITMAP_TEST(trie, nextbyte)))
6585                 {
6586                     if (trie->states[ state ].wordnum) {
6587                          DEBUG_EXECUTE_r(
6588                             Perl_re_exec_indentf( aTHX_  "%sTRIE: matched empty string...%s\n",
6589                                           depth, PL_colors[4], PL_colors[5])
6590                         );
6591                         if (!trie->jump)
6592                             break;
6593                     } else {
6594                         DEBUG_EXECUTE_r(
6595                             Perl_re_exec_indentf( aTHX_  "%sTRIE: failed to match trie start class...%s\n",
6596                                           depth, PL_colors[4], PL_colors[5])
6597                         );
6598                         sayNO_SILENT;
6599                    }
6600                 }
6601 
6602             {
6603                 U8 *uc = ( U8* )locinput;
6604 
6605                 STRLEN len = 0;
6606                 STRLEN foldlen = 0;
6607                 U8 *uscan = (U8*)NULL;
6608                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
6609                 U32 charcount = 0; /* how many input chars we have matched */
6610                 U32 accepted = 0; /* have we seen any accepting states? */
6611 
6612                 ST.jump = trie->jump;
6613                 ST.me = scan;
6614                 ST.firstpos = NULL;
6615                 ST.longfold = FALSE; /* char longer if folded => it's harder */
6616                 ST.nextword = 0;
6617 
6618                 /* fully traverse the TRIE; note the position of the
6619                    shortest accept state and the wordnum of the longest
6620                    accept state */
6621 
6622                 while ( state && uc <= (U8*)(loceol) ) {
6623                     U32 base = trie->states[ state ].trans.base;
6624                     UV uvc = 0;
6625                     U16 charid = 0;
6626                     U16 wordnum;
6627                     wordnum = trie->states[ state ].wordnum;
6628 
6629                     if (wordnum) { /* it's an accept state */
6630                         if (!accepted) {
6631                             accepted = 1;
6632                             /* record first match position */
6633                             if (ST.longfold) {
6634                                 ST.firstpos = (U8*)locinput;
6635                                 ST.firstchars = 0;
6636                             }
6637                             else {
6638                                 ST.firstpos = uc;
6639                                 ST.firstchars = charcount;
6640                             }
6641                         }
6642                         if (!ST.nextword || wordnum < ST.nextword)
6643                             ST.nextword = wordnum;
6644                         ST.topword = wordnum;
6645                     }
6646 
6647                     DEBUG_TRIE_EXECUTE_r({
6648                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target, depth );
6649                                 /* HERE */
6650                                 PerlIO_printf( Perl_debug_log,
6651                                     "%*s%sTRIE: State: %4" UVxf " Accepted: %c ",
6652                                     INDENT_CHARS(depth), "", PL_colors[4],
6653                                     (UV)state, (accepted ? 'Y' : 'N'));
6654                     });
6655 
6656                     /* read a char and goto next state */
6657                     if ( base && (foldlen || uc < (U8*)(loceol))) {
6658                         I32 offset;
6659                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
6660                                              (U8 *) loceol, uscan,
6661                                              len, uvc, charid, foldlen,
6662                                              foldbuf, uniflags);
6663                         charcount++;
6664                         if (foldlen>0)
6665                             ST.longfold = TRUE;
6666                         if (charid &&
6667                              ( ((offset =
6668                               base + charid - 1 - trie->uniquecharcount)) >= 0)
6669 
6670                              && ((U32)offset < trie->lasttrans)
6671                              && trie->trans[offset].check == state)
6672                         {
6673                             state = trie->trans[offset].next;
6674                         }
6675                         else {
6676                             state = 0;
6677                         }
6678                         uc += len;
6679 
6680                     }
6681                     else {
6682                         state = 0;
6683                     }
6684                     DEBUG_TRIE_EXECUTE_r(
6685                         Perl_re_printf( aTHX_
6686                             "TRIE: Charid:%3x CP:%4" UVxf " After State: %4" UVxf "%s\n",
6687                             charid, uvc, (UV)state, PL_colors[5] );
6688                     );
6689                 }
6690                 if (!accepted)
6691                    sayNO;
6692 
6693                 /* calculate total number of accept states */
6694                 {
6695                     U16 w = ST.topword;
6696                     accepted = 0;
6697                     while (w) {
6698                         w = trie->wordinfo[w].prev;
6699                         accepted++;
6700                     }
6701                     ST.accepted = accepted;
6702                 }
6703 
6704                 DEBUG_EXECUTE_r(
6705                     Perl_re_exec_indentf( aTHX_  "%sTRIE: got %" IVdf " possible matches%s\n",
6706                         depth,
6707                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
6708                 );
6709                 goto trie_first_try; /* jump into the fail handler */
6710             }}
6711             NOT_REACHED; /* NOTREACHED */
6712 
6713         case TRIE_next_fail: /* we failed - try next alternative */
6714         {
6715             U8 *uc;
6716             if ( ST.jump ) {
6717                 /* undo any captures done in the tail part of a branch,
6718                  * e.g.
6719                  *    /(?:X(.)(.)|Y(.)).../
6720                  * where the trie just matches X then calls out to do the
6721                  * rest of the branch */
6722                 REGCP_UNWIND(ST.cp);
6723                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6724             }
6725             if (!--ST.accepted) {
6726                 DEBUG_EXECUTE_r({
6727                     Perl_re_exec_indentf( aTHX_  "%sTRIE failed...%s\n",
6728                         depth,
6729                         PL_colors[4],
6730                         PL_colors[5] );
6731                 });
6732                 sayNO_SILENT;
6733             }
6734             {
6735                 /* Find next-highest word to process.  Note that this code
6736                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
6737                 U16 min = 0;
6738                 U16 word;
6739                 U16 const nextword = ST.nextword;
6740                 reg_trie_wordinfo * const wordinfo
6741                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
6742                 for (word=ST.topword; word; word=wordinfo[word].prev) {
6743                     if (word > nextword && (!min || word < min))
6744                         min = word;
6745                 }
6746                 ST.nextword = min;
6747             }
6748 
6749           trie_first_try:
6750             if (do_cutgroup) {
6751                 do_cutgroup = 0;
6752                 no_final = 0;
6753             }
6754 
6755             if ( ST.jump ) {
6756                 ST.lastparen = rex->lastparen;
6757                 ST.lastcloseparen = rex->lastcloseparen;
6758                 REGCP_SET(ST.cp);
6759             }
6760 
6761             /* find start char of end of current word */
6762             {
6763                 U32 chars; /* how many chars to skip */
6764                 reg_trie_data * const trie
6765                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
6766 
6767                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
6768                             >=  ST.firstchars);
6769                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
6770                             - ST.firstchars;
6771                 uc = ST.firstpos;
6772 
6773                 if (ST.longfold) {
6774                     /* the hard option - fold each char in turn and find
6775                      * its folded length (which may be different */
6776                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
6777                     STRLEN foldlen;
6778                     STRLEN len;
6779                     UV uvc;
6780                     U8 *uscan;
6781 
6782                     while (chars) {
6783                         if (utf8_target) {
6784                             /* XXX This assumes the length is well-formed, as
6785                              * does the UTF8SKIP below */
6786                             uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
6787                                                     uniflags);
6788                             uc += len;
6789                         }
6790                         else {
6791                             uvc = *uc;
6792                             uc++;
6793                         }
6794                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
6795                         uscan = foldbuf;
6796                         while (foldlen) {
6797                             if (!--chars)
6798                                 break;
6799                             uvc = utf8n_to_uvchr(uscan, foldlen, &len,
6800                                                  uniflags);
6801                             uscan += len;
6802                             foldlen -= len;
6803                         }
6804                     }
6805                 }
6806                 else {
6807                     if (utf8_target)
6808                         uc = utf8_hop(uc, chars);
6809                     else
6810                         uc += chars;
6811                 }
6812             }
6813 
6814             scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
6815                             ? ST.jump[ST.nextword]
6816                             : NEXT_OFF(ST.me));
6817 
6818             DEBUG_EXECUTE_r({
6819                 Perl_re_exec_indentf( aTHX_  "%sTRIE matched word #%d, continuing%s\n",
6820                     depth,
6821                     PL_colors[4],
6822                     ST.nextword,
6823                     PL_colors[5]
6824                     );
6825             });
6826 
6827             if ( ST.accepted > 1 || has_cutgroup || ST.jump ) {
6828                 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc, loceol,
6829                                 script_run_begin);
6830                 NOT_REACHED; /* NOTREACHED */
6831             }
6832             /* only one choice left - just continue */
6833             DEBUG_EXECUTE_r({
6834                 AV *const trie_words
6835                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
6836                 SV ** const tmp = trie_words
6837                         ? av_fetch(trie_words, ST.nextword - 1, 0) : NULL;
6838                 SV *sv= tmp ? sv_newmortal() : NULL;
6839 
6840                 Perl_re_exec_indentf( aTHX_  "%sTRIE: only one match left, short-circuiting: #%d <%s>%s\n",
6841                     depth, PL_colors[4],
6842                     ST.nextword,
6843                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
6844                             PL_colors[0], PL_colors[1],
6845                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
6846                         )
6847                     : "not compiled under -Dr",
6848                     PL_colors[5] );
6849             });
6850 
6851             locinput = (char*)uc;
6852             continue; /* execute rest of RE */
6853             /* NOTREACHED */
6854         }
6855 #undef  ST
6856 
6857         case LEXACT_REQ8:
6858             if (! utf8_target) {
6859                 sayNO;
6860             }
6861             /* FALLTHROUGH */
6862 
6863         case LEXACT:
6864         {
6865             char *s;
6866 
6867             s = STRINGl(scan);
6868             ln = STR_LENl(scan);
6869             goto join_short_long_exact;
6870 
6871         case EXACTL:             /*  /abc/l       */
6872             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6873 
6874             /* Complete checking would involve going through every character
6875              * matched by the string to see if any is above latin1.  But the
6876              * comparision otherwise might very well be a fast assembly
6877              * language routine, and I (khw) don't think slowing things down
6878              * just to check for this warning is worth it.  So this just checks
6879              * the first character */
6880             if (utf8_target && UTF8_IS_ABOVE_LATIN1(*locinput)) {
6881                 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
6882             }
6883             goto do_exact;
6884         case EXACT_REQ8:
6885             if (! utf8_target) {
6886                 sayNO;
6887             }
6888             /* FALLTHROUGH */
6889 
6890         case EXACT:             /*  /abc/        */
6891           do_exact:
6892             s = STRINGs(scan);
6893             ln = STR_LENs(scan);
6894 
6895           join_short_long_exact:
6896             if (utf8_target != is_utf8_pat) {
6897                 /* The target and the pattern have differing utf8ness. */
6898                 char *l = locinput;
6899                 const char * const e = s + ln;
6900 
6901                 if (utf8_target) {
6902                     /* The target is utf8, the pattern is not utf8.
6903                      * Above-Latin1 code points can't match the pattern;
6904                      * invariants match exactly, and the other Latin1 ones need
6905                      * to be downgraded to a single byte in order to do the
6906                      * comparison.  (If we could be confident that the target
6907                      * is not malformed, this could be refactored to have fewer
6908                      * tests by just assuming that if the first bytes match, it
6909                      * is an invariant, but there are tests in the test suite
6910                      * dealing with (??{...}) which violate this) */
6911                     while (s < e) {
6912                         if (   l >= loceol
6913                             || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
6914                         {
6915                             sayNO;
6916                         }
6917                         if (UTF8_IS_INVARIANT(*(U8*)l)) {
6918                             if (*l != *s) {
6919                                 sayNO;
6920                             }
6921                             l++;
6922                         }
6923                         else {
6924                             if (EIGHT_BIT_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
6925                             {
6926                                 sayNO;
6927                             }
6928                             l += 2;
6929                         }
6930                         s++;
6931                     }
6932                 }
6933                 else {
6934                     /* The target is not utf8, the pattern is utf8. */
6935                     while (s < e) {
6936                         if (   l >= loceol
6937                             || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
6938                         {
6939                             sayNO;
6940                         }
6941                         if (UTF8_IS_INVARIANT(*(U8*)s)) {
6942                             if (*s != *l) {
6943                                 sayNO;
6944                             }
6945                             s++;
6946                         }
6947                         else {
6948                             if (EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
6949                             {
6950                                 sayNO;
6951                             }
6952                             s += 2;
6953                         }
6954                         l++;
6955                     }
6956                 }
6957                 locinput = l;
6958             }
6959             else {
6960                 /* The target and the pattern have the same utf8ness. */
6961                 /* Inline the first character, for speed. */
6962                 if (   loceol - locinput < ln
6963                     || UCHARAT(s) != nextbyte
6964                     || (ln > 1 && memNE(s, locinput, ln)))
6965                 {
6966                     sayNO;
6967                 }
6968                 locinput += ln;
6969             }
6970             break;
6971             }
6972 
6973         case EXACTFL:            /*  /abc/il      */
6974           {
6975             re_fold_t folder;
6976             const U8 * fold_array;
6977             const char * s;
6978             U32 fold_utf8_flags;
6979 
6980             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6981             folder = foldEQ_locale;
6982             fold_array = PL_fold_locale;
6983             fold_utf8_flags = FOLDEQ_LOCALE;
6984             goto do_exactf;
6985 
6986         case EXACTFLU8:           /*  /abc/il; but all 'abc' are above 255, so
6987                                       is effectively /u; hence to match, target
6988                                       must be UTF-8. */
6989             if (! utf8_target) {
6990                 sayNO;
6991             }
6992             fold_utf8_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
6993                                              | FOLDEQ_S2_FOLDS_SANE;
6994             folder = foldEQ_latin1_s2_folded;
6995             fold_array = PL_fold_latin1;
6996             goto do_exactf;
6997 
6998         case EXACTFU_REQ8:      /* /abc/iu with something in /abc/ > 255 */
6999             if (! utf8_target) {
7000                 sayNO;
7001             }
7002             assert(is_utf8_pat);
7003             fold_utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
7004             goto do_exactf;
7005 
7006         case EXACTFUP:          /*  /foo/iu, and something is problematic in
7007                                     'foo' so can't take shortcuts. */
7008             assert(! is_utf8_pat);
7009             folder = foldEQ_latin1;
7010             fold_array = PL_fold_latin1;
7011             fold_utf8_flags = 0;
7012             goto do_exactf;
7013 
7014         case EXACTFU:            /*  /abc/iu      */
7015             folder = foldEQ_latin1_s2_folded;
7016             fold_array = PL_fold_latin1;
7017             fold_utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
7018             goto do_exactf;
7019 
7020         case EXACTFAA_NO_TRIE:   /* This node only generated for non-utf8
7021                                    patterns */
7022             assert(! is_utf8_pat);
7023             /* FALLTHROUGH */
7024         case EXACTFAA:            /*  /abc/iaa     */
7025             folder = foldEQ_latin1_s2_folded;
7026             fold_array = PL_fold_latin1;
7027             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7028             if (is_utf8_pat || ! utf8_target) {
7029 
7030                 /* The possible presence of a MICRO SIGN in the pattern forbids
7031                  * us to view a non-UTF-8 pattern as folded when there is a
7032                  * UTF-8 target */
7033                 fold_utf8_flags |= FOLDEQ_S2_ALREADY_FOLDED
7034                                   |FOLDEQ_S2_FOLDS_SANE;
7035             }
7036             goto do_exactf;
7037 
7038 
7039         case EXACTF:             /*  /abc/i    This node only generated for
7040                                                non-utf8 patterns */
7041             assert(! is_utf8_pat);
7042             folder = foldEQ;
7043             fold_array = PL_fold;
7044             fold_utf8_flags = 0;
7045 
7046           do_exactf:
7047             s = STRINGs(scan);
7048             ln = STR_LENs(scan);
7049 
7050             if (   utf8_target
7051                 || is_utf8_pat
7052                 || state_num == EXACTFUP
7053                 || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE))
7054             {
7055               /* Either target or the pattern are utf8, or has the issue where
7056                * the fold lengths may differ. */
7057                 const char * const l = locinput;
7058                 char *e = loceol;
7059 
7060                 if (! foldEQ_utf8_flags(l, &e, 0,  utf8_target,
7061                                         s, 0,  ln, is_utf8_pat,fold_utf8_flags))
7062                 {
7063                     sayNO;
7064                 }
7065                 locinput = e;
7066                 break;
7067             }
7068 
7069             /* Neither the target nor the pattern are utf8 */
7070             if (UCHARAT(s) != nextbyte
7071                 && !NEXTCHR_IS_EOS
7072                 && UCHARAT(s) != fold_array[nextbyte])
7073             {
7074                 sayNO;
7075             }
7076             if (loceol - locinput < ln)
7077                 sayNO;
7078             if (ln > 1 && ! folder(locinput, s, ln))
7079                 sayNO;
7080             locinput += ln;
7081             break;
7082         }
7083 
7084         case NBOUNDL: /*  /\B/l  */
7085             to_complement = 1;
7086             /* FALLTHROUGH */
7087 
7088         case BOUNDL:  /*  /\b/l  */
7089         {
7090             bool b1, b2;
7091             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7092 
7093             if (FLAGS(scan) != TRADITIONAL_BOUND) {
7094                 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
7095                 goto boundu;
7096             }
7097 
7098             if (utf8_target) {
7099                 if (locinput == reginfo->strbeg)
7100                     b1 = isWORDCHAR_LC('\n');
7101                 else {
7102                     U8 *p = reghop3((U8*)locinput, -1,
7103                                     (U8*)(reginfo->strbeg));
7104                     b1 = isWORDCHAR_LC_utf8_safe(p, (U8*)(reginfo->strend));
7105                 }
7106                 b2 = (NEXTCHR_IS_EOS)
7107                     ? isWORDCHAR_LC('\n')
7108                     : isWORDCHAR_LC_utf8_safe((U8*) locinput,
7109                                               (U8*) reginfo->strend);
7110             }
7111             else { /* Here the string isn't utf8 */
7112                 b1 = (locinput == reginfo->strbeg)
7113                      ? isWORDCHAR_LC('\n')
7114                      : isWORDCHAR_LC(UCHARAT(locinput - 1));
7115                 b2 = (NEXTCHR_IS_EOS)
7116                     ? isWORDCHAR_LC('\n')
7117                     : isWORDCHAR_LC(nextbyte);
7118             }
7119             if (to_complement ^ (b1 == b2)) {
7120                 sayNO;
7121             }
7122             break;
7123         }
7124 
7125         case NBOUND:  /*  /\B/   */
7126             to_complement = 1;
7127             /* FALLTHROUGH */
7128 
7129         case BOUND:   /*  /\b/   */
7130             if (utf8_target) {
7131                 goto bound_utf8;
7132             }
7133             goto bound_ascii_match_only;
7134 
7135         case NBOUNDA: /*  /\B/a  */
7136             to_complement = 1;
7137             /* FALLTHROUGH */
7138 
7139         case BOUNDA:  /*  /\b/a  */
7140         {
7141             bool b1, b2;
7142 
7143           bound_ascii_match_only:
7144             /* Here the string isn't utf8, or is utf8 and only ascii characters
7145              * are to match \w.  In the latter case looking at the byte just
7146              * prior to the current one may be just the final byte of a
7147              * multi-byte character.  This is ok.  There are two cases:
7148              * 1) it is a single byte character, and then the test is doing
7149              *    just what it's supposed to.
7150              * 2) it is a multi-byte character, in which case the final byte is
7151              *    never mistakable for ASCII, and so the test will say it is
7152              *    not a word character, which is the correct answer. */
7153             b1 = (locinput == reginfo->strbeg)
7154                  ? isWORDCHAR_A('\n')
7155                  : isWORDCHAR_A(UCHARAT(locinput - 1));
7156             b2 = (NEXTCHR_IS_EOS)
7157                 ? isWORDCHAR_A('\n')
7158                 : isWORDCHAR_A(nextbyte);
7159             if (to_complement ^ (b1 == b2)) {
7160                 sayNO;
7161             }
7162             break;
7163         }
7164 
7165         case NBOUNDU: /*  /\B/u  */
7166             to_complement = 1;
7167             /* FALLTHROUGH */
7168 
7169         case BOUNDU:  /*  /\b/u  */
7170 
7171           boundu:
7172             if (UNLIKELY(reginfo->strbeg >= reginfo->strend)) {
7173                 match = FALSE;
7174             }
7175             else if (utf8_target) {
7176               bound_utf8:
7177                 switch((bound_type) FLAGS(scan)) {
7178                     case TRADITIONAL_BOUND:
7179                     {
7180                         bool b1, b2;
7181                         if (locinput == reginfo->strbeg) {
7182                             b1 = 0 /* isWORDCHAR_L1('\n') */;
7183                         }
7184                         else {
7185                             U8 *p = reghop3((U8*)locinput, -1,
7186                                             (U8*)(reginfo->strbeg));
7187 
7188                             b1 = isWORDCHAR_utf8_safe(p, (U8*) reginfo->strend);
7189                         }
7190                         b2 = (NEXTCHR_IS_EOS)
7191                             ? 0 /* isWORDCHAR_L1('\n') */
7192                             : isWORDCHAR_utf8_safe((U8*)locinput,
7193                                                    (U8*) reginfo->strend);
7194                         match = cBOOL(b1 != b2);
7195                         break;
7196                     }
7197                     case GCB_BOUND:
7198                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7199                             match = TRUE; /* GCB always matches at begin and
7200                                              end */
7201                         }
7202                         else {
7203                             /* Find the gcb values of previous and current
7204                              * chars, then see if is a break point */
7205                             match = isGCB(getGCB_VAL_UTF8(
7206                                                 reghop3((U8*)locinput,
7207                                                         -1,
7208                                                         (U8*)(reginfo->strbeg)),
7209                                                 (U8*) reginfo->strend),
7210                                           getGCB_VAL_UTF8((U8*) locinput,
7211                                                         (U8*) reginfo->strend),
7212                                           (U8*) reginfo->strbeg,
7213                                           (U8*) locinput,
7214                                           utf8_target);
7215                         }
7216                         break;
7217 
7218                     case LB_BOUND:
7219                         if (locinput == reginfo->strbeg) {
7220                             match = FALSE;
7221                         }
7222                         else if (NEXTCHR_IS_EOS) {
7223                             match = TRUE;
7224                         }
7225                         else {
7226                             match = isLB(getLB_VAL_UTF8(
7227                                                 reghop3((U8*)locinput,
7228                                                         -1,
7229                                                         (U8*)(reginfo->strbeg)),
7230                                                 (U8*) reginfo->strend),
7231                                           getLB_VAL_UTF8((U8*) locinput,
7232                                                         (U8*) reginfo->strend),
7233                                           (U8*) reginfo->strbeg,
7234                                           (U8*) locinput,
7235                                           (U8*) reginfo->strend,
7236                                           utf8_target);
7237                         }
7238                         break;
7239 
7240                     case SB_BOUND: /* Always matches at begin and end */
7241                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7242                             match = TRUE;
7243                         }
7244                         else {
7245                             match = isSB(getSB_VAL_UTF8(
7246                                                 reghop3((U8*)locinput,
7247                                                         -1,
7248                                                         (U8*)(reginfo->strbeg)),
7249                                                 (U8*) reginfo->strend),
7250                                           getSB_VAL_UTF8((U8*) locinput,
7251                                                         (U8*) reginfo->strend),
7252                                           (U8*) reginfo->strbeg,
7253                                           (U8*) locinput,
7254                                           (U8*) reginfo->strend,
7255                                           utf8_target);
7256                         }
7257                         break;
7258 
7259                     case WB_BOUND:
7260                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7261                             match = TRUE;
7262                         }
7263                         else {
7264                             match = isWB(WB_UNKNOWN,
7265                                          getWB_VAL_UTF8(
7266                                                 reghop3((U8*)locinput,
7267                                                         -1,
7268                                                         (U8*)(reginfo->strbeg)),
7269                                                 (U8*) reginfo->strend),
7270                                           getWB_VAL_UTF8((U8*) locinput,
7271                                                         (U8*) reginfo->strend),
7272                                           (U8*) reginfo->strbeg,
7273                                           (U8*) locinput,
7274                                           (U8*) reginfo->strend,
7275                                           utf8_target);
7276                         }
7277                         break;
7278                 }
7279             }
7280             else {  /* Not utf8 target */
7281                 switch((bound_type) FLAGS(scan)) {
7282                     case TRADITIONAL_BOUND:
7283                     {
7284                         bool b1, b2;
7285                         b1 = (locinput == reginfo->strbeg)
7286                             ? 0 /* isWORDCHAR_L1('\n') */
7287                             : isWORDCHAR_L1(UCHARAT(locinput - 1));
7288                         b2 = (NEXTCHR_IS_EOS)
7289                             ? 0 /* isWORDCHAR_L1('\n') */
7290                             : isWORDCHAR_L1(nextbyte);
7291                         match = cBOOL(b1 != b2);
7292                         break;
7293                     }
7294 
7295                     case GCB_BOUND:
7296                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7297                             match = TRUE; /* GCB always matches at begin and
7298                                              end */
7299                         }
7300                         else {  /* Only CR-LF combo isn't a GCB in 0-255
7301                                    range */
7302                             match =    UCHARAT(locinput - 1) != '\r'
7303                                     || UCHARAT(locinput) != '\n';
7304                         }
7305                         break;
7306 
7307                     case LB_BOUND:
7308                         if (locinput == reginfo->strbeg) {
7309                             match = FALSE;
7310                         }
7311                         else if (NEXTCHR_IS_EOS) {
7312                             match = TRUE;
7313                         }
7314                         else {
7315                             match = isLB(getLB_VAL_CP(UCHARAT(locinput -1)),
7316                                          getLB_VAL_CP(UCHARAT(locinput)),
7317                                          (U8*) reginfo->strbeg,
7318                                          (U8*) locinput,
7319                                          (U8*) reginfo->strend,
7320                                          utf8_target);
7321                         }
7322                         break;
7323 
7324                     case SB_BOUND: /* Always matches at begin and end */
7325                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7326                             match = TRUE;
7327                         }
7328                         else {
7329                             match = isSB(getSB_VAL_CP(UCHARAT(locinput -1)),
7330                                          getSB_VAL_CP(UCHARAT(locinput)),
7331                                          (U8*) reginfo->strbeg,
7332                                          (U8*) locinput,
7333                                          (U8*) reginfo->strend,
7334                                          utf8_target);
7335                         }
7336                         break;
7337 
7338                     case WB_BOUND:
7339                         if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7340                             match = TRUE;
7341                         }
7342                         else {
7343                             match = isWB(WB_UNKNOWN,
7344                                          getWB_VAL_CP(UCHARAT(locinput -1)),
7345                                          getWB_VAL_CP(UCHARAT(locinput)),
7346                                          (U8*) reginfo->strbeg,
7347                                          (U8*) locinput,
7348                                          (U8*) reginfo->strend,
7349                                          utf8_target);
7350                         }
7351                         break;
7352                 }
7353             }
7354 
7355             if (to_complement ^ ! match) {
7356                 sayNO;
7357             }
7358             break;
7359 
7360         case ANYOFPOSIXL:
7361         case ANYOFL:  /*  /[abc]/l      */
7362             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7363             CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(scan);
7364 
7365             /* FALLTHROUGH */
7366         case ANYOFD:  /*   /[abc]/d       */
7367         case ANYOF:  /*   /[abc]/       */
7368             if (NEXTCHR_IS_EOS || locinput >= loceol)
7369                 sayNO;
7370             if (  (! utf8_target || UTF8_IS_INVARIANT(*locinput))
7371                 && ! (ANYOF_FLAGS(scan) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP))
7372             {
7373                 if (! ANYOF_BITMAP_TEST(scan, * (U8 *) (locinput))) {
7374                     sayNO;
7375                 }
7376                 locinput++;
7377             }
7378             else {
7379                 if (!reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
7380                                                                    utf8_target))
7381                 {
7382                     sayNO;
7383                 }
7384                 goto increment_locinput;
7385             }
7386             break;
7387 
7388         case ANYOFM:
7389             if (   NEXTCHR_IS_EOS
7390                 || (UCHARAT(locinput) & FLAGS(scan)) != ARG(scan)
7391                 || locinput >= loceol)
7392             {
7393                 sayNO;
7394             }
7395             locinput++; /* ANYOFM is always single byte */
7396             break;
7397 
7398         case NANYOFM:
7399             if (   NEXTCHR_IS_EOS
7400                 || (UCHARAT(locinput) & FLAGS(scan)) == ARG(scan)
7401                 || locinput >= loceol)
7402             {
7403                 sayNO;
7404             }
7405             goto increment_locinput;
7406             break;
7407 
7408         case ANYOFH:
7409             if (   ! utf8_target
7410                 ||   NEXTCHR_IS_EOS
7411                 ||   ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8(*locinput)
7412                 || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
7413                                                                    utf8_target))
7414             {
7415                 sayNO;
7416             }
7417             goto increment_locinput;
7418             break;
7419 
7420         case ANYOFHb:
7421             if (   ! utf8_target
7422                 ||   NEXTCHR_IS_EOS
7423                 ||   ANYOF_FLAGS(scan) != (U8) *locinput
7424                 || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
7425                                                                   utf8_target))
7426             {
7427                 sayNO;
7428             }
7429             goto increment_locinput;
7430             break;
7431 
7432         case ANYOFHr:
7433             if (   ! utf8_target
7434                 ||   NEXTCHR_IS_EOS
7435                 || ! inRANGE((U8) NATIVE_UTF8_TO_I8(*locinput),
7436                              LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan)),
7437                              HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan)))
7438                 || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
7439                                                                    utf8_target))
7440             {
7441                 sayNO;
7442             }
7443             goto increment_locinput;
7444             break;
7445 
7446         case ANYOFHs:
7447             if (   ! utf8_target
7448                 ||   NEXTCHR_IS_EOS
7449                 ||   loceol - locinput < FLAGS(scan)
7450                 ||   memNE(locinput, ((struct regnode_anyofhs *) scan)->string, FLAGS(scan))
7451                 || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
7452                                                                    utf8_target))
7453             {
7454                 sayNO;
7455             }
7456             goto increment_locinput;
7457             break;
7458 
7459         case ANYOFR:
7460             if (NEXTCHR_IS_EOS) {
7461                 sayNO;
7462             }
7463 
7464             if (utf8_target) {
7465                 if (    ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8(*locinput)
7466                    || ! withinCOUNT(utf8_to_uvchr_buf((U8 *) locinput,
7467                                                 (U8 *) reginfo->strend,
7468                                                 NULL),
7469                                     ANYOFRbase(scan), ANYOFRdelta(scan)))
7470                 {
7471                     sayNO;
7472                 }
7473             }
7474             else {
7475                 if (! withinCOUNT((U8) *locinput,
7476                                   ANYOFRbase(scan), ANYOFRdelta(scan)))
7477                 {
7478                     sayNO;
7479                 }
7480             }
7481             goto increment_locinput;
7482             break;
7483 
7484         case ANYOFRb:
7485             if (NEXTCHR_IS_EOS) {
7486                 sayNO;
7487             }
7488 
7489             if (utf8_target) {
7490                 if (     ANYOF_FLAGS(scan) != (U8) *locinput
7491                     || ! withinCOUNT(utf8_to_uvchr_buf((U8 *) locinput,
7492                                                 (U8 *) reginfo->strend,
7493                                                 NULL),
7494                                      ANYOFRbase(scan), ANYOFRdelta(scan)))
7495                 {
7496                     sayNO;
7497                 }
7498             }
7499             else {
7500                 if (! withinCOUNT((U8) *locinput,
7501                                   ANYOFRbase(scan), ANYOFRdelta(scan)))
7502                 {
7503                     sayNO;
7504                 }
7505             }
7506             goto increment_locinput;
7507             break;
7508 
7509         /* The argument (FLAGS) to all the POSIX node types is the class number
7510          * */
7511 
7512         case NPOSIXL:   /* \W or [:^punct:] etc. under /l */
7513             to_complement = 1;
7514             /* FALLTHROUGH */
7515 
7516         case POSIXL:    /* \w or [:punct:] etc. under /l */
7517             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7518             if (NEXTCHR_IS_EOS || locinput >= loceol)
7519                 sayNO;
7520 
7521             /* Use isFOO_lc() for characters within Latin1.  (Note that
7522              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
7523              * wouldn't be invariant) */
7524             if (UTF8_IS_INVARIANT(nextbyte) || ! utf8_target) {
7525                 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextbyte)))) {
7526                     sayNO;
7527                 }
7528 
7529                 locinput++;
7530                 break;
7531             }
7532 
7533             if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
7534                 /* An above Latin-1 code point, or malformed */
7535                 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
7536                                                        reginfo->strend);
7537                 goto utf8_posix_above_latin1;
7538             }
7539 
7540             /* Here is a UTF-8 variant code point below 256 and the target is
7541              * UTF-8 */
7542             if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
7543                                             EIGHT_BIT_UTF8_TO_NATIVE(nextbyte,
7544                                             *(locinput + 1))))))
7545             {
7546                 sayNO;
7547             }
7548 
7549             goto increment_locinput;
7550 
7551         case NPOSIXD:   /* \W or [:^punct:] etc. under /d */
7552             to_complement = 1;
7553             /* FALLTHROUGH */
7554 
7555         case POSIXD:    /* \w or [:punct:] etc. under /d */
7556             if (utf8_target) {
7557                 goto utf8_posix;
7558             }
7559             goto posixa;
7560 
7561         case NPOSIXA:   /* \W or [:^punct:] etc. under /a */
7562 
7563             if (NEXTCHR_IS_EOS || locinput >= loceol) {
7564                 sayNO;
7565             }
7566 
7567             /* All UTF-8 variants match */
7568             if (! UTF8_IS_INVARIANT(nextbyte)) {
7569                 goto increment_locinput;
7570             }
7571 
7572             to_complement = 1;
7573             goto join_nposixa;
7574 
7575         case POSIXA:    /* \w or [:punct:] etc. under /a */
7576 
7577           posixa:
7578             /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
7579              * UTF-8, and also from NPOSIXA even in UTF-8 when the current
7580              * character is a single byte */
7581 
7582             if (NEXTCHR_IS_EOS || locinput >= loceol) {
7583                 sayNO;
7584             }
7585 
7586           join_nposixa:
7587 
7588             if (! (to_complement ^ cBOOL(_generic_isCC_A(nextbyte,
7589                                                                 FLAGS(scan)))))
7590             {
7591                 sayNO;
7592             }
7593 
7594             /* Here we are either not in utf8, or we matched a utf8-invariant,
7595              * so the next char is the next byte */
7596             locinput++;
7597             break;
7598 
7599         case NPOSIXU:   /* \W or [:^punct:] etc. under /u */
7600             to_complement = 1;
7601             /* FALLTHROUGH */
7602 
7603         case POSIXU:    /* \w or [:punct:] etc. under /u */
7604           utf8_posix:
7605             if (NEXTCHR_IS_EOS || locinput >= loceol) {
7606                 sayNO;
7607             }
7608 
7609             /* Use _generic_isCC() for characters within Latin1.  (Note that
7610              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
7611              * wouldn't be invariant) */
7612             if (UTF8_IS_INVARIANT(nextbyte) || ! utf8_target) {
7613                 if (! (to_complement ^ cBOOL(_generic_isCC(nextbyte,
7614                                                            FLAGS(scan)))))
7615                 {
7616                     sayNO;
7617                 }
7618                 locinput++;
7619             }
7620             else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
7621                 if (! (to_complement
7622                        ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(nextbyte,
7623                                                                *(locinput + 1)),
7624                                              FLAGS(scan)))))
7625                 {
7626                     sayNO;
7627                 }
7628                 locinput += 2;
7629             }
7630             else {  /* Handle above Latin-1 code points */
7631               utf8_posix_above_latin1:
7632                 classnum = (_char_class_number) FLAGS(scan);
7633                 switch (classnum) {
7634                     default:
7635                         if (! (to_complement
7636                            ^ cBOOL(_invlist_contains_cp(
7637                                       PL_XPosix_ptrs[classnum],
7638                                       utf8_to_uvchr_buf((U8 *) locinput,
7639                                                         (U8 *) reginfo->strend,
7640                                                         NULL)))))
7641                         {
7642                             sayNO;
7643                         }
7644                         break;
7645                     case _CC_ENUM_SPACE:
7646                         if (! (to_complement
7647                                     ^ cBOOL(is_XPERLSPACE_high(locinput))))
7648                         {
7649                             sayNO;
7650                         }
7651                         break;
7652                     case _CC_ENUM_BLANK:
7653                         if (! (to_complement
7654                                         ^ cBOOL(is_HORIZWS_high(locinput))))
7655                         {
7656                             sayNO;
7657                         }
7658                         break;
7659                     case _CC_ENUM_XDIGIT:
7660                         if (! (to_complement
7661                                         ^ cBOOL(is_XDIGIT_high(locinput))))
7662                         {
7663                             sayNO;
7664                         }
7665                         break;
7666                     case _CC_ENUM_VERTSPACE:
7667                         if (! (to_complement
7668                                         ^ cBOOL(is_VERTWS_high(locinput))))
7669                         {
7670                             sayNO;
7671                         }
7672                         break;
7673                     case _CC_ENUM_CNTRL:    /* These can't match above Latin1 */
7674                     case _CC_ENUM_ASCII:
7675                         if (! to_complement) {
7676                             sayNO;
7677                         }
7678                         break;
7679                 }
7680                 locinput += UTF8_SAFE_SKIP(locinput, reginfo->strend);
7681             }
7682             break;
7683 
7684         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
7685                        a Unicode extended Grapheme Cluster */
7686             if (NEXTCHR_IS_EOS || locinput >= loceol)
7687                 sayNO;
7688             if  (! utf8_target) {
7689 
7690                 /* Match either CR LF  or '.', as all the other possibilities
7691                  * require utf8 */
7692                 locinput++;	    /* Match the . or CR */
7693                 if (nextbyte == '\r' /* And if it was CR, and the next is LF,
7694                                        match the LF */
7695                     && locinput <  loceol
7696                     && UCHARAT(locinput) == '\n')
7697                 {
7698                     locinput++;
7699                 }
7700             }
7701             else {
7702 
7703                 /* Get the gcb type for the current character */
7704                 GCB_enum prev_gcb = getGCB_VAL_UTF8((U8*) locinput,
7705                                                        (U8*) reginfo->strend);
7706 
7707                 /* Then scan through the input until we get to the first
7708                  * character whose type is supposed to be a gcb with the
7709                  * current character.  (There is always a break at the
7710                  * end-of-input) */
7711                 locinput += UTF8SKIP(locinput);
7712                 while (locinput < loceol) {
7713                     GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput,
7714                                                          (U8*) reginfo->strend);
7715                     if (isGCB(prev_gcb, cur_gcb,
7716                               (U8*) reginfo->strbeg, (U8*) locinput,
7717                               utf8_target))
7718                     {
7719                         break;
7720                     }
7721 
7722                     prev_gcb = cur_gcb;
7723                     locinput += UTF8SKIP(locinput);
7724                 }
7725 
7726 
7727             }
7728             break;
7729 
7730         case REFFLN:  /*  /\g{name}/il  */
7731         {   /* The capture buffer cases.  The ones beginning with N for the
7732                named buffers just convert to the equivalent numbered and
7733                pretend they were called as the corresponding numbered buffer
7734                op.  */
7735             /* don't initialize these in the declaration, it makes C++
7736                unhappy */
7737             const char *s;
7738             char type;
7739             re_fold_t folder;
7740             const U8 *fold_array;
7741             UV utf8_fold_flags;
7742 
7743             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7744             folder = foldEQ_locale;
7745             fold_array = PL_fold_locale;
7746             type = REFFL;
7747             utf8_fold_flags = FOLDEQ_LOCALE;
7748             goto do_nref;
7749 
7750         case REFFAN:  /*  /\g{name}/iaa  */
7751             folder = foldEQ_latin1;
7752             fold_array = PL_fold_latin1;
7753             type = REFFA;
7754             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7755             goto do_nref;
7756 
7757         case REFFUN:  /*  /\g{name}/iu  */
7758             folder = foldEQ_latin1;
7759             fold_array = PL_fold_latin1;
7760             type = REFFU;
7761             utf8_fold_flags = 0;
7762             goto do_nref;
7763 
7764         case REFFN:  /*  /\g{name}/i  */
7765             folder = foldEQ;
7766             fold_array = PL_fold;
7767             type = REFF;
7768             utf8_fold_flags = 0;
7769             goto do_nref;
7770 
7771         case REFN:  /*  /\g{name}/   */
7772             type = REF;
7773             folder = NULL;
7774             fold_array = NULL;
7775             utf8_fold_flags = 0;
7776           do_nref:
7777 
7778             /* For the named back references, find the corresponding buffer
7779              * number */
7780             n = reg_check_named_buff_matched(rex,scan);
7781 
7782             if ( ! n ) {
7783                 sayNO;
7784             }
7785             goto do_nref_ref_common;
7786 
7787         case REFFL:  /*  /\1/il  */
7788             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7789             folder = foldEQ_locale;
7790             fold_array = PL_fold_locale;
7791             utf8_fold_flags = FOLDEQ_LOCALE;
7792             goto do_ref;
7793 
7794         case REFFA:  /*  /\1/iaa  */
7795             folder = foldEQ_latin1;
7796             fold_array = PL_fold_latin1;
7797             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7798             goto do_ref;
7799 
7800         case REFFU:  /*  /\1/iu  */
7801             folder = foldEQ_latin1;
7802             fold_array = PL_fold_latin1;
7803             utf8_fold_flags = 0;
7804             goto do_ref;
7805 
7806         case REFF:  /*  /\1/i  */
7807             folder = foldEQ;
7808             fold_array = PL_fold;
7809             utf8_fold_flags = 0;
7810             goto do_ref;
7811 
7812         case REF:  /*  /\1/    */
7813             folder = NULL;
7814             fold_array = NULL;
7815             utf8_fold_flags = 0;
7816 
7817           do_ref:
7818             type = OP(scan);
7819             n = ARG(scan);  /* which paren pair */
7820 
7821           do_nref_ref_common:
7822             ln = rex->offs[n].start;
7823             endref = rex->offs[n].end;
7824             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
7825             if (rex->lastparen < n || ln == -1 || endref == -1)
7826                 sayNO;			/* Do not match unless seen CLOSEn. */
7827             if (ln == endref)
7828                 break;
7829 
7830             s = reginfo->strbeg + ln;
7831             if (type != REF	/* REF can do byte comparison */
7832                 && (utf8_target || type == REFFU || type == REFFL))
7833             {
7834                 char * limit = loceol;
7835 
7836                 /* This call case insensitively compares the entire buffer
7837                     * at s, with the current input starting at locinput, but
7838                     * not going off the end given by loceol, and
7839                     * returns in <limit> upon success, how much of the
7840                     * current input was matched */
7841                 if (! foldEQ_utf8_flags(s, NULL, endref - ln, utf8_target,
7842                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
7843                 {
7844                     sayNO;
7845                 }
7846                 locinput = limit;
7847                 break;
7848             }
7849 
7850             /* Not utf8:  Inline the first character, for speed. */
7851             if ( ! NEXTCHR_IS_EOS
7852                 && locinput < loceol
7853                 && UCHARAT(s) != nextbyte
7854                 && (   type == REF
7855                     || UCHARAT(s) != fold_array[nextbyte]))
7856             {
7857                 sayNO;
7858             }
7859             ln = endref - ln;
7860             if (locinput + ln > loceol)
7861                 sayNO;
7862             if (ln > 1 && (type == REF
7863                            ? memNE(s, locinput, ln)
7864                            : ! folder(locinput, s, ln)))
7865                 sayNO;
7866             locinput += ln;
7867             break;
7868         }
7869 
7870         case NOTHING: /* null op; e.g. the 'nothing' following
7871                        * the '*' in m{(a+|b)*}' */
7872             break;
7873         case TAIL: /* placeholder while compiling (A|B|C) */
7874             break;
7875 
7876 #undef  ST
7877 #define ST st->u.eval
7878 #define CUR_EVAL cur_eval->u.eval
7879 
7880         {
7881             SV *ret;
7882             REGEXP *re_sv;
7883             regexp *re;
7884             regexp_internal *rei;
7885             regnode *startpoint;
7886             U32 arg;
7887 
7888         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
7889             arg= (U32)ARG(scan);
7890             if (cur_eval && cur_eval->locinput == locinput) {
7891                 if ( ++nochange_depth > max_nochange_depth )
7892                     Perl_croak(aTHX_
7893                         "Pattern subroutine nesting without pos change"
7894                         " exceeded limit in regex");
7895             } else {
7896                 nochange_depth = 0;
7897             }
7898             re_sv = rex_sv;
7899             re = rex;
7900             rei = rexi;
7901             startpoint = scan + ARG2L(scan);
7902             EVAL_CLOSE_PAREN_SET( st, arg );
7903             /* Detect infinite recursion
7904              *
7905              * A pattern like /(?R)foo/ or /(?<x>(?&y)foo)(?<y>(?&x)bar)/
7906              * or "a"=~/(.(?2))((?<=(?=(?1)).))/ could recurse forever.
7907              * So we track the position in the string we are at each time
7908              * we recurse and if we try to enter the same routine twice from
7909              * the same position we throw an error.
7910              */
7911             if ( rex->recurse_locinput[arg] == locinput ) {
7912                 /* FIXME: we should show the regop that is failing as part
7913                  * of the error message. */
7914                 Perl_croak(aTHX_ "Infinite recursion in regex");
7915             } else {
7916                 ST.prev_recurse_locinput= rex->recurse_locinput[arg];
7917                 rex->recurse_locinput[arg]= locinput;
7918 
7919                 DEBUG_r({
7920                     DECLARE_AND_GET_RE_DEBUG_FLAGS;
7921                     DEBUG_STACK_r({
7922                         Perl_re_exec_indentf( aTHX_
7923                             "entering GOSUB, prev_recurse_locinput=%p recurse_locinput[%d]=%p\n",
7924                             depth, ST.prev_recurse_locinput, arg, rex->recurse_locinput[arg]
7925                         );
7926                     });
7927                 });
7928             }
7929 
7930             /* Save all the positions seen so far. */
7931             ST.cp = regcppush(rex, 0, maxopenparen);
7932             REGCP_SET(ST.lastcp);
7933 
7934             /* and then jump to the code we share with EVAL */
7935             goto eval_recurse_doit;
7936             /* NOTREACHED */
7937 
7938         case EVAL:  /*   /(?{...})B/   /(??{A})B/  and  /(?(?{...})X|Y)B/   */
7939             if (logical == 2 && cur_eval && cur_eval->locinput==locinput) {
7940                 if ( ++nochange_depth > max_nochange_depth )
7941                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
7942             } else {
7943                 nochange_depth = 0;
7944             }
7945             {
7946                 /* execute the code in the {...} */
7947 
7948                 dSP;
7949                 IV before;
7950                 OP * const oop = PL_op;
7951                 COP * const ocurcop = PL_curcop;
7952                 OP *nop;
7953                 CV *newcv;
7954 
7955                 /* save *all* paren positions */
7956                 regcppush(rex, 0, maxopenparen);
7957                 REGCP_SET(ST.lastcp);
7958 
7959                 if (!caller_cv)
7960                     caller_cv = find_runcv(NULL);
7961 
7962                 n = ARG(scan);
7963 
7964                 if (rexi->data->what[n] == 'r') { /* code from an external qr */
7965                     newcv = (ReANY(
7966                                     (REGEXP*)(rexi->data->data[n])
7967                             ))->qr_anoncv;
7968                     nop = (OP*)rexi->data->data[n+1];
7969                 }
7970                 else if (rexi->data->what[n] == 'l') { /* literal code */
7971                     newcv = caller_cv;
7972                     nop = (OP*)rexi->data->data[n];
7973                     assert(CvDEPTH(newcv));
7974                 }
7975                 else {
7976                     /* literal with own CV */
7977                     assert(rexi->data->what[n] == 'L');
7978                     newcv = rex->qr_anoncv;
7979                     nop = (OP*)rexi->data->data[n];
7980                 }
7981 
7982                 /* Some notes about MULTICALL and the context and save stacks.
7983                  *
7984                  * In something like
7985                  *   /...(?{ my $x)}...(?{ my $y)}...(?{ my $z)}.../
7986                  * since codeblocks don't introduce a new scope (so that
7987                  * local() etc accumulate), at the end of a successful
7988                  * match there will be a SAVEt_CLEARSV on the savestack
7989                  * for each of $x, $y, $z. If the three code blocks above
7990                  * happen to have come from different CVs (e.g. via
7991                  * embedded qr//s), then we must ensure that during any
7992                  * savestack unwinding, PL_comppad always points to the
7993                  * right pad at each moment. We achieve this by
7994                  * interleaving SAVEt_COMPPAD's on the savestack whenever
7995                  * there is a change of pad.
7996                  * In theory whenever we call a code block, we should
7997                  * push a CXt_SUB context, then pop it on return from
7998                  * that code block. This causes a bit of an issue in that
7999                  * normally popping a context also clears the savestack
8000                  * back to cx->blk_oldsaveix, but here we specifically
8001                  * don't want to clear the save stack on exit from the
8002                  * code block.
8003                  * Also for efficiency we don't want to keep pushing and
8004                  * popping the single SUB context as we backtrack etc.
8005                  * So instead, we push a single context the first time
8006                  * we need, it, then hang onto it until the end of this
8007                  * function. Whenever we encounter a new code block, we
8008                  * update the CV etc if that's changed. During the times
8009                  * in this function where we're not executing a code
8010                  * block, having the SUB context still there is a bit
8011                  * naughty - but we hope that no-one notices.
8012                  * When the SUB context is initially pushed, we fake up
8013                  * cx->blk_oldsaveix to be as if we'd pushed this context
8014                  * on first entry to S_regmatch rather than at some random
8015                  * point during the regexe execution. That way if we
8016                  * croak, popping the context stack will ensure that
8017                  * *everything* SAVEd by this function is undone and then
8018                  * the context popped, rather than e.g., popping the
8019                  * context (and restoring the original PL_comppad) then
8020                  * popping more of the savestack and restoring a bad
8021                  * PL_comppad.
8022                  */
8023 
8024                 /* If this is the first EVAL, push a MULTICALL. On
8025                  * subsequent calls, if we're executing a different CV, or
8026                  * if PL_comppad has got messed up from backtracking
8027                  * through SAVECOMPPADs, then refresh the context.
8028                  */
8029                 if (newcv != last_pushed_cv || PL_comppad != last_pad)
8030                 {
8031                     U8 flags = (CXp_SUB_RE |
8032                                 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
8033                     SAVECOMPPAD();
8034                     if (last_pushed_cv) {
8035                         CHANGE_MULTICALL_FLAGS(newcv, flags);
8036                     }
8037                     else {
8038                         PUSH_MULTICALL_FLAGS(newcv, flags);
8039                     }
8040                     /* see notes above */
8041                     CX_CUR()->blk_oldsaveix = orig_savestack_ix;
8042 
8043                     last_pushed_cv = newcv;
8044                 }
8045                 else {
8046                     /* these assignments are just to silence compiler
8047                      * warnings */
8048                     multicall_cop = NULL;
8049                 }
8050                 last_pad = PL_comppad;
8051 
8052                 /* the initial nextstate you would normally execute
8053                  * at the start of an eval (which would cause error
8054                  * messages to come from the eval), may be optimised
8055                  * away from the execution path in the regex code blocks;
8056                  * so manually set PL_curcop to it initially */
8057                 {
8058                     OP *o = cUNOPx(nop)->op_first;
8059                     assert(o->op_type == OP_NULL);
8060                     if (o->op_targ == OP_SCOPE) {
8061                         o = cUNOPo->op_first;
8062                     }
8063                     else {
8064                         assert(o->op_targ == OP_LEAVE);
8065                         o = cUNOPo->op_first;
8066                         assert(o->op_type == OP_ENTER);
8067                         o = OpSIBLING(o);
8068                     }
8069 
8070                     if (o->op_type != OP_STUB) {
8071                         assert(    o->op_type == OP_NEXTSTATE
8072                                 || o->op_type == OP_DBSTATE
8073                                 || (o->op_type == OP_NULL
8074                                     &&  (  o->op_targ == OP_NEXTSTATE
8075                                         || o->op_targ == OP_DBSTATE
8076                                         )
8077                                     )
8078                         );
8079                         PL_curcop = (COP*)o;
8080                     }
8081                 }
8082                 nop = nop->op_next;
8083 
8084                 DEBUG_STATE_r( Perl_re_printf( aTHX_
8085                     "  re EVAL PL_op=0x%" UVxf "\n", PTR2UV(nop)) );
8086 
8087                 rex->offs[0].end = locinput - reginfo->strbeg;
8088                 if (reginfo->info_aux_eval->pos_magic)
8089                     MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic,
8090                                   reginfo->sv, reginfo->strbeg,
8091                                   locinput - reginfo->strbeg);
8092 
8093                 if (sv_yes_mark) {
8094                     SV *sv_mrk = get_sv("REGMARK", 1);
8095                     sv_setsv(sv_mrk, sv_yes_mark);
8096                 }
8097 
8098                 /* we don't use MULTICALL here as we want to call the
8099                  * first op of the block of interest, rather than the
8100                  * first op of the sub. Also, we don't want to free
8101                  * the savestack frame */
8102                 before = (IV)(SP-PL_stack_base);
8103                 PL_op = nop;
8104                 CALLRUNOPS(aTHX);			/* Scalar context. */
8105                 SPAGAIN;
8106                 if ((IV)(SP-PL_stack_base) == before)
8107                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
8108                 else {
8109                     ret = POPs;
8110                     PUTBACK;
8111                 }
8112 
8113                 /* before restoring everything, evaluate the returned
8114                  * value, so that 'uninit' warnings don't use the wrong
8115                  * PL_op or pad. Also need to process any magic vars
8116                  * (e.g. $1) *before* parentheses are restored */
8117 
8118                 PL_op = NULL;
8119 
8120                 re_sv = NULL;
8121                 if (logical == 0) {       /*   (?{})/   */
8122                     SV *replsv = save_scalar(PL_replgv);
8123                     sv_setsv(replsv, ret); /* $^R */
8124                     SvSETMAGIC(replsv);
8125                 }
8126                 else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
8127                     sw = cBOOL(SvTRUE_NN(ret));
8128                     logical = 0;
8129                 }
8130                 else {                   /*  /(??{})  */
8131                     /*  if its overloaded, let the regex compiler handle
8132                      *  it; otherwise extract regex, or stringify  */
8133                     if (SvGMAGICAL(ret))
8134                         ret = sv_mortalcopy(ret);
8135                     if (!SvAMAGIC(ret)) {
8136                         SV *sv = ret;
8137                         if (SvROK(sv))
8138                             sv = SvRV(sv);
8139                         if (SvTYPE(sv) == SVt_REGEXP)
8140                             re_sv = (REGEXP*) sv;
8141                         else if (SvSMAGICAL(ret)) {
8142                             MAGIC *mg = mg_find(ret, PERL_MAGIC_qr);
8143                             if (mg)
8144                                 re_sv = (REGEXP *) mg->mg_obj;
8145                         }
8146 
8147                         /* force any undef warnings here */
8148                         if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) {
8149                             ret = sv_mortalcopy(ret);
8150                             (void) SvPV_force_nolen(ret);
8151                         }
8152                     }
8153 
8154                 }
8155 
8156                 /* *** Note that at this point we don't restore
8157                  * PL_comppad, (or pop the CxSUB) on the assumption it may
8158                  * be used again soon. This is safe as long as nothing
8159                  * in the regexp code uses the pad ! */
8160                 PL_op = oop;
8161                 PL_curcop = ocurcop;
8162                 regcp_restore(rex, ST.lastcp, &maxopenparen);
8163                 PL_curpm_under = PL_curpm;
8164                 PL_curpm = PL_reg_curpm;
8165 
8166                 if (logical != 2) {
8167                     PUSH_STATE_GOTO(EVAL_B, next, locinput, loceol,
8168                                     script_run_begin);
8169                     /* NOTREACHED */
8170                 }
8171             }
8172 
8173                 /* only /(??{})/  from now on */
8174                 logical = 0;
8175                 {
8176                     /* extract RE object from returned value; compiling if
8177                      * necessary */
8178 
8179                     if (re_sv) {
8180                         re_sv = reg_temp_copy(NULL, re_sv);
8181                     }
8182                     else {
8183                         U32 pm_flags = 0;
8184 
8185                         if (SvUTF8(ret) && IN_BYTES) {
8186                             /* In use 'bytes': make a copy of the octet
8187                              * sequence, but without the flag on */
8188                             STRLEN len;
8189                             const char *const p = SvPV(ret, len);
8190                             ret = newSVpvn_flags(p, len, SVs_TEMP);
8191                         }
8192                         if (rex->intflags & PREGf_USE_RE_EVAL)
8193                             pm_flags |= PMf_USE_RE_EVAL;
8194 
8195                         /* if we got here, it should be an engine which
8196                          * supports compiling code blocks and stuff */
8197                         assert(rex->engine && rex->engine->op_comp);
8198                         assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
8199                         re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
8200                                     rex->engine, NULL, NULL,
8201                                     /* copy /msixn etc to inner pattern */
8202                                     ARG2L(scan),
8203                                     pm_flags);
8204 
8205                         if (!(SvFLAGS(ret)
8206                               & (SVs_TEMP | SVs_GMG | SVf_ROK))
8207                          && (!SvPADTMP(ret) || SvREADONLY(ret))) {
8208                             /* This isn't a first class regexp. Instead, it's
8209                                caching a regexp onto an existing, Perl visible
8210                                scalar.  */
8211                             sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
8212                         }
8213                     }
8214                     SAVEFREESV(re_sv);
8215                     re = ReANY(re_sv);
8216                 }
8217                 RXp_MATCH_COPIED_off(re);
8218                 re->subbeg = rex->subbeg;
8219                 re->sublen = rex->sublen;
8220                 re->suboffset = rex->suboffset;
8221                 re->subcoffset = rex->subcoffset;
8222                 re->lastparen = 0;
8223                 re->lastcloseparen = 0;
8224                 rei = RXi_GET(re);
8225                 DEBUG_EXECUTE_r(
8226                     debug_start_match(re_sv, utf8_target, locinput,
8227                                     reginfo->strend, "EVAL/GOSUB: Matching embedded");
8228                 );
8229                 startpoint = rei->program + 1;
8230                 EVAL_CLOSE_PAREN_CLEAR(st); /* ST.close_paren = 0;
8231                                              * close_paren only for GOSUB */
8232                 ST.prev_recurse_locinput= NULL; /* only used for GOSUB */
8233                 /* Save all the seen positions so far. */
8234                 ST.cp = regcppush(rex, 0, maxopenparen);
8235                 REGCP_SET(ST.lastcp);
8236                 /* and set maxopenparen to 0, since we are starting a "fresh" match */
8237                 maxopenparen = 0;
8238                 /* run the pattern returned from (??{...}) */
8239 
8240               eval_recurse_doit: /* Share code with GOSUB below this line
8241                             * At this point we expect the stack context to be
8242                             * set up correctly */
8243 
8244                 /* invalidate the S-L poscache. We're now executing a
8245                  * different set of WHILEM ops (and their associated
8246                  * indexes) against the same string, so the bits in the
8247                  * cache are meaningless. Setting maxiter to zero forces
8248                  * the cache to be invalidated and zeroed before reuse.
8249                  * XXX This is too dramatic a measure. Ideally we should
8250                  * save the old cache and restore when running the outer
8251                  * pattern again */
8252                 reginfo->poscache_maxiter = 0;
8253 
8254                 /* the new regexp might have a different is_utf8_pat than we do */
8255                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
8256 
8257                 ST.prev_rex = rex_sv;
8258                 ST.prev_curlyx = cur_curlyx;
8259                 rex_sv = re_sv;
8260                 SET_reg_curpm(rex_sv);
8261                 rex = re;
8262                 rexi = rei;
8263                 cur_curlyx = NULL;
8264                 ST.B = next;
8265                 ST.prev_eval = cur_eval;
8266                 cur_eval = st;
8267                 /* now continue from first node in postoned RE */
8268                 PUSH_YES_STATE_GOTO(EVAL_postponed_AB, startpoint, locinput,
8269                                     loceol, script_run_begin);
8270                 NOT_REACHED; /* NOTREACHED */
8271         }
8272 
8273         case EVAL_postponed_AB: /* cleanup after a successful (??{A})B */
8274             /* note: this is called twice; first after popping B, then A */
8275             DEBUG_STACK_r({
8276                 Perl_re_exec_indentf( aTHX_  "EVAL_AB cur_eval=%p prev_eval=%p\n",
8277                     depth, cur_eval, ST.prev_eval);
8278             });
8279 
8280 #define SET_RECURSE_LOCINPUT(STR,VAL)\
8281             if ( cur_eval && CUR_EVAL.close_paren ) {\
8282                 DEBUG_STACK_r({ \
8283                     Perl_re_exec_indentf( aTHX_  STR " GOSUB%d ce=%p recurse_locinput=%p\n",\
8284                         depth,    \
8285                         CUR_EVAL.close_paren - 1,\
8286                         cur_eval, \
8287                         VAL);     \
8288                 });               \
8289                 rex->recurse_locinput[CUR_EVAL.close_paren - 1] = VAL;\
8290             }
8291 
8292             SET_RECURSE_LOCINPUT("EVAL_AB[before]", CUR_EVAL.prev_recurse_locinput);
8293 
8294             rex_sv = ST.prev_rex;
8295             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
8296             SET_reg_curpm(rex_sv);
8297             rex = ReANY(rex_sv);
8298             rexi = RXi_GET(rex);
8299             {
8300                 /* preserve $^R across LEAVE's. See Bug 121070. */
8301                 SV *save_sv= GvSV(PL_replgv);
8302                 SV *replsv;
8303                 SvREFCNT_inc(save_sv);
8304                 regcpblow(ST.cp); /* LEAVE in disguise */
8305                 /* don't move this initialization up */
8306                 replsv = GvSV(PL_replgv);
8307                 sv_setsv(replsv, save_sv);
8308                 SvSETMAGIC(replsv);
8309                 SvREFCNT_dec(save_sv);
8310             }
8311             cur_eval = ST.prev_eval;
8312             cur_curlyx = ST.prev_curlyx;
8313 
8314             /* Invalidate cache. See "invalidate" comment above. */
8315             reginfo->poscache_maxiter = 0;
8316             if ( nochange_depth )
8317                 nochange_depth--;
8318 
8319             SET_RECURSE_LOCINPUT("EVAL_AB[after]", cur_eval->locinput);
8320             sayYES;
8321 
8322 
8323         case EVAL_B_fail: /* unsuccessful B in (?{...})B */
8324             REGCP_UNWIND(ST.lastcp);
8325             sayNO;
8326 
8327         case EVAL_postponed_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
8328             /* note: this is called twice; first after popping B, then A */
8329             DEBUG_STACK_r({
8330                 Perl_re_exec_indentf( aTHX_  "EVAL_AB_fail cur_eval=%p prev_eval=%p\n",
8331                     depth, cur_eval, ST.prev_eval);
8332             });
8333 
8334             SET_RECURSE_LOCINPUT("EVAL_AB_fail[before]", CUR_EVAL.prev_recurse_locinput);
8335 
8336             rex_sv = ST.prev_rex;
8337             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
8338             SET_reg_curpm(rex_sv);
8339             rex = ReANY(rex_sv);
8340             rexi = RXi_GET(rex);
8341 
8342             REGCP_UNWIND(ST.lastcp);
8343             regcppop(rex, &maxopenparen);
8344             cur_eval = ST.prev_eval;
8345             cur_curlyx = ST.prev_curlyx;
8346 
8347             /* Invalidate cache. See "invalidate" comment above. */
8348             reginfo->poscache_maxiter = 0;
8349             if ( nochange_depth )
8350                 nochange_depth--;
8351 
8352             SET_RECURSE_LOCINPUT("EVAL_AB_fail[after]", cur_eval->locinput);
8353             sayNO_SILENT;
8354 #undef ST
8355 
8356         case OPEN: /*  (  */
8357             n = ARG(scan);  /* which paren pair */
8358             rex->offs[n].start_tmp = locinput - reginfo->strbeg;
8359             if (n > maxopenparen)
8360                 maxopenparen = n;
8361             DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
8362                 "OPEN: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf " tmp; maxopenparen=%" UVuf "\n",
8363                 depth,
8364                 PTR2UV(rex),
8365                 PTR2UV(rex->offs),
8366                 (UV)n,
8367                 (IV)rex->offs[n].start_tmp,
8368                 (UV)maxopenparen
8369             ));
8370             lastopen = n;
8371             break;
8372 
8373         case SROPEN: /*  (*SCRIPT_RUN:  */
8374             script_run_begin = (U8 *) locinput;
8375             break;
8376 
8377 
8378         case CLOSE:  /*  )  */
8379             n = ARG(scan);  /* which paren pair */
8380             CLOSE_CAPTURE(n, rex->offs[n].start_tmp,
8381                              locinput - reginfo->strbeg);
8382             if ( EVAL_CLOSE_PAREN_IS( cur_eval, n ) )
8383                 goto fake_end;
8384 
8385             break;
8386 
8387         case SRCLOSE:  /*  (*SCRIPT_RUN: ... )   */
8388 
8389             if (! isSCRIPT_RUN(script_run_begin, (U8 *) locinput, utf8_target))
8390             {
8391                 sayNO;
8392             }
8393 
8394             break;
8395 
8396 
8397         case ACCEPT:  /*  (*ACCEPT)  */
8398             is_accepted = true;
8399             if (scan->flags)
8400                 sv_yes_mark = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8401             utmp = (U32)ARG2L(scan);
8402 
8403             if ( utmp ) {
8404                 regnode *cursor;
8405                 for (
8406                     cursor = scan;
8407                     cursor && ( OP(cursor) != END );
8408                     cursor = ( PL_regkind[ OP(cursor) ] == END )
8409                              ? NEXTOPER(cursor)
8410                              : regnext(cursor)
8411                 ){
8412                     if ( OP(cursor) != CLOSE )
8413                         continue;
8414 
8415                     n = ARG(cursor);
8416 
8417                     if ( n > lastopen ) /* might be OPEN/CLOSE in the way */
8418                         continue;       /* so skip this one */
8419 
8420                     CLOSE_CAPTURE(n, rex->offs[n].start_tmp,
8421                                      locinput - reginfo->strbeg);
8422 
8423                     if ( n == utmp || EVAL_CLOSE_PAREN_IS(cur_eval, n) )
8424                         break;
8425                 }
8426             }
8427             goto fake_end;
8428             /* NOTREACHED */
8429 
8430         case GROUPP:  /*  (?(1))  */
8431             n = ARG(scan);  /* which paren pair */
8432             sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
8433             break;
8434 
8435         case GROUPPN:  /*  (?(<name>))  */
8436             /* reg_check_named_buff_matched returns 0 for no match */
8437             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
8438             break;
8439 
8440         case INSUBP:   /*  (?(R))  */
8441             n = ARG(scan);
8442             /* this does not need to use EVAL_CLOSE_PAREN macros, as the arg
8443              * of SCAN is already set up as matches a eval.close_paren */
8444             sw = cur_eval && (n == 0 || CUR_EVAL.close_paren == n);
8445             break;
8446 
8447         case DEFINEP:  /*  (?(DEFINE))  */
8448             sw = 0;
8449             break;
8450 
8451         case IFTHEN:   /*  (?(cond)A|B)  */
8452             reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
8453             if (sw)
8454                 next = NEXTOPER(NEXTOPER(scan));
8455             else {
8456                 next = scan + ARG(scan);
8457                 if (OP(next) == IFTHEN) /* Fake one. */
8458                     next = NEXTOPER(NEXTOPER(next));
8459             }
8460             break;
8461 
8462         case LOGICAL:  /* modifier for EVAL and IFMATCH */
8463             logical = scan->flags;
8464             break;
8465 
8466 /*******************************************************************
8467 
8468 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
8469 pattern, where A and B are subpatterns. (For simple A, CURLYM or
8470 STAR/PLUS/CURLY/CURLYN are used instead.)
8471 
8472 A*B is compiled as <CURLYX><A><WHILEM><B>
8473 
8474 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
8475 state, which contains the current count, initialised to -1. It also sets
8476 cur_curlyx to point to this state, with any previous value saved in the
8477 state block.
8478 
8479 CURLYX then jumps straight to the WHILEM op, rather than executing A,
8480 since the pattern may possibly match zero times (i.e. it's a while {} loop
8481 rather than a do {} while loop).
8482 
8483 Each entry to WHILEM represents a successful match of A. The count in the
8484 CURLYX block is incremented, another WHILEM state is pushed, and execution
8485 passes to A or B depending on greediness and the current count.
8486 
8487 For example, if matching against the string a1a2a3b (where the aN are
8488 substrings that match /A/), then the match progresses as follows: (the
8489 pushed states are interspersed with the bits of strings matched so far):
8490 
8491     <CURLYX cnt=-1>
8492     <CURLYX cnt=0><WHILEM>
8493     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
8494     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
8495     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
8496     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
8497 
8498 (Contrast this with something like CURLYM, which maintains only a single
8499 backtrack state:
8500 
8501     <CURLYM cnt=0> a1
8502     a1 <CURLYM cnt=1> a2
8503     a1 a2 <CURLYM cnt=2> a3
8504     a1 a2 a3 <CURLYM cnt=3> b
8505 )
8506 
8507 Each WHILEM state block marks a point to backtrack to upon partial failure
8508 of A or B, and also contains some minor state data related to that
8509 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
8510 overall state, such as the count, and pointers to the A and B ops.
8511 
8512 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
8513 must always point to the *current* CURLYX block, the rules are:
8514 
8515 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
8516 and set cur_curlyx to point the new block.
8517 
8518 When popping the CURLYX block after a successful or unsuccessful match,
8519 restore the previous cur_curlyx.
8520 
8521 When WHILEM is about to execute B, save the current cur_curlyx, and set it
8522 to the outer one saved in the CURLYX block.
8523 
8524 When popping the WHILEM block after a successful or unsuccessful B match,
8525 restore the previous cur_curlyx.
8526 
8527 Here's an example for the pattern (AI* BI)*BO
8528 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
8529 
8530 cur_
8531 curlyx backtrack stack
8532 ------ ---------------
8533 NULL
8534 CO     <CO prev=NULL> <WO>
8535 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
8536 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
8537 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
8538 
8539 At this point the pattern succeeds, and we work back down the stack to
8540 clean up, restoring as we go:
8541 
8542 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
8543 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
8544 CO     <CO prev=NULL> <WO>
8545 NULL
8546 
8547 *******************************************************************/
8548 
8549 #define ST st->u.curlyx
8550 
8551         case CURLYX:    /* start of /A*B/  (for complex A) */
8552         {
8553             /* No need to save/restore up to this paren */
8554             I32 parenfloor = scan->flags;
8555 
8556             assert(next); /* keep Coverity happy */
8557             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
8558                 next += ARG(next);
8559 
8560             /* XXXX Probably it is better to teach regpush to support
8561                parenfloor > maxopenparen ... */
8562             if (parenfloor > (I32)rex->lastparen)
8563                 parenfloor = rex->lastparen; /* Pessimization... */
8564 
8565             ST.prev_curlyx= cur_curlyx;
8566             cur_curlyx = st;
8567             ST.cp = PL_savestack_ix;
8568 
8569             /* these fields contain the state of the current curly.
8570              * they are accessed by subsequent WHILEMs */
8571             ST.parenfloor = parenfloor;
8572             ST.me = scan;
8573             ST.B = next;
8574             ST.minmod = minmod;
8575             minmod = 0;
8576             ST.count = -1;	/* this will be updated by WHILEM */
8577             ST.lastloc = NULL;  /* this will be updated by WHILEM */
8578 
8579             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput, loceol,
8580                                 script_run_begin);
8581             NOT_REACHED; /* NOTREACHED */
8582         }
8583 
8584         case CURLYX_end: /* just finished matching all of A*B */
8585             cur_curlyx = ST.prev_curlyx;
8586             sayYES;
8587             NOT_REACHED; /* NOTREACHED */
8588 
8589         case CURLYX_end_fail: /* just failed to match all of A*B */
8590             regcpblow(ST.cp);
8591             cur_curlyx = ST.prev_curlyx;
8592             sayNO;
8593             NOT_REACHED; /* NOTREACHED */
8594 
8595 
8596 #undef ST
8597 #define ST st->u.whilem
8598 
8599         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
8600         {
8601             /* see the discussion above about CURLYX/WHILEM */
8602             I32 n;
8603             int min, max;
8604             regnode *A;
8605 
8606             assert(cur_curlyx); /* keep Coverity happy */
8607 
8608             min = ARG1(cur_curlyx->u.curlyx.me);
8609             max = ARG2(cur_curlyx->u.curlyx.me);
8610             A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
8611             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
8612             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
8613             ST.cache_offset = 0;
8614             ST.cache_mask = 0;
8615 
8616 
8617             DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "WHILEM: matched %ld out of %d..%d\n",
8618                   depth, (long)n, min, max)
8619             );
8620 
8621             /* First just match a string of min A's. */
8622 
8623             if (n < min) {
8624                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, maxopenparen);
8625                 cur_curlyx->u.curlyx.lastloc = locinput;
8626                 REGCP_SET(ST.lastcp);
8627 
8628                 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput, loceol,
8629                                 script_run_begin);
8630                 NOT_REACHED; /* NOTREACHED */
8631             }
8632 
8633             /* If degenerate A matches "", assume A done. */
8634 
8635             if (locinput == cur_curlyx->u.curlyx.lastloc) {
8636                 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "WHILEM: empty match detected, trying continuation...\n",
8637                    depth)
8638                 );
8639                 goto do_whilem_B_max;
8640             }
8641 
8642             /* super-linear cache processing.
8643              *
8644              * The idea here is that for certain types of CURLYX/WHILEM -
8645              * principally those whose upper bound is infinity (and
8646              * excluding regexes that have things like \1 and other very
8647              * non-regular expresssiony things), then if a pattern like
8648              * /....A*.../ fails and we backtrack to the WHILEM, then we
8649              * make a note that this particular WHILEM op was at string
8650              * position 47 (say) when the rest of pattern failed. Then, if
8651              * we ever find ourselves back at that WHILEM, and at string
8652              * position 47 again, we can just fail immediately rather than
8653              * running the rest of the pattern again.
8654              *
8655              * This is very handy when patterns start to go
8656              * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
8657              * with a combinatorial explosion of backtracking.
8658              *
8659              * The cache is implemented as a bit array, with one bit per
8660              * string byte position per WHILEM op (up to 16) - so its
8661              * between 0.25 and 2x the string size.
8662              *
8663              * To avoid allocating a poscache buffer every time, we do an
8664              * initially countdown; only after we have  executed a WHILEM
8665              * op (string-length x #WHILEMs) times do we allocate the
8666              * cache.
8667              *
8668              * The top 4 bits of scan->flags byte say how many different
8669              * relevant CURLLYX/WHILEM op pairs there are, while the
8670              * bottom 4-bits is the identifying index number of this
8671              * WHILEM.
8672              */
8673 
8674             if (scan->flags) {
8675 
8676                 if (!reginfo->poscache_maxiter) {
8677                     /* start the countdown: Postpone detection until we
8678                      * know the match is not *that* much linear. */
8679                     reginfo->poscache_maxiter
8680                         =    (reginfo->strend - reginfo->strbeg + 1)
8681                            * (scan->flags>>4);
8682                     /* possible overflow for long strings and many CURLYX's */
8683                     if (reginfo->poscache_maxiter < 0)
8684                         reginfo->poscache_maxiter = I32_MAX;
8685                     reginfo->poscache_iter = reginfo->poscache_maxiter;
8686                 }
8687 
8688                 if (reginfo->poscache_iter-- == 0) {
8689                     /* initialise cache */
8690                     const SSize_t size = (reginfo->poscache_maxiter + 7)/8;
8691                     regmatch_info_aux *const aux = reginfo->info_aux;
8692                     if (aux->poscache) {
8693                         if ((SSize_t)reginfo->poscache_size < size) {
8694                             Renew(aux->poscache, size, char);
8695                             reginfo->poscache_size = size;
8696                         }
8697                         Zero(aux->poscache, size, char);
8698                     }
8699                     else {
8700                         reginfo->poscache_size = size;
8701                         Newxz(aux->poscache, size, char);
8702                     }
8703                     DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
8704       "%sWHILEM: Detected a super-linear match, switching on caching%s...\n",
8705                               PL_colors[4], PL_colors[5])
8706                     );
8707                 }
8708 
8709                 if (reginfo->poscache_iter < 0) {
8710                     /* have we already failed at this position? */
8711                     SSize_t offset, mask;
8712 
8713                     reginfo->poscache_iter = -1; /* stop eventual underflow */
8714                     offset  = (scan->flags & 0xf) - 1
8715                                 +   (locinput - reginfo->strbeg)
8716                                   * (scan->flags>>4);
8717                     mask    = 1 << (offset % 8);
8718                     offset /= 8;
8719                     if (reginfo->info_aux->poscache[offset] & mask) {
8720                         DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_  "WHILEM: (cache) already tried at this position...\n",
8721                             depth)
8722                         );
8723                         cur_curlyx->u.curlyx.count--;
8724                         sayNO; /* cache records failure */
8725                     }
8726                     ST.cache_offset = offset;
8727                     ST.cache_mask   = mask;
8728                 }
8729             }
8730 
8731             /* Prefer B over A for minimal matching. */
8732 
8733             if (cur_curlyx->u.curlyx.minmod) {
8734                 ST.save_curlyx = cur_curlyx;
8735                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
8736                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
8737                                     locinput, loceol, script_run_begin);
8738                 NOT_REACHED; /* NOTREACHED */
8739             }
8740 
8741             /* Prefer A over B for maximal matching. */
8742 
8743             if (n < max) { /* More greed allowed? */
8744                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
8745                             maxopenparen);
8746                 cur_curlyx->u.curlyx.lastloc = locinput;
8747                 REGCP_SET(ST.lastcp);
8748                 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput, loceol,
8749                                 script_run_begin);
8750                 NOT_REACHED; /* NOTREACHED */
8751             }
8752             goto do_whilem_B_max;
8753         }
8754         NOT_REACHED; /* NOTREACHED */
8755 
8756         case WHILEM_B_min: /* just matched B in a minimal match */
8757         case WHILEM_B_max: /* just matched B in a maximal match */
8758             cur_curlyx = ST.save_curlyx;
8759             sayYES;
8760             NOT_REACHED; /* NOTREACHED */
8761 
8762         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
8763             cur_curlyx = ST.save_curlyx;
8764             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
8765             cur_curlyx->u.curlyx.count--;
8766             CACHEsayNO;
8767             NOT_REACHED; /* NOTREACHED */
8768 
8769         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
8770             /* FALLTHROUGH */
8771         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
8772             REGCP_UNWIND(ST.lastcp);
8773             regcppop(rex, &maxopenparen);
8774             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
8775             cur_curlyx->u.curlyx.count--;
8776             CACHEsayNO;
8777             NOT_REACHED; /* NOTREACHED */
8778 
8779         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
8780             REGCP_UNWIND(ST.lastcp);
8781             regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
8782             DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_  "WHILEM: failed, trying continuation...\n",
8783                 depth)
8784             );
8785           do_whilem_B_max:
8786             if (cur_curlyx->u.curlyx.count >= REG_INFTY
8787                 && ckWARN(WARN_REGEXP)
8788                 && !reginfo->warned)
8789             {
8790                 reginfo->warned	= TRUE;
8791                 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
8792                      "Complex regular subexpression recursion limit (%d) "
8793                      "exceeded",
8794                      REG_INFTY - 1);
8795             }
8796 
8797             /* now try B */
8798             ST.save_curlyx = cur_curlyx;
8799             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
8800             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
8801                                 locinput, loceol, script_run_begin);
8802             NOT_REACHED; /* NOTREACHED */
8803 
8804         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
8805             cur_curlyx = ST.save_curlyx;
8806 
8807             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
8808                 /* Maximum greed exceeded */
8809                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
8810                     && ckWARN(WARN_REGEXP)
8811                     && !reginfo->warned)
8812                 {
8813                     reginfo->warned	= TRUE;
8814                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
8815                         "Complex regular subexpression recursion "
8816                         "limit (%d) exceeded",
8817                         REG_INFTY - 1);
8818                 }
8819                 cur_curlyx->u.curlyx.count--;
8820                 CACHEsayNO;
8821             }
8822 
8823             DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_  "WHILEM: B min fail: trying longer...\n", depth)
8824             );
8825             /* Try grabbing another A and see if it helps. */
8826             cur_curlyx->u.curlyx.lastloc = locinput;
8827             ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
8828                             maxopenparen);
8829             REGCP_SET(ST.lastcp);
8830             PUSH_STATE_GOTO(WHILEM_A_min,
8831                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
8832                 locinput, loceol, script_run_begin);
8833             NOT_REACHED; /* NOTREACHED */
8834 
8835 #undef  ST
8836 #define ST st->u.branch
8837 
8838         case BRANCHJ:	    /*  /(...|A|...)/ with long next pointer */
8839             next = scan + ARG(scan);
8840             if (next == scan)
8841                 next = NULL;
8842             scan = NEXTOPER(scan);
8843             /* FALLTHROUGH */
8844 
8845         case BRANCH:	    /*  /(...|A|...)/ */
8846             scan = NEXTOPER(scan); /* scan now points to inner node */
8847             ST.lastparen = rex->lastparen;
8848             ST.lastcloseparen = rex->lastcloseparen;
8849             ST.next_branch = next;
8850             REGCP_SET(ST.cp);
8851 
8852             /* Now go into the branch */
8853             if (has_cutgroup) {
8854                 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput, loceol,
8855                                     script_run_begin);
8856             } else {
8857                 PUSH_STATE_GOTO(BRANCH_next, scan, locinput, loceol,
8858                                 script_run_begin);
8859             }
8860             NOT_REACHED; /* NOTREACHED */
8861 
8862         case CUTGROUP:  /*  /(*THEN)/  */
8863             sv_yes_mark = st->u.mark.mark_name = scan->flags
8864                 ? MUTABLE_SV(rexi->data->data[ ARG( scan ) ])
8865                 : NULL;
8866             PUSH_STATE_GOTO(CUTGROUP_next, next, locinput, loceol,
8867                             script_run_begin);
8868             NOT_REACHED; /* NOTREACHED */
8869 
8870         case CUTGROUP_next_fail:
8871             do_cutgroup = 1;
8872             no_final = 1;
8873             if (st->u.mark.mark_name)
8874                 sv_commit = st->u.mark.mark_name;
8875             sayNO;
8876             NOT_REACHED; /* NOTREACHED */
8877 
8878         case BRANCH_next:
8879             sayYES;
8880             NOT_REACHED; /* NOTREACHED */
8881 
8882         case BRANCH_next_fail: /* that branch failed; try the next, if any */
8883             if (do_cutgroup) {
8884                 do_cutgroup = 0;
8885                 no_final = 0;
8886             }
8887             REGCP_UNWIND(ST.cp);
8888             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8889             scan = ST.next_branch;
8890             /* no more branches? */
8891             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
8892                 DEBUG_EXECUTE_r({
8893                     Perl_re_exec_indentf( aTHX_  "%sBRANCH failed...%s\n",
8894                         depth,
8895                         PL_colors[4],
8896                         PL_colors[5] );
8897                 });
8898                 sayNO_SILENT;
8899             }
8900             continue; /* execute next BRANCH[J] op */
8901             /* NOTREACHED */
8902 
8903         case MINMOD: /* next op will be non-greedy, e.g. A*?  */
8904             minmod = 1;
8905             break;
8906 
8907 #undef  ST
8908 #define ST st->u.curlym
8909 
8910         case CURLYM:	/* /A{m,n}B/ where A is fixed-length */
8911 
8912             /* This is an optimisation of CURLYX that enables us to push
8913              * only a single backtracking state, no matter how many matches
8914              * there are in {m,n}. It relies on the pattern being constant
8915              * length, with no parens to influence future backrefs
8916              */
8917 
8918             ST.me = scan;
8919             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
8920 
8921             ST.lastparen      = rex->lastparen;
8922             ST.lastcloseparen = rex->lastcloseparen;
8923 
8924             /* if paren positive, emulate an OPEN/CLOSE around A */
8925             if (ST.me->flags) {
8926                 U32 paren = ST.me->flags;
8927                 lastopen = paren;
8928                 if (paren > maxopenparen)
8929                     maxopenparen = paren;
8930                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
8931             }
8932             ST.A = scan;
8933             ST.B = next;
8934             ST.alen = 0;
8935             ST.count = 0;
8936             ST.minmod = minmod;
8937             minmod = 0;
8938             ST.Binfo.count = -1;
8939             REGCP_SET(ST.cp);
8940 
8941             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
8942                 goto curlym_do_B;
8943 
8944           curlym_do_A: /* execute the A in /A{m,n}B/  */
8945             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput, loceol, /* match A */
8946                                 script_run_begin);
8947             NOT_REACHED; /* NOTREACHED */
8948 
8949         case CURLYM_A: /* we've just matched an A */
8950             ST.count++;
8951             /* after first match, determine A's length: u.curlym.alen */
8952             if (ST.count == 1) {
8953                 if (reginfo->is_utf8_target) {
8954                     char *s = st->locinput;
8955                     while (s < locinput) {
8956                         ST.alen++;
8957                         s += UTF8SKIP(s);
8958                     }
8959                 }
8960                 else {
8961                     ST.alen = locinput - st->locinput;
8962                 }
8963                 if (ST.alen == 0)
8964                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
8965             }
8966             DEBUG_EXECUTE_r(
8967                 Perl_re_exec_indentf( aTHX_  "CURLYM now matched %" IVdf " times, len=%" IVdf "...\n",
8968                           depth, (IV) ST.count, (IV)ST.alen)
8969             );
8970 
8971             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
8972                 goto fake_end;
8973 
8974 
8975             if (!is_accepted) {
8976                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
8977                 if ( max == REG_INFTY || ST.count < max )
8978                     goto curlym_do_A; /* try to match another A */
8979             }
8980             goto curlym_do_B; /* try to match B */
8981 
8982         case CURLYM_A_fail: /* just failed to match an A */
8983             REGCP_UNWIND(ST.cp);
8984 
8985 
8986             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
8987                 || EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
8988                 sayNO;
8989 
8990           curlym_do_B: /* execute the B in /A{m,n}B/  */
8991             if (is_accepted)
8992                 goto curlym_close_B;
8993 
8994             if (ST.Binfo.count < 0) {
8995                 /* calculate possible match of 1st char following curly */
8996                 assert(ST.B);
8997                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
8998                     regnode *text_node = ST.B;
8999                     if (! HAS_TEXT(text_node))
9000                         FIND_NEXT_IMPT(text_node);
9001                     if (PL_regkind[OP(text_node)] == EXACT) {
9002                         if (! S_setup_EXACTISH_ST(aTHX_ text_node,
9003                                                         &ST.Binfo, reginfo))
9004                         {
9005                             sayNO;
9006                         }
9007                     }
9008                 }
9009             }
9010 
9011             DEBUG_EXECUTE_r(
9012                 Perl_re_exec_indentf( aTHX_  "CURLYM trying tail with matches=%" IVdf "...\n",
9013                     depth, (IV)ST.count)
9014             );
9015             if (! NEXTCHR_IS_EOS && ST.Binfo.count >= 0) {
9016                 assert(ST.Binfo.count > 0);
9017 
9018                 /* Do a quick test to hopefully rule out most non-matches */
9019                 if (     locinput + ST.Binfo.min_length > loceol
9020                     || ! S_test_EXACTISH_ST(locinput, ST.Binfo))
9021                 {
9022                     DEBUG_OPTIMISE_r(
9023                         Perl_re_exec_indentf( aTHX_
9024                             "CURLYM Fast bail next target=0x%X anded==0x%X"
9025                                                                 " mask=0x%X\n",
9026                             depth,
9027                             (int) nextbyte, ST.Binfo.first_byte_anded,
9028                                             ST.Binfo.first_byte_mask)
9029                     );
9030                     state_num = CURLYM_B_fail;
9031                     goto reenter_switch;
9032                 }
9033             }
9034 
9035           curlym_close_B:
9036             if (ST.me->flags) {
9037                 /* emulate CLOSE: mark current A as captured */
9038                 U32 paren = (U32)ST.me->flags;
9039                 if (ST.count || is_accepted) {
9040                     CLOSE_CAPTURE(paren,
9041                         HOPc(locinput, -ST.alen) - reginfo->strbeg,
9042                         locinput - reginfo->strbeg);
9043                 }
9044                 else
9045                     rex->offs[paren].end = -1;
9046                 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
9047                 {
9048                     if (ST.count || is_accepted)
9049                         goto fake_end;
9050                     else
9051                         sayNO;
9052                 }
9053             }
9054 
9055             if (is_accepted)
9056                 goto fake_end;
9057 
9058             PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput, loceol,   /* match B */
9059                             script_run_begin);
9060             NOT_REACHED; /* NOTREACHED */
9061 
9062         case CURLYM_B_fail: /* just failed to match a B */
9063             REGCP_UNWIND(ST.cp);
9064             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
9065             if (ST.minmod) {
9066                 I32 max = ARG2(ST.me);
9067                 if (max != REG_INFTY && ST.count == max)
9068                     sayNO;
9069                 goto curlym_do_A; /* try to match a further A */
9070             }
9071             /* backtrack one A */
9072             if (ST.count == ARG1(ST.me) /* min */)
9073                 sayNO;
9074             ST.count--;
9075             SET_locinput(HOPc(locinput, -ST.alen));
9076             goto curlym_do_B; /* try to match B */
9077 
9078 #undef ST
9079 #define ST st->u.curly
9080 
9081 #define CURLY_SETPAREN(paren, success) \
9082     if (paren) { \
9083         if (success) { \
9084             CLOSE_CAPTURE(paren, HOPc(locinput, -1) - reginfo->strbeg, \
9085                                  locinput - reginfo->strbeg); \
9086         } \
9087         else { \
9088             rex->offs[paren].end = -1; \
9089             rex->lastparen      = ST.lastparen; \
9090             rex->lastcloseparen = ST.lastcloseparen; \
9091         } \
9092     }
9093 
9094         case STAR:		/*  /A*B/ where A is width 1 char */
9095             ST.paren = 0;
9096             ST.min = 0;
9097             ST.max = REG_INFTY;
9098             scan = NEXTOPER(scan);
9099             goto repeat;
9100 
9101         case PLUS:		/*  /A+B/ where A is width 1 char */
9102             ST.paren = 0;
9103             ST.min = 1;
9104             ST.max = REG_INFTY;
9105             scan = NEXTOPER(scan);
9106             goto repeat;
9107 
9108         case CURLYN:		/*  /(A){m,n}B/ where A is width 1 char */
9109             ST.paren = scan->flags;	/* Which paren to set */
9110             ST.lastparen      = rex->lastparen;
9111             ST.lastcloseparen = rex->lastcloseparen;
9112             if (ST.paren > maxopenparen)
9113                 maxopenparen = ST.paren;
9114             ST.min = ARG1(scan);  /* min to match */
9115             ST.max = ARG2(scan);  /* max to match */
9116             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
9117 
9118             /* handle the single-char capture called as a GOSUB etc */
9119             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
9120             {
9121                 char *li = locinput;
9122                 if (!regrepeat(rex, &li, scan, loceol, reginfo, 1))
9123                     sayNO;
9124                 SET_locinput(li);
9125                 goto fake_end;
9126             }
9127 
9128             goto repeat;
9129 
9130         case CURLY:		/*  /A{m,n}B/ where A is width 1 char */
9131             ST.paren = 0;
9132             ST.min = ARG1(scan);  /* min to match */
9133             ST.max = ARG2(scan);  /* max to match */
9134             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
9135           repeat:
9136             /*
9137             * Lookahead to avoid useless match attempts
9138             * when we know what character comes next.
9139             *
9140             * Used to only do .*x and .*?x, but now it allows
9141             * for )'s, ('s and (?{ ... })'s to be in the way
9142             * of the quantifier and the EXACT-like node.  -- japhy
9143             */
9144 
9145             assert(ST.min <= ST.max);
9146             if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
9147                 ST.Binfo.count = 0;
9148             }
9149             else {
9150                 regnode *text_node = next;
9151 
9152                 if (! HAS_TEXT(text_node))
9153                     FIND_NEXT_IMPT(text_node);
9154 
9155                 if (! HAS_TEXT(text_node))
9156                     ST.Binfo.count = 0;
9157                 else {
9158                     if ( PL_regkind[OP(text_node)] != EXACT ) {
9159                         ST.Binfo.count = 0;
9160                     }
9161                     else {
9162                         if (! S_setup_EXACTISH_ST(aTHX_ text_node,
9163                                                         &ST.Binfo, reginfo))
9164                         {
9165                             sayNO;
9166                         }
9167                     }
9168                 }
9169             }
9170 
9171             ST.A = scan;
9172             ST.B = next;
9173             if (minmod) {
9174                 char *li = locinput;
9175                 minmod = 0;
9176                 if (ST.min &&
9177                         regrepeat(rex, &li, ST.A, loceol, reginfo, ST.min)
9178                             < ST.min)
9179                     sayNO;
9180                 SET_locinput(li);
9181                 ST.count = ST.min;
9182                 REGCP_SET(ST.cp);
9183 
9184                 if (ST.Binfo.count <= 0)
9185                     goto curly_try_B_min;
9186 
9187                 ST.oldloc = locinput;
9188 
9189                 /* set ST.maxpos to the furthest point along the
9190                  * string that could possibly match, i.e., that a match could
9191                  * start at. */
9192                 if  (ST.max == REG_INFTY) {
9193                     ST.maxpos = loceol - 1;
9194                     if (utf8_target)
9195                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
9196                             ST.maxpos--;
9197                 }
9198                 else if (utf8_target) {
9199                     int m = ST.max - ST.min;
9200                     for (ST.maxpos = locinput;
9201                          m >0 && ST.maxpos <  loceol; m--)
9202                         ST.maxpos += UTF8SKIP(ST.maxpos);
9203                 }
9204                 else {
9205                     ST.maxpos = locinput + ST.max - ST.min;
9206                     if (ST.maxpos >=  loceol)
9207                         ST.maxpos =  loceol - 1;
9208                 }
9209                 goto curly_try_B_min_known;
9210 
9211             }
9212             else {
9213                 /* avoid taking address of locinput, so it can remain
9214                  * a register var */
9215                 char *li = locinput;
9216                 ST.count = regrepeat(rex, &li, ST.A, loceol, reginfo, ST.max);
9217                 if (ST.count < ST.min)
9218                     sayNO;
9219                 SET_locinput(li);
9220                 if ((ST.count > ST.min)
9221                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
9222                 {
9223                     /* A{m,n} must come at the end of the string, there's
9224                      * no point in backing off ... */
9225                     ST.min = ST.count;
9226                     /* ...except that $ and \Z can match before *and* after
9227                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
9228                        We may back off by one in this case. */
9229                     if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
9230                         ST.min--;
9231                 }
9232                 REGCP_SET(ST.cp);
9233                 goto curly_try_B_max;
9234             }
9235             NOT_REACHED; /* NOTREACHED */
9236 
9237         case CURLY_B_min_fail:
9238             /* failed to find B in a non-greedy match. */
9239 
9240             REGCP_UNWIND(ST.cp);
9241             if (ST.paren) {
9242                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
9243             }
9244 
9245             if (ST.Binfo.count == 0) {
9246                 /* failed -- move forward one */
9247                 char *li = locinput;
9248                 if (!regrepeat(rex, &li, ST.A, loceol, reginfo, 1)) {
9249                     sayNO;
9250                 }
9251                 locinput = li;
9252                 ST.count++;
9253                 if (!(   ST.count <= ST.max
9254                         /* count overflow ? */
9255                      || (ST.max == REG_INFTY && ST.count > 0))
9256                 )
9257                     sayNO;
9258             }
9259             else {
9260                 int n;
9261                 /* Couldn't or didn't -- move forward. */
9262                 ST.oldloc = locinput;
9263                 if (utf8_target)
9264                     locinput += UTF8SKIP(locinput);
9265                 else
9266                     locinput++;
9267                 ST.count++;
9268 
9269               curly_try_B_min_known:
9270                 /* find the next place where 'B' could work, then call B */
9271                 if (locinput + ST.Binfo.initial_exact < loceol) {
9272                     if (ST.Binfo.initial_exact >= ST.Binfo.max_length) {
9273 
9274                         /* Here, the mask is all 1's for the entire length of
9275                          * any possible match.  (That actually means that there
9276                          * is only one possible match.)  Look for the next
9277                          * occurrence */
9278                         locinput = ninstr(locinput, loceol,
9279                                         (char *) ST.Binfo.matches,
9280                                         (char *) ST.Binfo.matches
9281                                                     + ST.Binfo.initial_exact);
9282                         if (locinput == NULL) {
9283                             sayNO;
9284                         }
9285                     }
9286                     else do {
9287                         /* If the first byte(s) of the mask are all ones, it
9288                          * means those bytes must match identically, so can use
9289                          * ninstr() to find the next possible matchpoint */
9290                         if (ST.Binfo.initial_exact > 0) {
9291                             locinput = ninstr(locinput, loceol,
9292                                               (char *) ST.Binfo.matches,
9293                                               (char *) ST.Binfo.matches
9294                                                      + ST.Binfo.initial_exact);
9295                         }
9296                         else { /* Otherwise find the next byte that matches,
9297                                   masked */
9298                             locinput = (char *) find_next_masked(
9299                                                 (U8 *) locinput, (U8 *) loceol,
9300                                                 ST.Binfo.first_byte_anded,
9301                                                 ST.Binfo.first_byte_mask);
9302                             /* Advance to the end of a multi-byte character */
9303                             if (utf8_target) {
9304                                 while (   locinput < loceol
9305                                     && UTF8_IS_CONTINUATION(*locinput))
9306                                 {
9307                                     locinput++;
9308                                 }
9309                             }
9310                         }
9311                         if (   locinput == NULL
9312                             || locinput + ST.Binfo.min_length > loceol)
9313                         {
9314                             sayNO;
9315                         }
9316 
9317                         /* Here, we have found a possible match point; if can't
9318                          * rule it out, quit the loop so can check fully */
9319                         if (S_test_EXACTISH_ST(locinput, ST.Binfo)) {
9320                             break;
9321                         }
9322 
9323                         locinput += (utf8_target) ? UTF8SKIP(locinput) : 1;
9324 
9325                     } while (locinput <= ST.maxpos);
9326                 }
9327 
9328                 if (locinput > ST.maxpos)
9329                     sayNO;
9330 
9331                 n = (utf8_target)
9332                     ? utf8_length((U8 *) ST.oldloc, (U8 *) locinput)
9333                     : (STRLEN) (locinput - ST.oldloc);
9334 
9335 
9336                 /* Here is at the beginning of a character that meets the mask
9337                  * criteria.  Need to make sure that some real possibility */
9338 
9339                 if (n) {
9340                     /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
9341                      * at what may be the beginning of b; check that everything
9342                      * between oldloc and locinput matches */
9343                     char *li = ST.oldloc;
9344                     ST.count += n;
9345                     if (regrepeat(rex, &li, ST.A, loceol, reginfo, n) < n)
9346                         sayNO;
9347                     assert(n == REG_INFTY || locinput == li);
9348                 }
9349             }
9350 
9351           curly_try_B_min:
9352             CURLY_SETPAREN(ST.paren, ST.count);
9353             PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput, loceol,
9354                             script_run_begin);
9355             NOT_REACHED; /* NOTREACHED */
9356 
9357 
9358           curly_try_B_max:
9359             /* a successful greedy match: now try to match B */
9360             if (        ST.Binfo.count <= 0
9361                 || (    ST.Binfo.count > 0
9362                     &&  locinput + ST.Binfo.min_length <= loceol
9363                     &&  S_test_EXACTISH_ST(locinput, ST.Binfo)))
9364             {
9365                 CURLY_SETPAREN(ST.paren, ST.count);
9366                 PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput, loceol,
9367                                 script_run_begin);
9368                 NOT_REACHED; /* NOTREACHED */
9369             }
9370             /* FALLTHROUGH */
9371 
9372         case CURLY_B_max_fail:
9373             /* failed to find B in a greedy match */
9374 
9375             REGCP_UNWIND(ST.cp);
9376             if (ST.paren) {
9377                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
9378             }
9379             /*  back up. */
9380             if (--ST.count < ST.min)
9381                 sayNO;
9382             locinput = HOPc(locinput, -1);
9383             goto curly_try_B_max;
9384 
9385 #undef ST
9386 
9387         case END: /*  last op of main pattern  */
9388           fake_end:
9389             if (cur_eval) {
9390                 /* we've just finished A in /(??{A})B/; now continue with B */
9391                 is_accepted= false;
9392                 SET_RECURSE_LOCINPUT("FAKE-END[before]", CUR_EVAL.prev_recurse_locinput);
9393                 st->u.eval.prev_rex = rex_sv;		/* inner */
9394 
9395                 /* Save *all* the positions. */
9396                 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
9397                 rex_sv = CUR_EVAL.prev_rex;
9398                 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
9399                 SET_reg_curpm(rex_sv);
9400                 rex = ReANY(rex_sv);
9401                 rexi = RXi_GET(rex);
9402 
9403                 st->u.eval.prev_curlyx = cur_curlyx;
9404                 cur_curlyx = CUR_EVAL.prev_curlyx;
9405 
9406                 REGCP_SET(st->u.eval.lastcp);
9407 
9408                 /* Restore parens of the outer rex without popping the
9409                  * savestack */
9410                 regcp_restore(rex, CUR_EVAL.lastcp, &maxopenparen);
9411 
9412                 st->u.eval.prev_eval = cur_eval;
9413                 cur_eval = CUR_EVAL.prev_eval;
9414                 DEBUG_EXECUTE_r(
9415                     Perl_re_exec_indentf( aTHX_  "END: EVAL trying tail ... (cur_eval=%p)\n",
9416                                       depth, cur_eval););
9417                 if ( nochange_depth )
9418                     nochange_depth--;
9419 
9420                 SET_RECURSE_LOCINPUT("FAKE-END[after]", cur_eval->locinput);
9421 
9422                 PUSH_YES_STATE_GOTO(EVAL_postponed_AB,          /* match B */
9423                                     st->u.eval.prev_eval->u.eval.B,
9424                                     locinput, loceol, script_run_begin);
9425             }
9426 
9427             if (locinput < reginfo->till) {
9428                 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
9429                                       "%sEND: Match possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
9430                                       PL_colors[4],
9431                                       (long)(locinput - startpos),
9432                                       (long)(reginfo->till - startpos),
9433                                       PL_colors[5]));
9434 
9435                 sayNO_SILENT;		/* Cannot match: too short. */
9436             }
9437             sayYES;			/* Success! */
9438 
9439         case LOOKBEHIND_END: /* validate that *lookbehind* UNLESSM/IFMATCH
9440                                 matches end at the right spot, required for
9441                                 variable length matches. */
9442             if (match_end && locinput != match_end)
9443             {
9444                 DEBUG_EXECUTE_r(
9445                 Perl_re_exec_indentf( aTHX_
9446                     "%sLOOKBEHIND_END: subpattern failed...%s\n",
9447                     depth, PL_colors[4], PL_colors[5]));
9448                 sayNO;            /* Variable length match didn't line up */
9449             }
9450             /* FALLTHROUGH */
9451 
9452         case SUCCEED: /* successful SUSPEND/CURLYM and
9453                                             *lookahead* IFMATCH/UNLESSM*/
9454             DEBUG_EXECUTE_r(
9455             Perl_re_exec_indentf( aTHX_
9456                 "%sSUCCEED: subpattern success...%s\n",
9457                 depth, PL_colors[4], PL_colors[5]));
9458             sayYES;			/* Success! */
9459 
9460 #undef  ST
9461 #define ST st->u.ifmatch
9462 
9463         case SUSPEND:	/* (?>A) */
9464             ST.wanted = 1;
9465             ST.start = locinput;
9466             ST.end = loceol;
9467             ST.count = 1;
9468             goto do_ifmatch;
9469 
9470         case UNLESSM:	/* -ve lookaround: (?!A), or with 'flags', (?<!A) */
9471             ST.wanted = 0;
9472             goto ifmatch_trivial_fail_test;
9473 
9474         case IFMATCH:	/* +ve lookaround: (?=A), or with 'flags', (?<=A) */
9475             ST.wanted = 1;
9476           ifmatch_trivial_fail_test:
9477             ST.prev_match_end= match_end;
9478             ST.count = scan->next_off + 1; /* next_off repurposed to be
9479                                               lookbehind count, requires
9480                                               non-zero flags */
9481             if (! scan->flags) {    /* 'flags' zero means lookahed */
9482 
9483                 /* Lookahead starts here and ends at the normal place */
9484                 ST.start = locinput;
9485                 ST.end = loceol;
9486                 match_end = NULL;
9487             }
9488             else {
9489                 PERL_UINT_FAST8_T back_count = scan->flags;
9490                 char * s;
9491                 match_end = locinput;
9492 
9493                 /* Lookbehind can look beyond the current position */
9494                 ST.end = loceol;
9495 
9496                 /* ... and starts at the first place in the input that is in
9497                  * the range of the possible start positions */
9498                 for (; ST.count > 0; ST.count--, back_count--) {
9499                     s = HOPBACKc(locinput, back_count);
9500                     if (s) {
9501                         ST.start = s;
9502                         goto do_ifmatch;
9503                     }
9504                 }
9505 
9506                 /* If the lookbehind doesn't start in the actual string, is a
9507                  * trivial match failure */
9508                 match_end = ST.prev_match_end;
9509                 if (logical) {
9510                     logical = 0;
9511                     sw = 1 - cBOOL(ST.wanted);
9512                 }
9513                 else if (ST.wanted)
9514                     sayNO;
9515 
9516                 /* Here, we didn't want it to match, so is actually success */
9517                 next = scan + ARG(scan);
9518                 if (next == scan)
9519                     next = NULL;
9520                 break;
9521             }
9522 
9523           do_ifmatch:
9524             ST.me = scan;
9525             ST.logical = logical;
9526             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
9527 
9528             /* execute body of (?...A) */
9529             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), ST.start,
9530                                 ST.end, script_run_begin);
9531             NOT_REACHED; /* NOTREACHED */
9532 
9533         {
9534             bool matched;
9535 
9536         case IFMATCH_A_fail: /* body of (?...A) failed */
9537             if (! ST.logical && ST.count > 1) {
9538 
9539                 /* It isn't a real failure until we've tried all starting
9540                  * positions.  Move to the next starting position and retry */
9541                 ST.count--;
9542                 ST.start = HOPc(ST.start, 1);
9543                 scan = ST.me;
9544                 logical = ST.logical;
9545                 goto do_ifmatch;
9546             }
9547 
9548             /* Here, all starting positions have been tried. */
9549             matched = FALSE;
9550             goto ifmatch_done;
9551 
9552         case IFMATCH_A: /* body of (?...A) succeeded */
9553             matched = TRUE;
9554           ifmatch_done:
9555             sw = matched == ST.wanted;
9556             match_end = ST.prev_match_end;
9557             if (! ST.logical && !sw) {
9558                 sayNO;
9559             }
9560 
9561             if (OP(ST.me) != SUSPEND) {
9562                 /* restore old position except for (?>...) */
9563                 locinput = st->locinput;
9564                 loceol = st->loceol;
9565                 script_run_begin = st->sr0;
9566             }
9567             scan = ST.me + ARG(ST.me);
9568             if (scan == ST.me)
9569                 scan = NULL;
9570             continue; /* execute B */
9571         }
9572 
9573 #undef ST
9574 
9575         case LONGJMP: /*  alternative with many branches compiles to
9576                        * (BRANCHJ; EXACT ...; LONGJMP ) x N */
9577             next = scan + ARG(scan);
9578             if (next == scan)
9579                 next = NULL;
9580             break;
9581 
9582         case COMMIT:  /*  (*COMMIT)  */
9583             reginfo->cutpoint = loceol;
9584             /* FALLTHROUGH */
9585 
9586         case PRUNE:   /*  (*PRUNE)   */
9587             if (scan->flags)
9588                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
9589             PUSH_STATE_GOTO(COMMIT_next, next, locinput, loceol,
9590                             script_run_begin);
9591             NOT_REACHED; /* NOTREACHED */
9592 
9593         case COMMIT_next_fail:
9594             no_final = 1;
9595             /* FALLTHROUGH */
9596             sayNO;
9597             NOT_REACHED; /* NOTREACHED */
9598 
9599         case OPFAIL:   /* (*FAIL)  */
9600             if (scan->flags)
9601                 sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
9602             if (logical) {
9603                 /* deal with (?(?!)X|Y) properly,
9604                  * make sure we trigger the no branch
9605                  * of the trailing IFTHEN structure*/
9606                 sw= 0;
9607                 break;
9608             } else {
9609                 sayNO;
9610             }
9611             NOT_REACHED; /* NOTREACHED */
9612 
9613 #define ST st->u.mark
9614         case MARKPOINT: /*  (*MARK:foo)  */
9615             ST.prev_mark = mark_state;
9616             ST.mark_name = sv_commit = sv_yes_mark
9617                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
9618             mark_state = st;
9619             ST.mark_loc = locinput;
9620             PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput, loceol,
9621                                 script_run_begin);
9622             NOT_REACHED; /* NOTREACHED */
9623 
9624         case MARKPOINT_next:
9625             mark_state = ST.prev_mark;
9626             sayYES;
9627             NOT_REACHED; /* NOTREACHED */
9628 
9629         case MARKPOINT_next_fail:
9630             if (popmark && sv_eq(ST.mark_name,popmark))
9631             {
9632                 if (ST.mark_loc > startpoint)
9633                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
9634                 popmark = NULL; /* we found our mark */
9635                 sv_commit = ST.mark_name;
9636 
9637                 DEBUG_EXECUTE_r({
9638                         Perl_re_exec_indentf( aTHX_  "%sMARKPOINT: next fail: setting cutpoint to mark:%" SVf "...%s\n",
9639                             depth,
9640                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
9641                 });
9642             }
9643             mark_state = ST.prev_mark;
9644             sv_yes_mark = mark_state ?
9645                 mark_state->u.mark.mark_name : NULL;
9646             sayNO;
9647             NOT_REACHED; /* NOTREACHED */
9648 
9649         case SKIP:  /*  (*SKIP)  */
9650             if (!scan->flags) {
9651                 /* (*SKIP) : if we fail we cut here*/
9652                 ST.mark_name = NULL;
9653                 ST.mark_loc = locinput;
9654                 PUSH_STATE_GOTO(SKIP_next,next, locinput, loceol,
9655                                 script_run_begin);
9656             } else {
9657                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
9658                    otherwise do nothing.  Meaning we need to scan
9659                  */
9660                 regmatch_state *cur = mark_state;
9661                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
9662 
9663                 while (cur) {
9664                     if ( sv_eq( cur->u.mark.mark_name,
9665                                 find ) )
9666                     {
9667                         ST.mark_name = find;
9668                         PUSH_STATE_GOTO( SKIP_next, next, locinput, loceol,
9669                                          script_run_begin);
9670                     }
9671                     cur = cur->u.mark.prev_mark;
9672                 }
9673             }
9674             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
9675             break;
9676 
9677         case SKIP_next_fail:
9678             if (ST.mark_name) {
9679                 /* (*CUT:NAME) - Set up to search for the name as we
9680                    collapse the stack*/
9681                 popmark = ST.mark_name;
9682             } else {
9683                 /* (*CUT) - No name, we cut here.*/
9684                 if (ST.mark_loc > startpoint)
9685                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
9686                 /* but we set sv_commit to latest mark_name if there
9687                    is one so they can test to see how things lead to this
9688                    cut */
9689                 if (mark_state)
9690                     sv_commit=mark_state->u.mark.mark_name;
9691             }
9692             no_final = 1;
9693             sayNO;
9694             NOT_REACHED; /* NOTREACHED */
9695 #undef ST
9696 
9697         case LNBREAK: /* \R */
9698             if ((n=is_LNBREAK_safe(locinput, loceol, utf8_target))) {
9699                 locinput += n;
9700             } else
9701                 sayNO;
9702             break;
9703 
9704         default:
9705             PerlIO_printf(Perl_error_log, "%" UVxf " %d\n",
9706                           PTR2UV(scan), OP(scan));
9707             Perl_croak(aTHX_ "regexp memory corruption");
9708 
9709         /* this is a point to jump to in order to increment
9710          * locinput by one character */
9711           increment_locinput:
9712             assert(!NEXTCHR_IS_EOS);
9713             if (utf8_target) {
9714                 locinput += PL_utf8skip[nextbyte];
9715                 /* locinput is allowed to go 1 char off the end (signifying
9716                  * EOS), but not 2+ */
9717                 if (locinput >  loceol)
9718                     sayNO;
9719             }
9720             else
9721                 locinput++;
9722             break;
9723 
9724         } /* end switch */
9725 
9726         /* switch break jumps here */
9727         scan = next; /* prepare to execute the next op and ... */
9728         continue;    /* ... jump back to the top, reusing st */
9729         /* NOTREACHED */
9730 
9731       push_yes_state:
9732         /* push a state that backtracks on success */
9733         st->u.yes.prev_yes_state = yes_state;
9734         yes_state = st;
9735         /* FALLTHROUGH */
9736       push_state:
9737         /* push a new regex state, then continue at scan  */
9738         {
9739             regmatch_state *newst;
9740             DECLARE_AND_GET_RE_DEBUG_FLAGS;
9741 
9742             DEBUG_r( /* DEBUG_STACK_r */
9743               if (DEBUG_v_TEST || RE_DEBUG_FLAG(RE_DEBUG_EXTRA_STACK)) {
9744                 regmatch_state *cur = st;
9745                 regmatch_state *curyes = yes_state;
9746                 U32 i;
9747                 regmatch_slab *slab = PL_regmatch_slab;
9748                 for (i = 0; i < 3 && i <= depth; cur--,i++) {
9749                     if (cur < SLAB_FIRST(slab)) {
9750                         slab = slab->prev;
9751                         cur = SLAB_LAST(slab);
9752                     }
9753                     Perl_re_exec_indentf( aTHX_ "%4s #%-3d %-10s %s\n",
9754                         depth,
9755                         i ? "    " : "push",
9756                         depth - i, PL_reg_name[cur->resume_state],
9757                         (curyes == cur) ? "yes" : ""
9758                     );
9759                     if (curyes == cur)
9760                         curyes = cur->u.yes.prev_yes_state;
9761                 }
9762             } else {
9763                 DEBUG_STATE_pp("push")
9764             });
9765             depth++;
9766             st->locinput = locinput;
9767             st->loceol = loceol;
9768             st->sr0 = script_run_begin;
9769             newst = st+1;
9770             if (newst >  SLAB_LAST(PL_regmatch_slab))
9771                 newst = S_push_slab(aTHX);
9772             PL_regmatch_state = newst;
9773 
9774             locinput = pushinput;
9775             loceol = pusheol;
9776             script_run_begin = pushsr0;
9777             st = newst;
9778             continue;
9779             /* NOTREACHED */
9780         }
9781     }
9782 #ifdef SOLARIS_BAD_OPTIMIZER
9783 #  undef PL_charclass
9784 #endif
9785 
9786     /*
9787     * We get here only if there's trouble -- normally "case END" is
9788     * the terminating point.
9789     */
9790     Perl_croak(aTHX_ "corrupted regexp pointers");
9791     NOT_REACHED; /* NOTREACHED */
9792 
9793   yes:
9794     if (yes_state) {
9795         /* we have successfully completed a subexpression, but we must now
9796          * pop to the state marked by yes_state and continue from there */
9797         assert(st != yes_state);
9798 #ifdef DEBUGGING
9799         while (st != yes_state) {
9800             st--;
9801             if (st < SLAB_FIRST(PL_regmatch_slab)) {
9802                 PL_regmatch_slab = PL_regmatch_slab->prev;
9803                 st = SLAB_LAST(PL_regmatch_slab);
9804             }
9805             DEBUG_STATE_r({
9806                 if (no_final) {
9807                     DEBUG_STATE_pp("pop (no final)");
9808                 } else {
9809                     DEBUG_STATE_pp("pop (yes)");
9810                 }
9811             });
9812             depth--;
9813         }
9814 #else
9815         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
9816             || yes_state > SLAB_LAST(PL_regmatch_slab))
9817         {
9818             /* not in this slab, pop slab */
9819             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
9820             PL_regmatch_slab = PL_regmatch_slab->prev;
9821             st = SLAB_LAST(PL_regmatch_slab);
9822         }
9823         depth -= (st - yes_state);
9824 #endif
9825         st = yes_state;
9826         yes_state = st->u.yes.prev_yes_state;
9827         PL_regmatch_state = st;
9828 
9829         if (no_final) {
9830             locinput= st->locinput;
9831             loceol= st->loceol;
9832             script_run_begin = st->sr0;
9833         }
9834         state_num = st->resume_state + no_final;
9835         goto reenter_switch;
9836     }
9837 
9838     DEBUG_EXECUTE_r(Perl_re_printf( aTHX_  "%sMatch successful!%s\n",
9839                           PL_colors[4], PL_colors[5]));
9840 
9841     if (reginfo->info_aux_eval) {
9842         /* each successfully executed (?{...}) block does the equivalent of
9843          *   local $^R = do {...}
9844          * When popping the save stack, all these locals would be undone;
9845          * bypass this by setting the outermost saved $^R to the latest
9846          * value */
9847         /* I dont know if this is needed or works properly now.
9848          * see code related to PL_replgv elsewhere in this file.
9849          * Yves
9850          */
9851         if (oreplsv != GvSV(PL_replgv)) {
9852             sv_setsv(oreplsv, GvSV(PL_replgv));
9853             SvSETMAGIC(oreplsv);
9854         }
9855     }
9856     result = 1;
9857     goto final_exit;
9858 
9859   no:
9860     DEBUG_EXECUTE_r(
9861         Perl_re_exec_indentf( aTHX_  "%sfailed...%s\n",
9862             depth,
9863             PL_colors[4], PL_colors[5])
9864         );
9865 
9866   no_silent:
9867     if (no_final) {
9868         if (yes_state) {
9869             goto yes;
9870         } else {
9871             goto final_exit;
9872         }
9873     }
9874     if (depth) {
9875         /* there's a previous state to backtrack to */
9876         st--;
9877         if (st < SLAB_FIRST(PL_regmatch_slab)) {
9878             PL_regmatch_slab = PL_regmatch_slab->prev;
9879             st = SLAB_LAST(PL_regmatch_slab);
9880         }
9881         PL_regmatch_state = st;
9882         locinput= st->locinput;
9883         loceol= st->loceol;
9884         script_run_begin = st->sr0;
9885 
9886         DEBUG_STATE_pp("pop");
9887         depth--;
9888         if (yes_state == st)
9889             yes_state = st->u.yes.prev_yes_state;
9890 
9891         state_num = st->resume_state + 1; /* failure = success + 1 */
9892         PERL_ASYNC_CHECK();
9893         goto reenter_switch;
9894     }
9895     result = 0;
9896 
9897   final_exit:
9898     if (rex->intflags & PREGf_VERBARG_SEEN) {
9899         SV *sv_err = get_sv("REGERROR", 1);
9900         SV *sv_mrk = get_sv("REGMARK", 1);
9901         if (result) {
9902             sv_commit = &PL_sv_no;
9903             if (!sv_yes_mark)
9904                 sv_yes_mark = &PL_sv_yes;
9905         } else {
9906             if (!sv_commit)
9907                 sv_commit = &PL_sv_yes;
9908             sv_yes_mark = &PL_sv_no;
9909         }
9910         assert(sv_err);
9911         assert(sv_mrk);
9912         sv_setsv(sv_err, sv_commit);
9913         sv_setsv(sv_mrk, sv_yes_mark);
9914     }
9915 
9916 
9917     if (last_pushed_cv) {
9918         dSP;
9919         /* see "Some notes about MULTICALL" above */
9920         POP_MULTICALL;
9921         PERL_UNUSED_VAR(SP);
9922     }
9923     else
9924         LEAVE_SCOPE(orig_savestack_ix);
9925 
9926     assert(!result ||  locinput - reginfo->strbeg >= 0);
9927     return result ?  locinput - reginfo->strbeg : -1;
9928 }
9929 
9930 /*
9931  - regrepeat - repeatedly match something simple, report how many
9932  *
9933  * What 'simple' means is a node which can be the operand of a quantifier like
9934  * '+', or {1,3}
9935  *
9936  * startposp - pointer to a pointer to the start position.  This is updated
9937  *             to point to the byte following the highest successful
9938  *             match.
9939  * p         - the regnode to be repeatedly matched against.
9940  * loceol    - pointer to the end position beyond which we aren't supposed to
9941  *             look.
9942  * reginfo   - struct holding match state, such as utf8_target
9943  * max       - maximum number of things to match.
9944  * depth     - (for debugging) backtracking depth.
9945  */
9946 STATIC I32
9947 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
9948             char * loceol, regmatch_info *const reginfo, I32 max _pDEPTH)
9949 {
9950     char *scan;     /* Pointer to current position in target string */
9951     I32 c;
9952     char *this_eol = loceol;   /* potentially adjusted version. */
9953     I32 hardcount = 0;  /* How many matches so far */
9954     bool utf8_target = reginfo->is_utf8_target;
9955     unsigned int to_complement = 0;  /* Invert the result? */
9956     _char_class_number classnum;
9957 
9958     PERL_ARGS_ASSERT_REGREPEAT;
9959 
9960     /* This routine is structured so that we switch on the input OP.  Each OP
9961      * case: statement contains a loop to repeatedly apply the OP, advancing
9962      * the input until it fails, or reaches the end of the input, or until it
9963      * reaches the upper limit of matches. */
9964 
9965     scan = *startposp;
9966     if (max == REG_INFTY)   /* This is a special marker to go to the platform's
9967                                max */
9968         max = I32_MAX;
9969     else if (! utf8_target && this_eol - scan > max)
9970         this_eol = scan + max;
9971 
9972     /* Here, for the case of a non-UTF-8 target we have adjusted <this_eol>
9973      * down to the maximum of how far we should go in it (but leaving it set to
9974      * the real end if the maximum permissible would take us beyond that).
9975      * This allows us to make the loop exit condition that we haven't gone past
9976      * <this_eol> to also mean that we haven't exceeded the max permissible
9977      * count, saving a test each time through the loop.  But it assumes that
9978      * the OP matches a single byte, which is true for most of the OPs below
9979      * when applied to a non-UTF-8 target.  Those relatively few OPs that don't
9980      * have this characteristic have to compensate.
9981      *
9982      * There is no such adjustment for UTF-8 targets, since the number of bytes
9983      * per character can vary.  OPs will have to test both that the count is
9984      * less than the max permissible (using <hardcount> to keep track), and
9985      * that we are still within the bounds of the string (using <this_eol>.  A
9986      * few OPs match a single byte no matter what the encoding.  They can omit
9987      * the max test if, for the UTF-8 case, they do the adjustment that was
9988      * skipped above.
9989      *
9990      * Thus, the code above sets things up for the common case; and exceptional
9991      * cases need extra work; the common case is to make sure <scan> doesn't go
9992      * past <this_eol>, and for UTF-8 to also use <hardcount> to make sure the
9993      * count doesn't exceed the maximum permissible */
9994 
9995     switch (with_t_UTF8ness(OP(p), utf8_target)) {
9996       case REG_ANY_t8:
9997         while (scan < this_eol && hardcount < max && *scan != '\n') {
9998             scan += UTF8SKIP(scan);
9999             hardcount++;
10000         }
10001         break;
10002 
10003       case REG_ANY_tb:
10004         scan = (char *) memchr(scan, '\n', this_eol - scan);
10005         if (! scan) {
10006             scan = this_eol;
10007         }
10008         break;
10009 
10010       case SANY_t8:
10011         while (scan < this_eol && hardcount < max) {
10012             scan += UTF8SKIP(scan);
10013             hardcount++;
10014         }
10015         break;
10016 
10017       case SANY_tb:
10018         scan = this_eol;
10019         break;
10020 
10021       case EXACT_REQ8_tb:
10022       case LEXACT_REQ8_tb:
10023       case EXACTFU_REQ8_tb:
10024         break;
10025 
10026       case EXACTL_t8:
10027         if (UTF8_IS_ABOVE_LATIN1(*scan)) {
10028             _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(scan, loceol);
10029         }
10030         /* FALLTHROUGH */
10031 
10032       case EXACTL_tb:
10033       case EXACTFL_t8:
10034       case EXACTFL_tb:
10035       case EXACTFLU8_t8:
10036       case EXACTFLU8_tb:
10037         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
10038         /* FALLTHROUGH */
10039 
10040       case EXACT_REQ8_t8:
10041       case LEXACT_REQ8_t8:
10042       case EXACTFU_REQ8_t8:
10043       case LEXACT_t8:
10044       case LEXACT_tb:
10045       case EXACT_t8:
10046       case EXACT_tb:
10047       case EXACTF_t8:
10048       case EXACTF_tb:
10049       case EXACTFAA_NO_TRIE_t8:
10050       case EXACTFAA_NO_TRIE_tb:
10051       case EXACTFAA_t8:
10052       case EXACTFAA_tb:
10053       case EXACTFU_t8:
10054       case EXACTFU_tb:
10055       case EXACTFUP_t8:
10056       case EXACTFUP_tb:
10057 
10058       {
10059         struct next_matchable_info Binfo;
10060         PERL_UINT_FAST8_T definitive_len;
10061 
10062         assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
10063 
10064         /* Set up termination info, and quit if we can rule out that we've
10065          * gotten a match of the termination criteria */
10066         if (   ! S_setup_EXACTISH_ST(aTHX_ p, &Binfo, reginfo)
10067             ||   scan + Binfo.min_length > this_eol
10068             || ! S_test_EXACTISH_ST(scan, Binfo))
10069         {
10070             break;
10071         }
10072 
10073         definitive_len = Binfo.initial_definitive;
10074 
10075         /* Here there are potential matches, and the first byte(s) matched our
10076          * filter
10077          *
10078          * If we got a definitive match of some initial bytes, there is no
10079          * possibility of false positives as far as it got */
10080         if (definitive_len > 0) {
10081 
10082             /* If as far as it got is the maximum possible, there were no false
10083              * positives at all.  Since we have everything set up, see how many
10084              * repeats there are. */
10085             if (definitive_len >= Binfo.max_length) {
10086 
10087                 /* We've already found one match */
10088                 scan += definitive_len;
10089                 hardcount++;
10090 
10091                 /* If want more than the one match, and there is room for more,
10092                  * see if there are any */
10093                 if (hardcount < max && scan + definitive_len <= this_eol) {
10094 
10095                     /* If the character is only a single byte long, just span
10096                      * all such bytes. */
10097                     if (definitive_len == 1) {
10098                         const char * orig_scan = scan;
10099 
10100                         if (this_eol - (scan - hardcount) > max) {
10101                             this_eol = scan - hardcount + max;
10102                         }
10103 
10104                         /* Use different routines depending on whether it's an
10105                          * exact match or matches with a mask */
10106                         if (Binfo.initial_exact == 1) {
10107                             scan = (char *) find_span_end((U8 *) scan,
10108                                                           (U8 *) this_eol,
10109                                                           Binfo.matches[0]);
10110                         }
10111                         else {
10112                             scan = (char *) find_span_end_mask(
10113                                                        (U8 *) scan,
10114                                                        (U8 *) this_eol,
10115                                                        Binfo.first_byte_anded,
10116                                                        Binfo.first_byte_mask);
10117                         }
10118 
10119                         hardcount += scan - orig_scan;
10120                     }
10121                     else { /* Here, the full character definitive match is more
10122                               than one byte */
10123                         while (   hardcount < max
10124                                && scan + definitive_len <= this_eol
10125                                && S_test_EXACTISH_ST(scan, Binfo))
10126                         {
10127                                 scan += definitive_len;
10128                                 hardcount++;
10129                         }
10130                     }
10131                 }
10132 
10133                 break;
10134             }   /* End of a full character is definitively matched */
10135 
10136             /* Here, an initial portion of the character matched definitively,
10137              * and the rest matched as well, but could have false positives */
10138 
10139             do {
10140                 PERL_INT_FAST8_T i;
10141                 U8 * matches = Binfo.matches;
10142 
10143                 /* The first bytes were definitive.  Look at the remaining */
10144                 for (i = 0; i < Binfo.count; i++) {
10145                     if (memEQ(scan + definitive_len,
10146                               matches + definitive_len,
10147                               Binfo.lengths[i] - definitive_len))
10148                     {
10149                         goto found_a_completion;
10150                     }
10151 
10152                     matches += Binfo.lengths[i];
10153                 }
10154 
10155                 /* Didn't find anything to complete our initial match.  Stop
10156                  * here */
10157                 break;
10158 
10159               found_a_completion:
10160 
10161                 /* Here, matched a full character, Include it in the result,
10162                  * and then look to see if the next char matches */
10163                 hardcount++;
10164                 scan += Binfo.lengths[i];
10165 
10166             } while (   hardcount < max
10167                      && scan + definitive_len < this_eol
10168                      && S_test_EXACTISH_ST(scan, Binfo));
10169 
10170             /* Here, have advanced as far as possible */
10171             break;
10172         } /* End of found some initial bytes that definitively matched */
10173 
10174         /* Here, we can't rule out that we have found the beginning of 'B', but
10175          * there were no initial bytes that could rule out anything
10176          * definitively. Use brute force to examine all the possibilities */
10177         while (scan < this_eol && hardcount < max) {
10178             PERL_INT_FAST8_T i;
10179             U8 * matches = Binfo.matches;
10180 
10181             for (i = 0; i < Binfo.count; i++) {
10182                 if (memEQ(scan, matches, Binfo.lengths[i])) {
10183                     goto found1;
10184                 }
10185 
10186                 matches += Binfo.lengths[i];
10187             }
10188 
10189             break;
10190 
10191           found1:
10192             hardcount++;
10193             scan += Binfo.lengths[i];
10194         }
10195 
10196         break;
10197       }
10198 
10199       case ANYOFPOSIXL_t8:
10200       case ANYOFL_t8:
10201         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
10202          CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(p);
10203 
10204         /* FALLTHROUGH */
10205       case ANYOFD_t8:
10206       case ANYOF_t8:
10207         while (   hardcount < max
10208                && scan < this_eol
10209                && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
10210         {
10211             scan += UTF8SKIP(scan);
10212             hardcount++;
10213         }
10214         break;
10215 
10216       case ANYOFPOSIXL_tb:
10217       case ANYOFL_tb:
10218         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
10219          CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(p);
10220         /* FALLTHROUGH */
10221 
10222       case ANYOFD_tb:
10223       case ANYOF_tb:
10224         if (ANYOF_FLAGS(p) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
10225             while (   scan < this_eol
10226                    && reginclass(prog, p, (U8*)scan, (U8*)scan+1, 0))
10227                 scan++;
10228         }
10229         else {
10230             while (scan < this_eol && ANYOF_BITMAP_TEST(p, *((U8*)scan)))
10231                 scan++;
10232         }
10233         break;
10234 
10235       case ANYOFM_t8:
10236         if (this_eol - scan > max) {
10237 
10238             /* We didn't adjust <this_eol> at the beginning of this routine
10239              * because is UTF-8, but it is actually ok to do so, since here, to
10240              * match, 1 char == 1 byte. */
10241             this_eol = scan + max;
10242         }
10243         /* FALLTHROUGH */
10244 
10245       case ANYOFM_tb:
10246         scan = (char *) find_span_end_mask((U8 *) scan, (U8 *) this_eol,
10247                                            (U8) ARG(p), FLAGS(p));
10248         break;
10249 
10250       case NANYOFM_t8:
10251         while (     hardcount < max
10252                &&   scan < this_eol
10253                &&  (*scan & FLAGS(p)) != ARG(p))
10254         {
10255             scan += UTF8SKIP(scan);
10256             hardcount++;
10257         }
10258         break;
10259 
10260       case NANYOFM_tb:
10261         scan = (char *) find_next_masked((U8 *) scan, (U8 *) this_eol,
10262                                          (U8) ARG(p), FLAGS(p));
10263         break;
10264 
10265       case ANYOFH_tb: /* ANYOFH only can match UTF-8 targets */
10266       case ANYOFHb_tb:
10267       case ANYOFHr_tb:
10268       case ANYOFHs_tb:
10269         break;
10270 
10271       case ANYOFH_t8:
10272         while (  hardcount < max
10273                && scan < this_eol
10274                && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p)
10275                && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
10276         {
10277             scan += UTF8SKIP(scan);
10278             hardcount++;
10279         }
10280         break;
10281 
10282       case ANYOFHb_t8:
10283         /* we know the first byte must be the FLAGS field */
10284         while (   hardcount < max
10285                && scan < this_eol
10286                && (U8) *scan == ANYOF_FLAGS(p)
10287                && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
10288         {
10289             scan += UTF8SKIP(scan);
10290             hardcount++;
10291         }
10292         break;
10293 
10294       case ANYOFHr_t8:
10295         while (  hardcount < max
10296                && scan < this_eol
10297                && inRANGE(NATIVE_UTF8_TO_I8(*scan),
10298                           LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p)),
10299                           HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p)))
10300                && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p)
10301                && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
10302         {
10303             scan += UTF8SKIP(scan);
10304             hardcount++;
10305         }
10306         break;
10307 
10308       case ANYOFHs_t8:
10309         while (   hardcount < max
10310                && scan + FLAGS(p) < this_eol
10311                && memEQ(scan, ((struct regnode_anyofhs *) p)->string, FLAGS(p))
10312                && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
10313         {
10314             scan += UTF8SKIP(scan);
10315             hardcount++;
10316         }
10317         break;
10318 
10319       case ANYOFR_t8:
10320         while (   hardcount < max
10321                && scan < this_eol
10322                && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p)
10323                && withinCOUNT(utf8_to_uvchr_buf((U8 *) scan,
10324                                             (U8 *) this_eol,
10325                                             NULL),
10326                               ANYOFRbase(p), ANYOFRdelta(p)))
10327         {
10328             scan += UTF8SKIP(scan);
10329             hardcount++;
10330         }
10331         break;
10332 
10333       case ANYOFR_tb:
10334         while (   hardcount < max
10335                && scan < this_eol
10336                && withinCOUNT((U8) *scan, ANYOFRbase(p), ANYOFRdelta(p)))
10337         {
10338             scan++;
10339             hardcount++;
10340         }
10341         break;
10342 
10343       case ANYOFRb_t8:
10344         while (   hardcount < max
10345                && scan < this_eol
10346                && (U8) *scan == ANYOF_FLAGS(p)
10347                && withinCOUNT(utf8_to_uvchr_buf((U8 *) scan,
10348                                             (U8 *) this_eol,
10349                                             NULL),
10350                               ANYOFRbase(p), ANYOFRdelta(p)))
10351         {
10352             scan += UTF8SKIP(scan);
10353             hardcount++;
10354         }
10355         break;
10356 
10357       case ANYOFRb_tb:
10358         while (   hardcount < max
10359                && scan < this_eol
10360                && withinCOUNT((U8) *scan, ANYOFRbase(p), ANYOFRdelta(p)))
10361         {
10362             scan++;
10363             hardcount++;
10364         }
10365         break;
10366 
10367     /* The argument (FLAGS) to all the POSIX node types is the class number */
10368 
10369       case NPOSIXL_tb:
10370         to_complement = 1;
10371         /* FALLTHROUGH */
10372 
10373       case POSIXL_tb:
10374         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
10375         while (   scan < this_eol
10376                && to_complement ^ cBOOL(isFOO_lc(FLAGS(p), *scan)))
10377         {
10378             scan++;
10379         }
10380         break;
10381 
10382       case NPOSIXL_t8:
10383         to_complement = 1;
10384         /* FALLTHROUGH */
10385 
10386       case POSIXL_t8:
10387         while (   hardcount < max && scan < this_eol
10388                && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
10389                                                       (U8 *) scan,
10390                                                       (U8 *) this_eol)))
10391         {
10392             scan += UTF8SKIP(scan);
10393             hardcount++;
10394         }
10395         break;
10396 
10397       case POSIXD_tb:
10398         /* FALLTHROUGH */
10399 
10400       case POSIXA_t8:
10401         if (this_eol - scan > max) {
10402 
10403             /* We didn't adjust <this_eol> at the beginning of this routine
10404              * because is UTF-8, but it is actually ok to do so, since here, to
10405              * match, 1 char == 1 byte. */
10406             this_eol = scan + max;
10407         }
10408         /* FALLTHROUGH */
10409 
10410       case POSIXA_tb:
10411         while (scan < this_eol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
10412             scan++;
10413         }
10414         break;
10415 
10416       case NPOSIXD_tb:
10417         /* FALLTHROUGH */
10418 
10419       case NPOSIXA_tb:
10420         while (scan < this_eol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
10421             scan++;
10422         }
10423         break;
10424 
10425       case NPOSIXA_t8:
10426 
10427         /* The complement of something that matches only ASCII matches all
10428          * non-ASCII, plus everything in ASCII that isn't in the class. */
10429         while (   hardcount < max && scan < this_eol
10430                && (   ! isASCII_utf8_safe(scan, loceol)
10431                    || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
10432             {
10433                 scan += UTF8SKIP(scan);
10434                 hardcount++;
10435             }
10436         break;
10437 
10438       case NPOSIXU_tb:
10439         to_complement = 1;
10440         /* FALLTHROUGH */
10441 
10442       case POSIXU_tb:
10443         while (   scan < this_eol
10444                && to_complement ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
10445         {
10446             scan++;
10447         }
10448         break;
10449 
10450       case NPOSIXU_t8:
10451       case NPOSIXD_t8:
10452         to_complement = 1;
10453         /* FALLTHROUGH */
10454 
10455       case POSIXD_t8:
10456       case POSIXU_t8:
10457         classnum = (_char_class_number) FLAGS(p);
10458         switch (classnum) {
10459           default:
10460             while (   hardcount < max && scan < this_eol
10461                    && to_complement
10462                     ^ cBOOL(_invlist_contains_cp(PL_XPosix_ptrs[classnum],
10463                        utf8_to_uvchr_buf((U8 *) scan, (U8 *) this_eol, NULL))))
10464             {
10465                 scan += UTF8SKIP(scan);
10466                 hardcount++;
10467             }
10468             break;
10469 
10470             /* For the classes below, the knowledge of how to handle every code
10471              * point is compiled into Perl via a macro.  This code is written
10472              * for making the loops as tight as possible.  It could be
10473              * refactored to save space instead. */
10474 
10475           case _CC_ENUM_SPACE:
10476             while (   hardcount < max
10477                    && scan < this_eol
10478                    && (to_complement
10479                                    ^ cBOOL(isSPACE_utf8_safe(scan, this_eol))))
10480             {
10481                 scan += UTF8SKIP(scan);
10482                 hardcount++;
10483             }
10484             break;
10485           case _CC_ENUM_BLANK:
10486             while (   hardcount < max
10487                    && scan < this_eol
10488                    && (to_complement
10489                                 ^ cBOOL(isBLANK_utf8_safe(scan, this_eol))))
10490             {
10491                 scan += UTF8SKIP(scan);
10492                 hardcount++;
10493             }
10494             break;
10495           case _CC_ENUM_XDIGIT:
10496             while (   hardcount < max
10497                    && scan < this_eol
10498                    && (to_complement
10499                                ^ cBOOL(isXDIGIT_utf8_safe(scan, this_eol))))
10500             {
10501                 scan += UTF8SKIP(scan);
10502                 hardcount++;
10503             }
10504             break;
10505           case _CC_ENUM_VERTSPACE:
10506             while (   hardcount < max
10507                    && scan < this_eol
10508                    && (to_complement
10509                                ^ cBOOL(isVERTWS_utf8_safe(scan, this_eol))))
10510             {
10511                 scan += UTF8SKIP(scan);
10512                 hardcount++;
10513             }
10514             break;
10515           case _CC_ENUM_CNTRL:
10516             while (   hardcount < max
10517                    && scan < this_eol
10518                    && (to_complement
10519                                ^ cBOOL(isCNTRL_utf8_safe(scan, this_eol))))
10520             {
10521                 scan += UTF8SKIP(scan);
10522                 hardcount++;
10523             }
10524             break;
10525         }
10526         break;
10527 
10528       case LNBREAK_t8:
10529         while (    hardcount < max && scan < this_eol
10530                && (c=is_LNBREAK_utf8_safe(scan, this_eol)))
10531         {
10532             scan += c;
10533             hardcount++;
10534         }
10535         break;
10536 
10537       case LNBREAK_tb:
10538         /* LNBREAK can match one or two latin chars, which is ok, but we have
10539          * to use hardcount in this situation, and throw away the adjustment to
10540          * <this_eol> done before the switch statement */
10541         while (
10542             hardcount < max && scan < loceol
10543             && (c = is_LNBREAK_latin1_safe(scan, loceol))
10544         ) {
10545             scan += c;
10546             hardcount++;
10547         }
10548         break;
10549 
10550     default:
10551         Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized"
10552                          " node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
10553         NOT_REACHED; /* NOTREACHED */
10554 
10555     }
10556 
10557     if (hardcount)
10558         c = hardcount;
10559     else
10560         c = scan - *startposp;
10561     *startposp = scan;
10562 
10563     DEBUG_r({
10564         DECLARE_AND_GET_RE_DEBUG_FLAGS;
10565         DEBUG_EXECUTE_r({
10566             SV * const prop = sv_newmortal();
10567             regprop(prog, prop, p, reginfo, NULL);
10568             Perl_re_exec_indentf( aTHX_
10569                         "%s can match %" IVdf " times out of %" IVdf "...\n",
10570                         depth, SvPVX_const(prop),(IV)c,(IV)max);
10571         });
10572     });
10573 
10574     return(c);
10575 }
10576 
10577 /*
10578  - reginclass - determine if a character falls into a character class
10579 
10580   n is the ANYOF-type regnode
10581   p is the target string
10582   p_end points to one byte beyond the end of the target string
10583   utf8_target tells whether p is in UTF-8.
10584 
10585   Returns true if matched; false otherwise.
10586 
10587   Note that this can be a synthetic start class, a combination of various
10588   nodes, so things you think might be mutually exclusive, such as locale,
10589   aren't.  It can match both locale and non-locale
10590 
10591  */
10592 
10593 STATIC bool
10594 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
10595 {
10596     const char flags = (inRANGE(OP(n), ANYOFH, ANYOFHs))
10597                         ? 0
10598                         : ANYOF_FLAGS(n);
10599     bool match = FALSE;
10600     UV c = *p;
10601 
10602     PERL_ARGS_ASSERT_REGINCLASS;
10603 
10604     /* If c is not already the code point, get it.  Note that
10605      * UTF8_IS_INVARIANT() works even if not in UTF-8 */
10606     if (! UTF8_IS_INVARIANT(c) && utf8_target) {
10607         STRLEN c_len = 0;
10608         const U32 utf8n_flags = UTF8_ALLOW_DEFAULT;
10609         c = utf8n_to_uvchr(p, p_end - p, &c_len, utf8n_flags | UTF8_CHECK_ONLY);
10610         if (c_len == (STRLEN)-1) {
10611             _force_out_malformed_utf8_message(p, p_end,
10612                                               utf8n_flags,
10613                                               1 /* 1 means die */ );
10614             NOT_REACHED; /* NOTREACHED */
10615         }
10616         if (     c > 255
10617             &&  (OP(n) == ANYOFL || OP(n) == ANYOFPOSIXL)
10618             && ! ANYOFL_UTF8_LOCALE_REQD(flags))
10619         {
10620             _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c);
10621         }
10622     }
10623 
10624     /* If this character is potentially in the bitmap, check it */
10625     if (c < NUM_ANYOF_CODE_POINTS && ! inRANGE(OP(n), ANYOFH, ANYOFHb)) {
10626         if (ANYOF_BITMAP_TEST(n, c))
10627             match = TRUE;
10628         else if ((flags
10629                 & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
10630                   && OP(n) == ANYOFD
10631                   && ! utf8_target
10632                   && ! isASCII(c))
10633         {
10634             match = TRUE;
10635         }
10636         else if (flags & ANYOF_LOCALE_FLAGS) {
10637             if (  (flags & ANYOFL_FOLD)
10638                 && c < 256
10639                 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
10640             {
10641                 match = TRUE;
10642             }
10643             else if (   ANYOF_POSIXL_TEST_ANY_SET(n)
10644                      && c <= U8_MAX  /* param to isFOO_lc() */
10645             ) {
10646                 /* The data structure is arranged so bits 0, 2, 4, ... are set
10647                  * if the class includes the Posix character class given by
10648                  * bit/2; and 1, 3, 5, ... are set if the class includes the
10649                  * complemented Posix class given by int(bit/2), so the
10650                  * remainder modulo 2 tells us if to complement or not.
10651                  *
10652                  * Note that this code assumes that all the classes are closed
10653                  * under folding.  For example, if a character matches \w, then
10654                  * its fold does too; and vice versa.  This should be true for
10655                  * any well-behaved locale for all the currently defined Posix
10656                  * classes, except for :lower: and :upper:, which are handled
10657                  * by the pseudo-class :cased: which matches if either of the
10658                  * other two does.  To get rid of this assumption, an outer
10659                  * loop could be used below to iterate over both the source
10660                  * character, and its fold (if different) */
10661 
10662                 U32 posixl_bits = ANYOF_POSIXL_BITMAP(n);
10663 
10664                 do {
10665                     /* Find the next set bit indicating a class to try matching
10666                      * against */
10667                     U8 bit_pos = lsbit_pos32(posixl_bits);
10668 
10669                     if (bit_pos % 2 ^ cBOOL(isFOO_lc(bit_pos/2, (U8) c))) {
10670                         match = TRUE;
10671                         break;
10672                     }
10673 
10674                     /* Remove this class from consideration; repeat */
10675                     POSIXL_CLEAR(posixl_bits, bit_pos);
10676                 } while(posixl_bits != 0);
10677             }
10678         }
10679     }
10680 
10681 
10682     /* If the bitmap didn't (or couldn't) match, and something outside the
10683      * bitmap could match, try that. */
10684     if (!match) {
10685         if (c >= NUM_ANYOF_CODE_POINTS
10686             && (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP))
10687         {
10688             match = TRUE;	/* Everything above the bitmap matches */
10689         }
10690             /* Here doesn't match everything above the bitmap.  If there is
10691              * some information available beyond the bitmap, we may find a
10692              * match in it.  If so, this is most likely because the code point
10693              * is outside the bitmap range.  But rarely, it could be because of
10694              * some other reason.  If so, various flags are set to indicate
10695              * this possibility.  On ANYOFD nodes, there may be matches that
10696              * happen only when the target string is UTF-8; or for other node
10697              * types, because runtime lookup is needed, regardless of the
10698              * UTF-8ness of the target string.  Finally, under /il, there may
10699              * be some matches only possible if the locale is a UTF-8 one. */
10700         else if (    ARG(n) != ANYOF_ONLY_HAS_BITMAP
10701                  && (   c >= NUM_ANYOF_CODE_POINTS
10702                      || (   (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
10703                          && (   UNLIKELY(OP(n) != ANYOFD)
10704                              || (utf8_target && ! isASCII_uni(c)
10705 #                               if NUM_ANYOF_CODE_POINTS > 256
10706                                                                  && c < 256
10707 #                               endif
10708                                 )))
10709                      || (   ANYOFL_SOME_FOLDS_ONLY_IN_UTF8_LOCALE(flags)
10710                          && IN_UTF8_CTYPE_LOCALE)))
10711         {
10712             SV* only_utf8_locale = NULL;
10713             SV * const definition =
10714 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
10715                 get_regclass_nonbitmap_data(prog, n, TRUE, 0,
10716                                             &only_utf8_locale, NULL);
10717 #else
10718                 get_re_gclass_nonbitmap_data(prog, n, TRUE, 0,
10719                                              &only_utf8_locale, NULL);
10720 #endif
10721             if (definition) {
10722                 U8 utf8_buffer[2];
10723                 U8 * utf8_p;
10724                 if (utf8_target) {
10725                     utf8_p = (U8 *) p;
10726                 } else { /* Convert to utf8 */
10727                     utf8_p = utf8_buffer;
10728                     append_utf8_from_native_byte(*p, &utf8_p);
10729                     utf8_p = utf8_buffer;
10730                 }
10731 
10732                 /* Turkish locales have these hard-coded rules overriding
10733                  * normal ones */
10734                 if (   UNLIKELY(PL_in_utf8_turkic_locale)
10735                     && isALPHA_FOLD_EQ(*p, 'i'))
10736                 {
10737                     if (*p == 'i') {
10738                         if (_invlist_contains_cp(definition,
10739                                        LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
10740                         {
10741                             match = TRUE;
10742                         }
10743                     }
10744                     else if (*p == 'I') {
10745                         if (_invlist_contains_cp(definition,
10746                                                 LATIN_SMALL_LETTER_DOTLESS_I))
10747                         {
10748                             match = TRUE;
10749                         }
10750                     }
10751                 }
10752                 else if (_invlist_contains_cp(definition, c)) {
10753                     match = TRUE;
10754                 }
10755             }
10756             if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) {
10757                 match = _invlist_contains_cp(only_utf8_locale, c);
10758             }
10759         }
10760 
10761         /* In a Turkic locale under folding, hard-code the I i case pair
10762          * matches */
10763         if (     UNLIKELY(PL_in_utf8_turkic_locale)
10764             && ! match
10765             &&   (flags & ANYOFL_FOLD)
10766             &&   utf8_target)
10767         {
10768             if (c == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
10769                 if (ANYOF_BITMAP_TEST(n, 'i')) {
10770                     match = TRUE;
10771                 }
10772             }
10773             else if (c == LATIN_SMALL_LETTER_DOTLESS_I) {
10774                 if (ANYOF_BITMAP_TEST(n, 'I')) {
10775                     match = TRUE;
10776                 }
10777             }
10778         }
10779 
10780         if (UNICODE_IS_SUPER(c)
10781             && (flags
10782                & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
10783             && OP(n) != ANYOFD
10784             && ckWARN_d(WARN_NON_UNICODE))
10785         {
10786             Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
10787                 "Matched non-Unicode code point 0x%04" UVXf " against Unicode property; may not be portable", c);
10788         }
10789     }
10790 
10791 #if ANYOF_INVERT != 1
10792     /* Depending on compiler optimization cBOOL takes time, so if don't have to
10793      * use it, don't */
10794 #   error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below,
10795 #endif
10796 
10797     /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
10798     return (flags & ANYOF_INVERT) ^ match;
10799 }
10800 
10801 STATIC U8 *
10802 S_reghop3(U8 *s, SSize_t off, const U8* lim)
10803 {
10804     /* return the position 'off' UTF-8 characters away from 's', forward if
10805      * 'off' >= 0, backwards if negative.  But don't go outside of position
10806      * 'lim', which better be < s  if off < 0 */
10807 
10808     PERL_ARGS_ASSERT_REGHOP3;
10809 
10810     if (off >= 0) {
10811         while (off-- && s < lim) {
10812             /* XXX could check well-formedness here */
10813             U8 *new_s = s + UTF8SKIP(s);
10814             if (new_s > lim) /* lim may be in the middle of a long character */
10815                 return s;
10816             s = new_s;
10817         }
10818     }
10819     else {
10820         while (off++ && s > lim) {
10821             s--;
10822             if (UTF8_IS_CONTINUED(*s)) {
10823                 while (s > lim && UTF8_IS_CONTINUATION(*s))
10824                     s--;
10825                 if (! UTF8_IS_START(*s)) {
10826                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
10827                 }
10828             }
10829             /* XXX could check well-formedness here */
10830         }
10831     }
10832     return s;
10833 }
10834 
10835 STATIC U8 *
10836 S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
10837 {
10838     PERL_ARGS_ASSERT_REGHOP4;
10839 
10840     if (off >= 0) {
10841         while (off-- && s < rlim) {
10842             /* XXX could check well-formedness here */
10843             s += UTF8SKIP(s);
10844         }
10845     }
10846     else {
10847         while (off++ && s > llim) {
10848             s--;
10849             if (UTF8_IS_CONTINUED(*s)) {
10850                 while (s > llim && UTF8_IS_CONTINUATION(*s))
10851                     s--;
10852                 if (! UTF8_IS_START(*s)) {
10853                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
10854                 }
10855             }
10856             /* XXX could check well-formedness here */
10857         }
10858     }
10859     return s;
10860 }
10861 
10862 /* like reghop3, but returns NULL on overrun, rather than returning last
10863  * char pos */
10864 
10865 STATIC U8 *
10866 S_reghopmaybe3(U8* s, SSize_t off, const U8* const lim)
10867 {
10868     PERL_ARGS_ASSERT_REGHOPMAYBE3;
10869 
10870     if (off >= 0) {
10871         while (off-- && s < lim) {
10872             /* XXX could check well-formedness here */
10873             s += UTF8SKIP(s);
10874         }
10875         if (off >= 0)
10876             return NULL;
10877     }
10878     else {
10879         while (off++ && s > lim) {
10880             s--;
10881             if (UTF8_IS_CONTINUED(*s)) {
10882                 while (s > lim && UTF8_IS_CONTINUATION(*s))
10883                     s--;
10884                 if (! UTF8_IS_START(*s)) {
10885                     Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
10886                 }
10887             }
10888             /* XXX could check well-formedness here */
10889         }
10890         if (off <= 0)
10891             return NULL;
10892     }
10893     return s;
10894 }
10895 
10896 
10897 /* when executing a regex that may have (?{}), extra stuff needs setting
10898    up that will be visible to the called code, even before the current
10899    match has finished. In particular:
10900 
10901    * $_ is localised to the SV currently being matched;
10902    * pos($_) is created if necessary, ready to be updated on each call-out
10903      to code;
10904    * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
10905      isn't set until the current pattern is successfully finished), so that
10906      $1 etc of the match-so-far can be seen;
10907    * save the old values of subbeg etc of the current regex, and  set then
10908      to the current string (again, this is normally only done at the end
10909      of execution)
10910 */
10911 
10912 static void
10913 S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
10914 {
10915     MAGIC *mg;
10916     regexp *const rex = ReANY(reginfo->prog);
10917     regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
10918 
10919     eval_state->rex = rex;
10920     eval_state->sv  = reginfo->sv;
10921 
10922     if (reginfo->sv) {
10923         /* Make $_ available to executed code. */
10924         if (reginfo->sv != DEFSV) {
10925             SAVE_DEFSV;
10926             DEFSV_set(reginfo->sv);
10927         }
10928         /* will be dec'd by S_cleanup_regmatch_info_aux */
10929         SvREFCNT_inc_NN(reginfo->sv);
10930 
10931         if (!(mg = mg_find_mglob(reginfo->sv))) {
10932             /* prepare for quick setting of pos */
10933             mg = sv_magicext_mglob(reginfo->sv);
10934             mg->mg_len = -1;
10935         }
10936         eval_state->pos_magic = mg;
10937         eval_state->pos       = mg->mg_len;
10938         eval_state->pos_flags = mg->mg_flags;
10939     }
10940     else
10941         eval_state->pos_magic = NULL;
10942 
10943     if (!PL_reg_curpm) {
10944         /* PL_reg_curpm is a fake PMOP that we can attach the current
10945          * regex to and point PL_curpm at, so that $1 et al are visible
10946          * within a /(?{})/. It's just allocated once per interpreter the
10947          * first time its needed */
10948         Newxz(PL_reg_curpm, 1, PMOP);
10949 #ifdef USE_ITHREADS
10950         {
10951             SV* const repointer = &PL_sv_undef;
10952             /* this regexp is also owned by the new PL_reg_curpm, which
10953                will try to free it.  */
10954             av_push(PL_regex_padav, repointer);
10955             PL_reg_curpm->op_pmoffset = av_top_index(PL_regex_padav);
10956             PL_regex_pad = AvARRAY(PL_regex_padav);
10957         }
10958 #endif
10959     }
10960     SET_reg_curpm(reginfo->prog);
10961     eval_state->curpm = PL_curpm;
10962     PL_curpm_under = PL_curpm;
10963     PL_curpm = PL_reg_curpm;
10964     if (RXp_MATCH_COPIED(rex)) {
10965         /*  Here is a serious problem: we cannot rewrite subbeg,
10966             since it may be needed if this match fails.  Thus
10967             $` inside (?{}) could fail... */
10968         eval_state->subbeg     = rex->subbeg;
10969         eval_state->sublen     = rex->sublen;
10970         eval_state->suboffset  = rex->suboffset;
10971         eval_state->subcoffset = rex->subcoffset;
10972 #ifdef PERL_ANY_COW
10973         eval_state->saved_copy = rex->saved_copy;
10974 #endif
10975         RXp_MATCH_COPIED_off(rex);
10976     }
10977     else
10978         eval_state->subbeg = NULL;
10979     rex->subbeg = (char *)reginfo->strbeg;
10980     rex->suboffset = 0;
10981     rex->subcoffset = 0;
10982     rex->sublen = reginfo->strend - reginfo->strbeg;
10983 }
10984 
10985 
10986 /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
10987 
10988 static void
10989 S_cleanup_regmatch_info_aux(pTHX_ void *arg)
10990 {
10991     regmatch_info_aux *aux = (regmatch_info_aux *) arg;
10992     regmatch_info_aux_eval *eval_state =  aux->info_aux_eval;
10993     regmatch_slab *s;
10994 
10995     Safefree(aux->poscache);
10996 
10997     if (eval_state) {
10998 
10999         /* undo the effects of S_setup_eval_state() */
11000 
11001         if (eval_state->subbeg) {
11002             regexp * const rex = eval_state->rex;
11003             rex->subbeg     = eval_state->subbeg;
11004             rex->sublen     = eval_state->sublen;
11005             rex->suboffset  = eval_state->suboffset;
11006             rex->subcoffset = eval_state->subcoffset;
11007 #ifdef PERL_ANY_COW
11008             rex->saved_copy = eval_state->saved_copy;
11009 #endif
11010             RXp_MATCH_COPIED_on(rex);
11011         }
11012         if (eval_state->pos_magic)
11013         {
11014             eval_state->pos_magic->mg_len = eval_state->pos;
11015             eval_state->pos_magic->mg_flags =
11016                  (eval_state->pos_magic->mg_flags & ~MGf_BYTES)
11017                | (eval_state->pos_flags & MGf_BYTES);
11018         }
11019 
11020         PL_curpm = eval_state->curpm;
11021         SvREFCNT_dec(eval_state->sv);
11022     }
11023 
11024     PL_regmatch_state = aux->old_regmatch_state;
11025     PL_regmatch_slab  = aux->old_regmatch_slab;
11026 
11027     /* free all slabs above current one - this must be the last action
11028      * of this function, as aux and eval_state are allocated within
11029      * slabs and may be freed here */
11030 
11031     s = PL_regmatch_slab->next;
11032     if (s) {
11033         PL_regmatch_slab->next = NULL;
11034         while (s) {
11035             regmatch_slab * const osl = s;
11036             s = s->next;
11037             Safefree(osl);
11038         }
11039     }
11040 }
11041 
11042 
11043 STATIC void
11044 S_to_utf8_substr(pTHX_ regexp *prog)
11045 {
11046     /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
11047      * on the converted value */
11048 
11049     int i = 1;
11050 
11051     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
11052 
11053     do {
11054         if (prog->substrs->data[i].substr
11055             && !prog->substrs->data[i].utf8_substr) {
11056             SV* const sv = newSVsv(prog->substrs->data[i].substr);
11057             prog->substrs->data[i].utf8_substr = sv;
11058             sv_utf8_upgrade(sv);
11059             if (SvVALID(prog->substrs->data[i].substr)) {
11060                 if (SvTAIL(prog->substrs->data[i].substr)) {
11061                     /* Trim the trailing \n that fbm_compile added last
11062                        time.  */
11063                     SvCUR_set(sv, SvCUR(sv) - 1);
11064                     /* Whilst this makes the SV technically "invalid" (as its
11065                        buffer is no longer followed by "\0") when fbm_compile()
11066                        adds the "\n" back, a "\0" is restored.  */
11067                     fbm_compile(sv, FBMcf_TAIL);
11068                 } else
11069                     fbm_compile(sv, 0);
11070             }
11071             if (prog->substrs->data[i].substr == prog->check_substr)
11072                 prog->check_utf8 = sv;
11073         }
11074     } while (i--);
11075 }
11076 
11077 STATIC bool
11078 S_to_byte_substr(pTHX_ regexp *prog)
11079 {
11080     /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
11081      * on the converted value; returns FALSE if can't be converted. */
11082 
11083     int i = 1;
11084 
11085     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
11086 
11087     do {
11088         if (prog->substrs->data[i].utf8_substr
11089             && !prog->substrs->data[i].substr) {
11090             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
11091             if (! sv_utf8_downgrade(sv, TRUE)) {
11092                 SvREFCNT_dec_NN(sv);
11093                 return FALSE;
11094             }
11095             if (SvVALID(prog->substrs->data[i].utf8_substr)) {
11096                 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
11097                     /* Trim the trailing \n that fbm_compile added last
11098                         time.  */
11099                     SvCUR_set(sv, SvCUR(sv) - 1);
11100                     fbm_compile(sv, FBMcf_TAIL);
11101                 } else
11102                     fbm_compile(sv, 0);
11103             }
11104             prog->substrs->data[i].substr = sv;
11105             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
11106                 prog->check_substr = sv;
11107         }
11108     } while (i--);
11109 
11110     return TRUE;
11111 }
11112 
11113 #ifndef PERL_IN_XSUB_RE
11114 
11115 bool
11116 Perl_is_grapheme(pTHX_ const U8 * strbeg, const U8 * s, const U8 * strend, const UV cp)
11117 {
11118     /* Temporary helper function for toke.c.  Verify that the code point 'cp'
11119      * is a stand-alone grapheme.  The UTF-8 for 'cp' begins at position 's' in
11120      * the larger string bounded by 'strbeg' and 'strend'.
11121      *
11122      * 'cp' needs to be assigned (if not, a future version of the Unicode
11123      * Standard could make it something that combines with adjacent characters,
11124      * so code using it would then break), and there has to be a GCB break
11125      * before and after the character. */
11126 
11127 
11128     GCB_enum cp_gcb_val, prev_cp_gcb_val, next_cp_gcb_val;
11129     const U8 * prev_cp_start;
11130 
11131     PERL_ARGS_ASSERT_IS_GRAPHEME;
11132 
11133     if (   UNLIKELY(UNICODE_IS_SUPER(cp))
11134         || UNLIKELY(UNICODE_IS_NONCHAR(cp)))
11135     {
11136         /* These are considered graphemes */
11137         return TRUE;
11138     }
11139 
11140     /* Otherwise, unassigned code points are forbidden */
11141     if (UNLIKELY(! ELEMENT_RANGE_MATCHES_INVLIST(
11142                                     _invlist_search(PL_Assigned_invlist, cp))))
11143     {
11144         return FALSE;
11145     }
11146 
11147     cp_gcb_val = getGCB_VAL_CP(cp);
11148 
11149     /* Find the GCB value of the previous code point in the input */
11150     prev_cp_start = utf8_hop_back(s, -1, strbeg);
11151     if (UNLIKELY(prev_cp_start == s)) {
11152         prev_cp_gcb_val = GCB_EDGE;
11153     }
11154     else {
11155         prev_cp_gcb_val = getGCB_VAL_UTF8(prev_cp_start, strend);
11156     }
11157 
11158     /* And check that is a grapheme boundary */
11159     if (! isGCB(prev_cp_gcb_val, cp_gcb_val, strbeg, s,
11160                 TRUE /* is UTF-8 encoded */ ))
11161     {
11162         return FALSE;
11163     }
11164 
11165     /* Similarly verify there is a break between the current character and the
11166      * following one */
11167     s += UTF8SKIP(s);
11168     if (s >= strend) {
11169         next_cp_gcb_val = GCB_EDGE;
11170     }
11171     else {
11172         next_cp_gcb_val = getGCB_VAL_UTF8(s, strend);
11173     }
11174 
11175     return isGCB(cp_gcb_val, next_cp_gcb_val, strbeg, s, TRUE);
11176 }
11177 
11178 /*
11179 =for apidoc_section $unicode
11180 
11181 =for apidoc isSCRIPT_RUN
11182 
11183 Returns a bool as to whether or not the sequence of bytes from C<s> up to but
11184 not including C<send> form a "script run".  C<utf8_target> is TRUE iff the
11185 sequence starting at C<s> is to be treated as UTF-8.  To be precise, except for
11186 two degenerate cases given below, this function returns TRUE iff all code
11187 points in it come from any combination of three "scripts" given by the Unicode
11188 "Script Extensions" property: Common, Inherited, and possibly one other.
11189 Additionally all decimal digits must come from the same consecutive sequence of
11190 10.
11191 
11192 For example, if all the characters in the sequence are Greek, or Common, or
11193 Inherited, this function will return TRUE, provided any decimal digits in it
11194 are from the same block of digits in Common.  (These are the ASCII digits
11195 "0".."9" and additionally a block for full width forms of these, and several
11196 others used in mathematical notation.)   For scripts (unlike Greek) that have
11197 their own digits defined this will accept either digits from that set or from
11198 one of the Common digit sets, but not a combination of the two.  Some scripts,
11199 such as Arabic, have more than one set of digits.  All digits must come from
11200 the same set for this function to return TRUE.
11201 
11202 C<*ret_script>, if C<ret_script> is not NULL, will on return of TRUE
11203 contain the script found, using the C<SCX_enum> typedef.  Its value will be
11204 C<SCX_INVALID> if the function returns FALSE.
11205 
11206 If the sequence is empty, TRUE is returned, but C<*ret_script> (if asked for)
11207 will be C<SCX_INVALID>.
11208 
11209 If the sequence contains a single code point which is unassigned to a character
11210 in the version of Unicode being used, the function will return TRUE, and the
11211 script will be C<SCX_Unknown>.  Any other combination of unassigned code points
11212 in the input sequence will result in the function treating the input as not
11213 being a script run.
11214 
11215 The returned script will be C<SCX_Inherited> iff all the code points in it are
11216 from the Inherited script.
11217 
11218 Otherwise, the returned script will be C<SCX_Common> iff all the code points in
11219 it are from the Inherited or Common scripts.
11220 
11221 =cut
11222 
11223 */
11224 
11225 bool
11226 Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
11227 {
11228     /* Basically, it looks at each character in the sequence to see if the
11229      * above conditions are met; if not it fails.  It uses an inversion map to
11230      * find the enum corresponding to the script of each character.  But this
11231      * is complicated by the fact that a few code points can be in any of
11232      * several scripts.  The data has been constructed so that there are
11233      * additional enum values (all negative) for these situations.  The
11234      * absolute value of those is an index into another table which contains
11235      * pointers to auxiliary tables for each such situation.  Each aux array
11236      * lists all the scripts for the given situation.  There is another,
11237      * parallel, table that gives the number of entries in each aux table.
11238      * These are all defined in charclass_invlists.h */
11239 
11240     /* XXX Here are the additional things UTS 39 says could be done:
11241      *
11242      * Forbid sequences of the same nonspacing mark
11243      *
11244      * Check to see that all the characters are in the sets of exemplar
11245      * characters for at least one language in the Unicode Common Locale Data
11246      * Repository [CLDR]. */
11247 
11248 
11249     /* Things that match /\d/u */
11250     SV * decimals_invlist = PL_XPosix_ptrs[_CC_DIGIT];
11251     UV * decimals_array = invlist_array(decimals_invlist);
11252 
11253     /* What code point is the digit '0' of the script run? (0 meaning FALSE if
11254      * not currently known) */
11255     UV zero_of_run = 0;
11256 
11257     SCX_enum script_of_run  = SCX_INVALID;   /* Illegal value */
11258     SCX_enum script_of_char = SCX_INVALID;
11259 
11260     /* If the script remains not fully determined from iteration to iteration,
11261      * this is the current intersection of the possiblities.  */
11262     SCX_enum * intersection = NULL;
11263     PERL_UINT_FAST8_T intersection_len = 0;
11264 
11265     bool retval = TRUE;
11266     SCX_enum * ret_script = NULL;
11267 
11268     assert(send >= s);
11269 
11270     PERL_ARGS_ASSERT_ISSCRIPT_RUN;
11271 
11272     /* All code points in 0..255 are either Common or Latin, so must be a
11273      * script run.  We can return immediately unless we need to know which
11274      * script it is. */
11275     if (! utf8_target && LIKELY(send > s)) {
11276         if (ret_script == NULL) {
11277             return TRUE;
11278         }
11279 
11280         /* If any character is Latin, the run is Latin */
11281         while (s < send) {
11282             if (isALPHA_L1(*s) && LIKELY(*s != MICRO_SIGN_NATIVE)) {
11283                 *ret_script = SCX_Latin;
11284                 return TRUE;
11285             }
11286         }
11287 
11288         /* Here, all are Common */
11289         *ret_script = SCX_Common;
11290         return TRUE;
11291     }
11292 
11293     /* Look at each character in the sequence */
11294     while (s < send) {
11295         /* If the current character being examined is a digit, this is the code
11296          * point of the zero for its sequence of 10 */
11297         UV zero_of_char;
11298 
11299         UV cp;
11300 
11301         /* The code allows all scripts to use the ASCII digits.  This is
11302          * because they are in the Common script.  Hence any ASCII ones found
11303          * are ok, unless and until a digit from another set has already been
11304          * encountered.  digit ranges in Common are not similarly blessed) */
11305         if (UNLIKELY(isDIGIT(*s))) {
11306             if (UNLIKELY(script_of_run == SCX_Unknown)) {
11307                 retval = FALSE;
11308                 break;
11309             }
11310             if (zero_of_run) {
11311                 if (zero_of_run != '0') {
11312                     retval = FALSE;
11313                     break;
11314                 }
11315             }
11316             else {
11317                 zero_of_run = '0';
11318             }
11319             s++;
11320             continue;
11321         }
11322 
11323         /* Here, isn't an ASCII digit.  Find the code point of the character */
11324         if (! UTF8_IS_INVARIANT(*s)) {
11325             Size_t len;
11326             cp = valid_utf8_to_uvchr((U8 *) s, &len);
11327             s += len;
11328         }
11329         else {
11330             cp = *(s++);
11331         }
11332 
11333         /* If is within the range [+0 .. +9] of the script's zero, it also is a
11334          * digit in that script.  We can skip the rest of this code for this
11335          * character. */
11336         if (UNLIKELY(zero_of_run && withinCOUNT(cp, zero_of_run, 9))) {
11337             continue;
11338         }
11339 
11340         /* Find the character's script.  The correct values are hard-coded here
11341          * for small-enough code points. */
11342         if (cp < 0x2B9) {   /* From inspection of Unicode db; extremely
11343                                unlikely to change */
11344             if (       cp > 255
11345                 || (   isALPHA_L1(cp)
11346                     && LIKELY(cp != MICRO_SIGN_NATIVE)))
11347             {
11348                 script_of_char = SCX_Latin;
11349             }
11350             else {
11351                 script_of_char = SCX_Common;
11352             }
11353         }
11354         else {
11355             script_of_char = _Perl_SCX_invmap[
11356                                        _invlist_search(PL_SCX_invlist, cp)];
11357         }
11358 
11359         /* We arbitrarily accept a single unassigned character, but not in
11360          * combination with anything else, and not a run of them. */
11361         if (   UNLIKELY(script_of_run == SCX_Unknown)
11362             || UNLIKELY(   script_of_run != SCX_INVALID
11363                         && script_of_char == SCX_Unknown))
11364         {
11365             retval = FALSE;
11366             break;
11367         }
11368 
11369         /* For the first character, or the run is inherited, the run's script
11370          * is set to the char's */
11371         if (   UNLIKELY(script_of_run == SCX_INVALID)
11372             || UNLIKELY(script_of_run == SCX_Inherited))
11373         {
11374             script_of_run = script_of_char;
11375         }
11376 
11377         /* For the character's script to be Unknown, it must be the first
11378          * character in the sequence (for otherwise a test above would have
11379          * prevented us from reaching here), and we have set the run's script
11380          * to it.  Nothing further to be done for this character */
11381         if (UNLIKELY(script_of_char == SCX_Unknown)) {
11382             continue;
11383         }
11384 
11385         /* We accept 'inherited' script characters currently even at the
11386          * beginning.  (We know that no characters in Inherited are digits, or
11387          * we'd have to check for that) */
11388         if (UNLIKELY(script_of_char == SCX_Inherited)) {
11389             continue;
11390         }
11391 
11392         /* If the run so far is Common, and the new character isn't, change the
11393          * run's script to that of this character */
11394         if (script_of_run == SCX_Common && script_of_char != SCX_Common) {
11395             script_of_run = script_of_char;
11396         }
11397 
11398         /* Now we can see if the script of the new character is the same as
11399          * that of the run */
11400         if (LIKELY(script_of_char == script_of_run)) {
11401             /* By far the most common case */
11402             goto scripts_match;
11403         }
11404 
11405         /* Here, the script of the run isn't Common.  But characters in Common
11406          * match any script */
11407         if (script_of_char == SCX_Common) {
11408             goto scripts_match;
11409         }
11410 
11411 #ifndef HAS_SCX_AUX_TABLES
11412 
11413         /* Too early a Unicode version to have a code point belonging to more
11414          * than one script, so, if the scripts don't exactly match, fail */
11415         PERL_UNUSED_VAR(intersection_len);
11416         retval = FALSE;
11417         break;
11418 
11419 #else
11420 
11421         /* Here there is no exact match between the character's script and the
11422          * run's.  And we've handled the special cases of scripts Unknown,
11423          * Inherited, and Common.
11424          *
11425          * Negative script numbers signify that the value may be any of several
11426          * scripts, and we need to look at auxiliary information to make our
11427          * deterimination.  But if both are non-negative, we can fail now */
11428         if (LIKELY(script_of_char >= 0)) {
11429             const SCX_enum * search_in;
11430             PERL_UINT_FAST8_T search_in_len;
11431             PERL_UINT_FAST8_T i;
11432 
11433             if (LIKELY(script_of_run >= 0)) {
11434                 retval = FALSE;
11435                 break;
11436             }
11437 
11438             /* Use the previously constructed set of possible scripts, if any.
11439              * */
11440             if (intersection) {
11441                 search_in = intersection;
11442                 search_in_len = intersection_len;
11443             }
11444             else {
11445                 search_in = SCX_AUX_TABLE_ptrs[-script_of_run];
11446                 search_in_len = SCX_AUX_TABLE_lengths[-script_of_run];
11447             }
11448 
11449             for (i = 0; i < search_in_len; i++) {
11450                 if (search_in[i] == script_of_char) {
11451                     script_of_run = script_of_char;
11452                     goto scripts_match;
11453                 }
11454             }
11455 
11456             retval = FALSE;
11457             break;
11458         }
11459         else if (LIKELY(script_of_run >= 0)) {
11460             /* script of character could be one of several, but run is a single
11461              * script */
11462             const SCX_enum * search_in = SCX_AUX_TABLE_ptrs[-script_of_char];
11463             const PERL_UINT_FAST8_T search_in_len
11464                                      = SCX_AUX_TABLE_lengths[-script_of_char];
11465             PERL_UINT_FAST8_T i;
11466 
11467             for (i = 0; i < search_in_len; i++) {
11468                 if (search_in[i] == script_of_run) {
11469                     script_of_char = script_of_run;
11470                     goto scripts_match;
11471                 }
11472             }
11473 
11474             retval = FALSE;
11475             break;
11476         }
11477         else {
11478             /* Both run and char could be in one of several scripts.  If the
11479              * intersection is empty, then this character isn't in this script
11480              * run.  Otherwise, we need to calculate the intersection to use
11481              * for future iterations of the loop, unless we are already at the
11482              * final character */
11483             const SCX_enum * search_char = SCX_AUX_TABLE_ptrs[-script_of_char];
11484             const PERL_UINT_FAST8_T char_len
11485                                       = SCX_AUX_TABLE_lengths[-script_of_char];
11486             const SCX_enum * search_run;
11487             PERL_UINT_FAST8_T run_len;
11488 
11489             SCX_enum * new_overlap = NULL;
11490             PERL_UINT_FAST8_T i, j;
11491 
11492             if (intersection) {
11493                 search_run = intersection;
11494                 run_len = intersection_len;
11495             }
11496             else {
11497                 search_run = SCX_AUX_TABLE_ptrs[-script_of_run];
11498                 run_len = SCX_AUX_TABLE_lengths[-script_of_run];
11499             }
11500 
11501             intersection_len = 0;
11502 
11503             for (i = 0; i < run_len; i++) {
11504                 for (j = 0; j < char_len; j++) {
11505                     if (search_run[i] == search_char[j]) {
11506 
11507                         /* Here, the script at i,j matches.  That means this
11508                          * character is in the run.  But continue on to find
11509                          * the complete intersection, for the next loop
11510                          * iteration, and for the digit check after it.
11511                          *
11512                          * On the first found common script, we malloc space
11513                          * for the intersection list for the worst case of the
11514                          * intersection, which is the minimum of the number of
11515                          * scripts remaining in each set. */
11516                         if (intersection_len == 0) {
11517                             Newx(new_overlap,
11518                                  MIN(run_len - i, char_len - j),
11519                                  SCX_enum);
11520                         }
11521                         new_overlap[intersection_len++] = search_run[i];
11522                     }
11523                 }
11524             }
11525 
11526             /* Here we've looked through everything.  If they have no scripts
11527              * in common, not a run */
11528             if (intersection_len == 0) {
11529                 retval = FALSE;
11530                 break;
11531             }
11532 
11533             /* If there is only a single script in common, set to that.
11534              * Otherwise, use the intersection going forward */
11535             Safefree(intersection);
11536             intersection = NULL;
11537             if (intersection_len == 1) {
11538                 script_of_run = script_of_char = new_overlap[0];
11539                 Safefree(new_overlap);
11540                 new_overlap = NULL;
11541             }
11542             else {
11543                 intersection = new_overlap;
11544             }
11545         }
11546 
11547 #endif
11548 
11549   scripts_match:
11550 
11551         /* Here, the script of the character is compatible with that of the
11552          * run.  That means that in most cases, it continues the script run.
11553          * Either it and the run match exactly, or one or both can be in any of
11554          * several scripts, and the intersection is not empty.  However, if the
11555          * character is a decimal digit, it could still mean failure if it is
11556          * from the wrong sequence of 10.  So, we need to look at if it's a
11557          * digit.  We've already handled the 10 digits [0-9], and the next
11558          * lowest one is this one: */
11559         if (cp < FIRST_NON_ASCII_DECIMAL_DIGIT) {
11560             continue;   /* Not a digit; this character is part of the run */
11561         }
11562 
11563         /* If we have a definitive '0' for the script of this character, we
11564          * know that for this to be a digit, it must be in the range of +0..+9
11565          * of that zero. */
11566         if (   script_of_char >= 0
11567             && (zero_of_char = script_zeros[script_of_char]))
11568         {
11569             if (! withinCOUNT(cp, zero_of_char, 9)) {
11570                 continue;   /* Not a digit; this character is part of the run
11571                              */
11572             }
11573 
11574         }
11575         else {  /* Need to look up if this character is a digit or not */
11576             SSize_t index_of_zero_of_char;
11577             index_of_zero_of_char = _invlist_search(decimals_invlist, cp);
11578             if (     UNLIKELY(index_of_zero_of_char < 0)
11579                 || ! ELEMENT_RANGE_MATCHES_INVLIST(index_of_zero_of_char))
11580             {
11581                 continue;   /* Not a digit; this character is part of the run.
11582                              */
11583             }
11584 
11585             zero_of_char = decimals_array[index_of_zero_of_char];
11586         }
11587 
11588         /* Here, the character is a decimal digit, and the zero of its sequence
11589          * of 10 is in 'zero_of_char'.  If we already have a zero for this run,
11590          * they better be the same. */
11591         if (zero_of_run) {
11592             if (zero_of_run != zero_of_char) {
11593                 retval = FALSE;
11594                 break;
11595             }
11596         }
11597         else {  /* Otherwise we now have a zero for this run */
11598             zero_of_run = zero_of_char;
11599         }
11600     } /* end of looping through CLOSESR text */
11601 
11602     Safefree(intersection);
11603 
11604     if (ret_script != NULL) {
11605         if (retval) {
11606             *ret_script = script_of_run;
11607         }
11608         else {
11609             *ret_script = SCX_INVALID;
11610         }
11611     }
11612 
11613     return retval;
11614 }
11615 
11616 #endif /* ifndef PERL_IN_XSUB_RE */
11617 
11618 /*
11619  * ex: set ts=8 sts=4 sw=4 et:
11620  */
11621