xref: /openbsd-src/gnu/usr.bin/perl/regcomp.c (revision ae3cb403620ab940fbaabb3055fac045a63d56b7)
1 /*    regcomp.c
2  */
3 
4 /*
5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
6  *
7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
8  */
9 
10 /* This file contains functions for compiling a regular expression.  See
11  * also regexec.c which funnily enough, contains functions for executing
12  * a regular expression.
13  *
14  * This file is also copied at build time to ext/re/re_comp.c, where
15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16  * This causes the main functions to be compiled under new names and with
17  * debugging support added, which makes "use re 'debug'" work.
18  */
19 
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21  * confused with the original package (see point 3 below).  Thanks, Henry!
22  */
23 
24 /* Additional note: this code is very heavily munged from Henry's version
25  * in places.  In some spots I've traded clarity for efficiency, so don't
26  * blame Henry for some of the lack of readability.
27  */
28 
29 /* The names of the functions have been changed from regcomp and
30  * regexec to pregcomp and pregexec in order to avoid conflicts
31  * with the POSIX routines of the same names.
32 */
33 
34 #ifdef PERL_EXT_RE_BUILD
35 #include "re_top.h"
36 #endif
37 
38 /*
39  * pregcomp and pregexec -- regsub and regerror are not used in perl
40  *
41  *	Copyright (c) 1986 by University of Toronto.
42  *	Written by Henry Spencer.  Not derived from licensed software.
43  *
44  *	Permission is granted to anyone to use this software for any
45  *	purpose on any computer system, and to redistribute it freely,
46  *	subject to the following restrictions:
47  *
48  *	1. The author is not responsible for the consequences of use of
49  *		this software, no matter how awful, even if they arise
50  *		from defects in it.
51  *
52  *	2. The origin of this software must not be misrepresented, either
53  *		by explicit claim or by omission.
54  *
55  *	3. Altered versions must be plainly marked as such, and must not
56  *		be misrepresented as being the original software.
57  *
58  *
59  ****    Alterations to Henry's code are...
60  ****
61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63  ****    by Larry Wall and others
64  ****
65  ****    You may distribute under the terms of either the GNU General Public
66  ****    License or the Artistic License, as specified in the README file.
67 
68  *
69  * Beware that some of this code is subtly aware of the way operator
70  * precedence is structured in regular expressions.  Serious changes in
71  * regular-expression syntax might require a total rethink.
72  */
73 #include "EXTERN.h"
74 #define PERL_IN_REGCOMP_C
75 #include "perl.h"
76 
77 #ifndef PERL_IN_XSUB_RE
78 #  include "INTERN.h"
79 #endif
80 
81 #define REG_COMP_C
82 #ifdef PERL_IN_XSUB_RE
83 #  include "re_comp.h"
84 EXTERN_C const struct regexp_engine my_reg_engine;
85 #else
86 #  include "regcomp.h"
87 #endif
88 
89 #include "dquote_inline.h"
90 #include "invlist_inline.h"
91 #include "unicode_constants.h"
92 
93 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
94  _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
95 #define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
96  _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
97 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
98 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
99 
100 #ifndef STATIC
101 #define	STATIC	static
102 #endif
103 
104 #ifndef MIN
105 #define MIN(a,b) ((a) < (b) ? (a) : (b))
106 #endif
107 
108 #ifndef MAX
109 #define MAX(a,b) ((a) > (b) ? (a) : (b))
110 #endif
111 
112 /* this is a chain of data about sub patterns we are processing that
113    need to be handled separately/specially in study_chunk. Its so
114    we can simulate recursion without losing state.  */
115 struct scan_frame;
116 typedef struct scan_frame {
117     regnode *last_regnode;      /* last node to process in this frame */
118     regnode *next_regnode;      /* next node to process when last is reached */
119     U32 prev_recursed_depth;
120     I32 stopparen;              /* what stopparen do we use */
121     U32 is_top_frame;           /* what flags do we use? */
122 
123     struct scan_frame *this_prev_frame; /* this previous frame */
124     struct scan_frame *prev_frame;      /* previous frame */
125     struct scan_frame *next_frame;      /* next frame */
126 } scan_frame;
127 
128 /* Certain characters are output as a sequence with the first being a
129  * backslash. */
130 #define isBACKSLASHED_PUNCT(c)                                              \
131                     ((c) == '-' || (c) == ']' || (c) == '\\' || (c) == '^')
132 
133 
134 struct RExC_state_t {
135     U32		flags;			/* RXf_* are we folding, multilining? */
136     U32		pm_flags;		/* PMf_* stuff from the calling PMOP */
137     char	*precomp;		/* uncompiled string. */
138     char	*precomp_end;		/* pointer to end of uncompiled string. */
139     REGEXP	*rx_sv;			/* The SV that is the regexp. */
140     regexp	*rx;                    /* perl core regexp structure */
141     regexp_internal	*rxi;           /* internal data for regexp object
142                                            pprivate field */
143     char	*start;			/* Start of input for compile */
144     char	*end;			/* End of input for compile */
145     char	*parse;			/* Input-scan pointer. */
146     char        *adjusted_start;        /* 'start', adjusted.  See code use */
147     STRLEN      precomp_adj;            /* an offset beyond precomp.  See code use */
148     SSize_t	whilem_seen;		/* number of WHILEM in this expr */
149     regnode	*emit_start;		/* Start of emitted-code area */
150     regnode	*emit_bound;		/* First regnode outside of the
151                                            allocated space */
152     regnode	*emit;			/* Code-emit pointer; if = &emit_dummy,
153                                            implies compiling, so don't emit */
154     regnode_ssc	emit_dummy;		/* placeholder for emit to point to;
155                                            large enough for the largest
156                                            non-EXACTish node, so can use it as
157                                            scratch in pass1 */
158     I32		naughty;		/* How bad is this pattern? */
159     I32		sawback;		/* Did we see \1, ...? */
160     U32		seen;
161     SSize_t	size;			/* Code size. */
162     I32                npar;            /* Capture buffer count, (OPEN) plus
163                                            one. ("par" 0 is the whole
164                                            pattern)*/
165     I32		nestroot;		/* root parens we are in - used by
166                                            accept */
167     I32		extralen;
168     I32		seen_zerolen;
169     regnode	**open_parens;		/* pointers to open parens */
170     regnode	**close_parens;		/* pointers to close parens */
171     regnode     *end_op;                /* END node in program */
172     I32		utf8;		/* whether the pattern is utf8 or not */
173     I32		orig_utf8;	/* whether the pattern was originally in utf8 */
174 				/* XXX use this for future optimisation of case
175 				 * where pattern must be upgraded to utf8. */
176     I32		uni_semantics;	/* If a d charset modifier should use unicode
177 				   rules, even if the pattern is not in
178 				   utf8 */
179     HV		*paren_names;		/* Paren names */
180 
181     regnode	**recurse;		/* Recurse regops */
182     I32                recurse_count;                /* Number of recurse regops we have generated */
183     U8          *study_chunk_recursed;  /* bitmap of which subs we have moved
184                                            through */
185     U32         study_chunk_recursed_bytes;  /* bytes in bitmap */
186     I32		in_lookbehind;
187     I32		contains_locale;
188     I32		contains_i;
189     I32		override_recoding;
190 #ifdef EBCDIC
191     I32		recode_x_to_native;
192 #endif
193     I32		in_multi_char_class;
194     struct reg_code_block *code_blocks;	/* positions of literal (?{})
195 					    within pattern */
196     int		num_code_blocks;	/* size of code_blocks[] */
197     int		code_index;		/* next code_blocks[] slot */
198     SSize_t     maxlen;                        /* mininum possible number of chars in string to match */
199     scan_frame *frame_head;
200     scan_frame *frame_last;
201     U32         frame_count;
202     AV         *warn_text;
203 #ifdef ADD_TO_REGEXEC
204     char 	*starttry;		/* -Dr: where regtry was called. */
205 #define RExC_starttry	(pRExC_state->starttry)
206 #endif
207     SV		*runtime_code_qr;	/* qr with the runtime code blocks */
208 #ifdef DEBUGGING
209     const char  *lastparse;
210     I32         lastnum;
211     AV          *paren_name_list;       /* idx -> name */
212     U32         study_chunk_recursed_count;
213     SV          *mysv1;
214     SV          *mysv2;
215 #define RExC_lastparse	(pRExC_state->lastparse)
216 #define RExC_lastnum	(pRExC_state->lastnum)
217 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
218 #define RExC_study_chunk_recursed_count    (pRExC_state->study_chunk_recursed_count)
219 #define RExC_mysv	(pRExC_state->mysv1)
220 #define RExC_mysv1	(pRExC_state->mysv1)
221 #define RExC_mysv2	(pRExC_state->mysv2)
222 
223 #endif
224     bool        seen_unfolded_sharp_s;
225     bool        strict;
226     bool        study_started;
227 };
228 
229 #define RExC_flags	(pRExC_state->flags)
230 #define RExC_pm_flags	(pRExC_state->pm_flags)
231 #define RExC_precomp	(pRExC_state->precomp)
232 #define RExC_precomp_adj (pRExC_state->precomp_adj)
233 #define RExC_adjusted_start  (pRExC_state->adjusted_start)
234 #define RExC_precomp_end (pRExC_state->precomp_end)
235 #define RExC_rx_sv	(pRExC_state->rx_sv)
236 #define RExC_rx		(pRExC_state->rx)
237 #define RExC_rxi	(pRExC_state->rxi)
238 #define RExC_start	(pRExC_state->start)
239 #define RExC_end	(pRExC_state->end)
240 #define RExC_parse	(pRExC_state->parse)
241 #define RExC_whilem_seen	(pRExC_state->whilem_seen)
242 
243 /* Set during the sizing pass when there is a LATIN SMALL LETTER SHARP S in any
244  * EXACTF node, hence was parsed under /di rules.  If later in the parse,
245  * something forces the pattern into using /ui rules, the sharp s should be
246  * folded into the sequence 'ss', which takes up more space than previously
247  * calculated.  This means that the sizing pass needs to be restarted.  (The
248  * node also becomes an EXACTFU_SS.)  For all other characters, an EXACTF node
249  * that gets converted to /ui (and EXACTFU) occupies the same amount of space,
250  * so there is no need to resize [perl #125990]. */
251 #define RExC_seen_unfolded_sharp_s (pRExC_state->seen_unfolded_sharp_s)
252 
253 #ifdef RE_TRACK_PATTERN_OFFSETS
254 #define RExC_offsets	(pRExC_state->rxi->u.offsets) /* I am not like the
255                                                          others */
256 #endif
257 #define RExC_emit	(pRExC_state->emit)
258 #define RExC_emit_dummy	(pRExC_state->emit_dummy)
259 #define RExC_emit_start	(pRExC_state->emit_start)
260 #define RExC_emit_bound	(pRExC_state->emit_bound)
261 #define RExC_sawback	(pRExC_state->sawback)
262 #define RExC_seen	(pRExC_state->seen)
263 #define RExC_size	(pRExC_state->size)
264 #define RExC_maxlen        (pRExC_state->maxlen)
265 #define RExC_npar	(pRExC_state->npar)
266 #define RExC_nestroot   (pRExC_state->nestroot)
267 #define RExC_extralen	(pRExC_state->extralen)
268 #define RExC_seen_zerolen	(pRExC_state->seen_zerolen)
269 #define RExC_utf8	(pRExC_state->utf8)
270 #define RExC_uni_semantics	(pRExC_state->uni_semantics)
271 #define RExC_orig_utf8	(pRExC_state->orig_utf8)
272 #define RExC_open_parens	(pRExC_state->open_parens)
273 #define RExC_close_parens	(pRExC_state->close_parens)
274 #define RExC_end_op	(pRExC_state->end_op)
275 #define RExC_paren_names	(pRExC_state->paren_names)
276 #define RExC_recurse	(pRExC_state->recurse)
277 #define RExC_recurse_count	(pRExC_state->recurse_count)
278 #define RExC_study_chunk_recursed        (pRExC_state->study_chunk_recursed)
279 #define RExC_study_chunk_recursed_bytes  \
280                                    (pRExC_state->study_chunk_recursed_bytes)
281 #define RExC_in_lookbehind	(pRExC_state->in_lookbehind)
282 #define RExC_contains_locale	(pRExC_state->contains_locale)
283 #define RExC_contains_i (pRExC_state->contains_i)
284 #define RExC_override_recoding (pRExC_state->override_recoding)
285 #ifdef EBCDIC
286 #   define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
287 #endif
288 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
289 #define RExC_frame_head (pRExC_state->frame_head)
290 #define RExC_frame_last (pRExC_state->frame_last)
291 #define RExC_frame_count (pRExC_state->frame_count)
292 #define RExC_strict (pRExC_state->strict)
293 #define RExC_study_started      (pRExC_state->study_started)
294 #define RExC_warn_text (pRExC_state->warn_text)
295 
296 /* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
297  * a flag to disable back-off on the fixed/floating substrings - if it's
298  * a high complexity pattern we assume the benefit of avoiding a full match
299  * is worth the cost of checking for the substrings even if they rarely help.
300  */
301 #define RExC_naughty	(pRExC_state->naughty)
302 #define TOO_NAUGHTY (10)
303 #define MARK_NAUGHTY(add) \
304     if (RExC_naughty < TOO_NAUGHTY) \
305         RExC_naughty += (add)
306 #define MARK_NAUGHTY_EXP(exp, add) \
307     if (RExC_naughty < TOO_NAUGHTY) \
308         RExC_naughty += RExC_naughty / (exp) + (add)
309 
310 #define	ISMULT1(c)	((c) == '*' || (c) == '+' || (c) == '?')
311 #define	ISMULT2(s)	((*s) == '*' || (*s) == '+' || (*s) == '?' || \
312 	((*s) == '{' && regcurly(s)))
313 
314 /*
315  * Flags to be passed up and down.
316  */
317 #define	WORST		0	/* Worst case. */
318 #define	HASWIDTH	0x01	/* Known to match non-null strings. */
319 
320 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
321  * character.  (There needs to be a case: in the switch statement in regexec.c
322  * for any node marked SIMPLE.)  Note that this is not the same thing as
323  * REGNODE_SIMPLE */
324 #define	SIMPLE		0x02
325 #define	SPSTART		0x04	/* Starts with * or + */
326 #define POSTPONED	0x08    /* (?1),(?&name), (??{...}) or similar */
327 #define TRYAGAIN	0x10	/* Weeded out a declaration. */
328 #define RESTART_PASS1   0x20    /* Need to restart sizing pass */
329 #define NEED_UTF8       0x40    /* In conjunction with RESTART_PASS1, need to
330                                    calcuate sizes as UTF-8 */
331 
332 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
333 
334 /* whether trie related optimizations are enabled */
335 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
336 #define TRIE_STUDY_OPT
337 #define FULL_TRIE_STUDY
338 #define TRIE_STCLASS
339 #endif
340 
341 
342 
343 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
344 #define PBITVAL(paren) (1 << ((paren) & 7))
345 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
346 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
347 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
348 
349 #define REQUIRE_UTF8(flagp) STMT_START {                                   \
350                                      if (!UTF) {                           \
351                                          assert(PASS1);                    \
352                                          *flagp = RESTART_PASS1|NEED_UTF8; \
353                                          return NULL;                      \
354                                      }                                     \
355                              } STMT_END
356 
357 /* Change from /d into /u rules, and restart the parse if we've already seen
358  * something whose size would increase as a result, by setting *flagp and
359  * returning 'restart_retval'.  RExC_uni_semantics is a flag that indicates
360  * we've change to /u during the parse.  */
361 #define REQUIRE_UNI_RULES(flagp, restart_retval)                            \
362     STMT_START {                                                            \
363             if (DEPENDS_SEMANTICS) {                                        \
364                 assert(PASS1);                                              \
365                 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);      \
366                 RExC_uni_semantics = 1;                                     \
367                 if (RExC_seen_unfolded_sharp_s) {                           \
368                     *flagp |= RESTART_PASS1;                                \
369                     return restart_retval;                                  \
370                 }                                                           \
371             }                                                               \
372     } STMT_END
373 
374 /* This converts the named class defined in regcomp.h to its equivalent class
375  * number defined in handy.h. */
376 #define namedclass_to_classnum(class)  ((int) ((class) / 2))
377 #define classnum_to_namedclass(classnum)  ((classnum) * 2)
378 
379 #define _invlist_union_complement_2nd(a, b, output) \
380                         _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
381 #define _invlist_intersection_complement_2nd(a, b, output) \
382                  _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
383 
384 /* About scan_data_t.
385 
386   During optimisation we recurse through the regexp program performing
387   various inplace (keyhole style) optimisations. In addition study_chunk
388   and scan_commit populate this data structure with information about
389   what strings MUST appear in the pattern. We look for the longest
390   string that must appear at a fixed location, and we look for the
391   longest string that may appear at a floating location. So for instance
392   in the pattern:
393 
394     /FOO[xX]A.*B[xX]BAR/
395 
396   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
397   strings (because they follow a .* construct). study_chunk will identify
398   both FOO and BAR as being the longest fixed and floating strings respectively.
399 
400   The strings can be composites, for instance
401 
402      /(f)(o)(o)/
403 
404   will result in a composite fixed substring 'foo'.
405 
406   For each string some basic information is maintained:
407 
408   - offset or min_offset
409     This is the position the string must appear at, or not before.
410     It also implicitly (when combined with minlenp) tells us how many
411     characters must match before the string we are searching for.
412     Likewise when combined with minlenp and the length of the string it
413     tells us how many characters must appear after the string we have
414     found.
415 
416   - max_offset
417     Only used for floating strings. This is the rightmost point that
418     the string can appear at. If set to SSize_t_MAX it indicates that the
419     string can occur infinitely far to the right.
420 
421   - minlenp
422     A pointer to the minimum number of characters of the pattern that the
423     string was found inside. This is important as in the case of positive
424     lookahead or positive lookbehind we can have multiple patterns
425     involved. Consider
426 
427     /(?=FOO).*F/
428 
429     The minimum length of the pattern overall is 3, the minimum length
430     of the lookahead part is 3, but the minimum length of the part that
431     will actually match is 1. So 'FOO's minimum length is 3, but the
432     minimum length for the F is 1. This is important as the minimum length
433     is used to determine offsets in front of and behind the string being
434     looked for.  Since strings can be composites this is the length of the
435     pattern at the time it was committed with a scan_commit. Note that
436     the length is calculated by study_chunk, so that the minimum lengths
437     are not known until the full pattern has been compiled, thus the
438     pointer to the value.
439 
440   - lookbehind
441 
442     In the case of lookbehind the string being searched for can be
443     offset past the start point of the final matching string.
444     If this value was just blithely removed from the min_offset it would
445     invalidate some of the calculations for how many chars must match
446     before or after (as they are derived from min_offset and minlen and
447     the length of the string being searched for).
448     When the final pattern is compiled and the data is moved from the
449     scan_data_t structure into the regexp structure the information
450     about lookbehind is factored in, with the information that would
451     have been lost precalculated in the end_shift field for the
452     associated string.
453 
454   The fields pos_min and pos_delta are used to store the minimum offset
455   and the delta to the maximum offset at the current point in the pattern.
456 
457 */
458 
459 typedef struct scan_data_t {
460     /*I32 len_min;      unused */
461     /*I32 len_delta;    unused */
462     SSize_t pos_min;
463     SSize_t pos_delta;
464     SV *last_found;
465     SSize_t last_end;	    /* min value, <0 unless valid. */
466     SSize_t last_start_min;
467     SSize_t last_start_max;
468     SV **longest;	    /* Either &l_fixed, or &l_float. */
469     SV *longest_fixed;      /* longest fixed string found in pattern */
470     SSize_t offset_fixed;   /* offset where it starts */
471     SSize_t *minlen_fixed;  /* pointer to the minlen relevant to the string */
472     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
473     SV *longest_float;      /* longest floating string found in pattern */
474     SSize_t offset_float_min; /* earliest point in string it can appear */
475     SSize_t offset_float_max; /* latest point in string it can appear */
476     SSize_t *minlen_float;  /* pointer to the minlen relevant to the string */
477     SSize_t lookbehind_float; /* is the pos of the string modified by LB */
478     I32 flags;
479     I32 whilem_c;
480     SSize_t *last_closep;
481     regnode_ssc *start_class;
482 } scan_data_t;
483 
484 /*
485  * Forward declarations for pregcomp()'s friends.
486  */
487 
488 static const scan_data_t zero_scan_data =
489   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
490 
491 #define SF_BEFORE_EOL		(SF_BEFORE_SEOL|SF_BEFORE_MEOL)
492 #define SF_BEFORE_SEOL		0x0001
493 #define SF_BEFORE_MEOL		0x0002
494 #define SF_FIX_BEFORE_EOL	(SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
495 #define SF_FL_BEFORE_EOL	(SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
496 
497 #define SF_FIX_SHIFT_EOL	(+2)
498 #define SF_FL_SHIFT_EOL		(+4)
499 
500 #define SF_FIX_BEFORE_SEOL	(SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
501 #define SF_FIX_BEFORE_MEOL	(SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
502 
503 #define SF_FL_BEFORE_SEOL	(SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
504 #define SF_FL_BEFORE_MEOL	(SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
505 #define SF_IS_INF		0x0040
506 #define SF_HAS_PAR		0x0080
507 #define SF_IN_PAR		0x0100
508 #define SF_HAS_EVAL		0x0200
509 
510 
511 /* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the
512  * longest substring in the pattern. When it is not set the optimiser keeps
513  * track of position, but does not keep track of the actual strings seen,
514  *
515  * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but
516  * /foo/i will not.
517  *
518  * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble"
519  * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be
520  * turned off because of the alternation (BRANCH). */
521 #define SCF_DO_SUBSTR		0x0400
522 
523 #define SCF_DO_STCLASS_AND	0x0800
524 #define SCF_DO_STCLASS_OR	0x1000
525 #define SCF_DO_STCLASS		(SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
526 #define SCF_WHILEM_VISITED_POS	0x2000
527 
528 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
529 #define SCF_SEEN_ACCEPT         0x8000
530 #define SCF_TRIE_DOING_RESTUDY 0x10000
531 #define SCF_IN_DEFINE          0x20000
532 
533 
534 
535 
536 #define UTF cBOOL(RExC_utf8)
537 
538 /* The enums for all these are ordered so things work out correctly */
539 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
540 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags)                    \
541                                                      == REGEX_DEPENDS_CHARSET)
542 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
543 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags)                \
544                                                      >= REGEX_UNICODE_CHARSET)
545 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags)                      \
546                                             == REGEX_ASCII_RESTRICTED_CHARSET)
547 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags)             \
548                                             >= REGEX_ASCII_RESTRICTED_CHARSET)
549 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags)                 \
550                                         == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
551 
552 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
553 
554 /* For programs that want to be strictly Unicode compatible by dying if any
555  * attempt is made to match a non-Unicode code point against a Unicode
556  * property.  */
557 #define ALWAYS_WARN_SUPER  ckDEAD(packWARN(WARN_NON_UNICODE))
558 
559 #define OOB_NAMEDCLASS		-1
560 
561 /* There is no code point that is out-of-bounds, so this is problematic.  But
562  * its only current use is to initialize a variable that is always set before
563  * looked at. */
564 #define OOB_UNICODE		0xDEADBEEF
565 
566 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
567 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
568 
569 
570 /* length of regex to show in messages that don't mark a position within */
571 #define RegexLengthToShowInErrorMessages 127
572 
573 /*
574  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
575  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
576  * op/pragma/warn/regcomp.
577  */
578 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
579 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
580 
581 #define REPORT_LOCATION " in regex; marked by " MARKER1    \
582                         " in m/%"UTF8f MARKER2 "%"UTF8f"/"
583 
584 /* The code in this file in places uses one level of recursion with parsing
585  * rebased to an alternate string constructed by us in memory.  This can take
586  * the form of something that is completely different from the input, or
587  * something that uses the input as part of the alternate.  In the first case,
588  * there should be no possibility of an error, as we are in complete control of
589  * the alternate string.  But in the second case we don't control the input
590  * portion, so there may be errors in that.  Here's an example:
591  *      /[abc\x{DF}def]/ui
592  * is handled specially because \x{df} folds to a sequence of more than one
593  * character, 'ss'.  What is done is to create and parse an alternate string,
594  * which looks like this:
595  *      /(?:\x{DF}|[abc\x{DF}def])/ui
596  * where it uses the input unchanged in the middle of something it constructs,
597  * which is a branch for the DF outside the character class, and clustering
598  * parens around the whole thing. (It knows enough to skip the DF inside the
599  * class while in this substitute parse.) 'abc' and 'def' may have errors that
600  * need to be reported.  The general situation looks like this:
601  *
602  *              sI                       tI               xI       eI
603  * Input:       ----------------------------------------------------
604  * Constructed:         ---------------------------------------------------
605  *                      sC               tC               xC       eC     EC
606  *
607  * The input string sI..eI is the input pattern.  The string sC..EC is the
608  * constructed substitute parse string.  The portions sC..tC and eC..EC are
609  * constructed by us.  The portion tC..eC is an exact duplicate of the input
610  * pattern tI..eI.  In the diagram, these are vertically aligned.  Suppose that
611  * while parsing, we find an error at xC.  We want to display a message showing
612  * the real input string.  Thus we need to find the point xI in it which
613  * corresponds to xC.  xC >= tC, since the portion of the string sC..tC has
614  * been constructed by us, and so shouldn't have errors.  We get:
615  *
616  *      xI = sI + (tI - sI) + (xC - tC)
617  *
618  * and, the offset into sI is:
619  *
620  *      (xI - sI) = (tI - sI) + (xC - tC)
621  *
622  * When the substitute is constructed, we save (tI -sI) as RExC_precomp_adj,
623  * and we save tC as RExC_adjusted_start.
624  *
625  * During normal processing of the input pattern, everything points to that,
626  * with RExC_precomp_adj set to 0, and RExC_adjusted_start set to sI.
627  */
628 
629 #define tI_sI           RExC_precomp_adj
630 #define tC              RExC_adjusted_start
631 #define sC              RExC_precomp
632 #define xI_offset(xC)   ((IV) (tI_sI + (xC - tC)))
633 #define xI(xC)          (sC + xI_offset(xC))
634 #define eC              RExC_precomp_end
635 
636 #define REPORT_LOCATION_ARGS(xC)                                            \
637     UTF8fARG(UTF,                                                           \
638              (xI(xC) > eC) /* Don't run off end */                          \
639               ? eC - sC   /* Length before the <--HERE */                   \
640               : xI_offset(xC),                                              \
641              sC),         /* The input pattern printed up to the <--HERE */ \
642     UTF8fARG(UTF,                                                           \
643              (xI(xC) > eC) ? 0 : eC - xI(xC), /* Length after <--HERE */    \
644              (xI(xC) > eC) ? eC : xI(xC))     /* pattern after <--HERE */
645 
646 /* Used to point after bad bytes for an error message, but avoid skipping
647  * past a nul byte. */
648 #define SKIP_IF_CHAR(s) (!*(s) ? 0 : UTF ? UTF8SKIP(s) : 1)
649 
650 /*
651  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
652  * arg. Show regex, up to a maximum length. If it's too long, chop and add
653  * "...".
654  */
655 #define _FAIL(code) STMT_START {					\
656     const char *ellipses = "";						\
657     IV len = RExC_precomp_end - RExC_precomp;					\
658 									\
659     if (!SIZE_ONLY)							\
660 	SAVEFREESV(RExC_rx_sv);						\
661     if (len > RegexLengthToShowInErrorMessages) {			\
662 	/* chop 10 shorter than the max, to ensure meaning of "..." */	\
663 	len = RegexLengthToShowInErrorMessages - 10;			\
664 	ellipses = "...";						\
665     }									\
666     code;                                                               \
667 } STMT_END
668 
669 #define	FAIL(msg) _FAIL(			    \
670     Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/",	    \
671 	    msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
672 
673 #define	FAIL2(msg,arg) _FAIL(			    \
674     Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/",	    \
675 	    arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
676 
677 /*
678  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
679  */
680 #define	Simple_vFAIL(m) STMT_START {					\
681     Perl_croak(aTHX_ "%s" REPORT_LOCATION,				\
682 	    m, REPORT_LOCATION_ARGS(RExC_parse));	                \
683 } STMT_END
684 
685 /*
686  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
687  */
688 #define	vFAIL(m) STMT_START {				\
689     if (!SIZE_ONLY)					\
690 	SAVEFREESV(RExC_rx_sv);				\
691     Simple_vFAIL(m);					\
692 } STMT_END
693 
694 /*
695  * Like Simple_vFAIL(), but accepts two arguments.
696  */
697 #define	Simple_vFAIL2(m,a1) STMT_START {			\
698     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1,		\
699                       REPORT_LOCATION_ARGS(RExC_parse));	\
700 } STMT_END
701 
702 /*
703  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
704  */
705 #define	vFAIL2(m,a1) STMT_START {			\
706     if (!SIZE_ONLY)					\
707 	SAVEFREESV(RExC_rx_sv);				\
708     Simple_vFAIL2(m, a1);				\
709 } STMT_END
710 
711 
712 /*
713  * Like Simple_vFAIL(), but accepts three arguments.
714  */
715 #define	Simple_vFAIL3(m, a1, a2) STMT_START {			\
716     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2,		\
717 	    REPORT_LOCATION_ARGS(RExC_parse));	                \
718 } STMT_END
719 
720 /*
721  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
722  */
723 #define	vFAIL3(m,a1,a2) STMT_START {			\
724     if (!SIZE_ONLY)					\
725 	SAVEFREESV(RExC_rx_sv);				\
726     Simple_vFAIL3(m, a1, a2);				\
727 } STMT_END
728 
729 /*
730  * Like Simple_vFAIL(), but accepts four arguments.
731  */
732 #define	Simple_vFAIL4(m, a1, a2, a3) STMT_START {		\
733     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3,	\
734 	    REPORT_LOCATION_ARGS(RExC_parse));	                \
735 } STMT_END
736 
737 #define	vFAIL4(m,a1,a2,a3) STMT_START {			\
738     if (!SIZE_ONLY)					\
739 	SAVEFREESV(RExC_rx_sv);				\
740     Simple_vFAIL4(m, a1, a2, a3);			\
741 } STMT_END
742 
743 /* A specialized version of vFAIL2 that works with UTF8f */
744 #define vFAIL2utf8f(m, a1) STMT_START {             \
745     if (!SIZE_ONLY)                                 \
746         SAVEFREESV(RExC_rx_sv);                     \
747     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1,  \
748             REPORT_LOCATION_ARGS(RExC_parse));      \
749 } STMT_END
750 
751 #define vFAIL3utf8f(m, a1, a2) STMT_START {             \
752     if (!SIZE_ONLY)                                     \
753         SAVEFREESV(RExC_rx_sv);                         \
754     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2,  \
755             REPORT_LOCATION_ARGS(RExC_parse));          \
756 } STMT_END
757 
758 /* These have asserts in them because of [perl #122671] Many warnings in
759  * regcomp.c can occur twice.  If they get output in pass1 and later in that
760  * pass, the pattern has to be converted to UTF-8 and the pass restarted, they
761  * would get output again.  So they should be output in pass2, and these
762  * asserts make sure new warnings follow that paradigm. */
763 
764 /* m is not necessarily a "literal string", in this macro */
765 #define reg_warn_non_literal_string(loc, m) STMT_START {                \
766     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP),           \
767                                        "%s" REPORT_LOCATION,            \
768                                   m, REPORT_LOCATION_ARGS(loc));        \
769 } STMT_END
770 
771 #define	ckWARNreg(loc,m) STMT_START {					\
772     __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),        \
773                                           m REPORT_LOCATION,	        \
774 	                                  REPORT_LOCATION_ARGS(loc));   \
775 } STMT_END
776 
777 #define	vWARN(loc, m) STMT_START {				        \
778     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP),           \
779                                        m REPORT_LOCATION,               \
780                                        REPORT_LOCATION_ARGS(loc));      \
781 } STMT_END
782 
783 #define	vWARN_dep(loc, m) STMT_START {				        \
784     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),       \
785                                        m REPORT_LOCATION,               \
786 	                               REPORT_LOCATION_ARGS(loc));      \
787 } STMT_END
788 
789 #define	ckWARNdep(loc,m) STMT_START {				        \
790     __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),  \
791 	                                    m REPORT_LOCATION,          \
792 	                                    REPORT_LOCATION_ARGS(loc)); \
793 } STMT_END
794 
795 #define	ckWARNregdep(loc,m) STMT_START {				    \
796     __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED,      \
797                                                       WARN_REGEXP),         \
798 	                                     m REPORT_LOCATION,             \
799 	                                     REPORT_LOCATION_ARGS(loc));    \
800 } STMT_END
801 
802 #define	ckWARN2reg_d(loc,m, a1) STMT_START {				    \
803     __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP),          \
804 	                                    m REPORT_LOCATION,              \
805 	                                    a1, REPORT_LOCATION_ARGS(loc)); \
806 } STMT_END
807 
808 #define	ckWARN2reg(loc, m, a1) STMT_START {                                 \
809     __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),            \
810                                           m REPORT_LOCATION,	            \
811                                           a1, REPORT_LOCATION_ARGS(loc));   \
812 } STMT_END
813 
814 #define	vWARN3(loc, m, a1, a2) STMT_START {				    \
815     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP),               \
816                                        m REPORT_LOCATION,                   \
817 	                               a1, a2, REPORT_LOCATION_ARGS(loc));  \
818 } STMT_END
819 
820 #define	ckWARN3reg(loc, m, a1, a2) STMT_START {				    \
821     __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),            \
822                                           m REPORT_LOCATION,                \
823 	                                  a1, a2,                           \
824                                           REPORT_LOCATION_ARGS(loc));       \
825 } STMT_END
826 
827 #define	vWARN4(loc, m, a1, a2, a3) STMT_START {				\
828     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP),           \
829                                        m REPORT_LOCATION,               \
830 	                               a1, a2, a3,                      \
831                                        REPORT_LOCATION_ARGS(loc));      \
832 } STMT_END
833 
834 #define	ckWARN4reg(loc, m, a1, a2, a3) STMT_START {			\
835     __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),        \
836                                           m REPORT_LOCATION,            \
837 	                                  a1, a2, a3,                   \
838                                           REPORT_LOCATION_ARGS(loc));   \
839 } STMT_END
840 
841 #define	vWARN5(loc, m, a1, a2, a3, a4) STMT_START {			\
842     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP),           \
843                                        m REPORT_LOCATION,		\
844 	                               a1, a2, a3, a4,                  \
845                                        REPORT_LOCATION_ARGS(loc));      \
846 } STMT_END
847 
848 /* Macros for recording node offsets.   20001227 mjd@plover.com
849  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
850  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
851  * Element 0 holds the number n.
852  * Position is 1 indexed.
853  */
854 #ifndef RE_TRACK_PATTERN_OFFSETS
855 #define Set_Node_Offset_To_R(node,byte)
856 #define Set_Node_Offset(node,byte)
857 #define Set_Cur_Node_Offset
858 #define Set_Node_Length_To_R(node,len)
859 #define Set_Node_Length(node,len)
860 #define Set_Node_Cur_Length(node,start)
861 #define Node_Offset(n)
862 #define Node_Length(n)
863 #define Set_Node_Offset_Length(node,offset,len)
864 #define ProgLen(ri) ri->u.proglen
865 #define SetProgLen(ri,x) ri->u.proglen = x
866 #else
867 #define ProgLen(ri) ri->u.offsets[0]
868 #define SetProgLen(ri,x) ri->u.offsets[0] = x
869 #define Set_Node_Offset_To_R(node,byte) STMT_START {			\
870     if (! SIZE_ONLY) {							\
871 	MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",		\
872 		    __LINE__, (int)(node), (int)(byte)));		\
873 	if((node) < 0) {						\
874 	    Perl_croak(aTHX_ "value of node is %d in Offset macro",     \
875                                          (int)(node));                  \
876 	} else {							\
877 	    RExC_offsets[2*(node)-1] = (byte);				\
878 	}								\
879     }									\
880 } STMT_END
881 
882 #define Set_Node_Offset(node,byte) \
883     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
884 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
885 
886 #define Set_Node_Length_To_R(node,len) STMT_START {			\
887     if (! SIZE_ONLY) {							\
888 	MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",		\
889 		__LINE__, (int)(node), (int)(len)));			\
890 	if((node) < 0) {						\
891 	    Perl_croak(aTHX_ "value of node is %d in Length macro",     \
892                                          (int)(node));                  \
893 	} else {							\
894 	    RExC_offsets[2*(node)] = (len);				\
895 	}								\
896     }									\
897 } STMT_END
898 
899 #define Set_Node_Length(node,len) \
900     Set_Node_Length_To_R((node)-RExC_emit_start, len)
901 #define Set_Node_Cur_Length(node, start)                \
902     Set_Node_Length(node, RExC_parse - start)
903 
904 /* Get offsets and lengths */
905 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
906 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
907 
908 #define Set_Node_Offset_Length(node,offset,len) STMT_START {	\
909     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));	\
910     Set_Node_Length_To_R((node)-RExC_emit_start, (len));	\
911 } STMT_END
912 #endif
913 
914 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
915 #define EXPERIMENTAL_INPLACESCAN
916 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
917 
918 #ifdef DEBUGGING
919 int
920 Perl_re_printf(pTHX_ const char *fmt, ...)
921 {
922     va_list ap;
923     int result;
924     PerlIO *f= Perl_debug_log;
925     PERL_ARGS_ASSERT_RE_PRINTF;
926     va_start(ap, fmt);
927     result = PerlIO_vprintf(f, fmt, ap);
928     va_end(ap);
929     return result;
930 }
931 
932 int
933 Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...)
934 {
935     va_list ap;
936     int result;
937     PerlIO *f= Perl_debug_log;
938     PERL_ARGS_ASSERT_RE_INDENTF;
939     va_start(ap, depth);
940     PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, "");
941     result = PerlIO_vprintf(f, fmt, ap);
942     va_end(ap);
943     return result;
944 }
945 #endif /* DEBUGGING */
946 
947 #define DEBUG_RExC_seen()                                                   \
948         DEBUG_OPTIMISE_MORE_r({                                             \
949             Perl_re_printf( aTHX_ "RExC_seen: ");                                       \
950                                                                             \
951             if (RExC_seen & REG_ZERO_LEN_SEEN)                              \
952                 Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN ");                            \
953                                                                             \
954             if (RExC_seen & REG_LOOKBEHIND_SEEN)                            \
955                 Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN ");                          \
956                                                                             \
957             if (RExC_seen & REG_GPOS_SEEN)                                  \
958                 Perl_re_printf( aTHX_ "REG_GPOS_SEEN ");                                \
959                                                                             \
960             if (RExC_seen & REG_RECURSE_SEEN)                               \
961                 Perl_re_printf( aTHX_ "REG_RECURSE_SEEN ");                             \
962                                                                             \
963             if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)                    \
964                 Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN ");                  \
965                                                                             \
966             if (RExC_seen & REG_VERBARG_SEEN)                               \
967                 Perl_re_printf( aTHX_ "REG_VERBARG_SEEN ");                             \
968                                                                             \
969             if (RExC_seen & REG_CUTGROUP_SEEN)                              \
970                 Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN ");                            \
971                                                                             \
972             if (RExC_seen & REG_RUN_ON_COMMENT_SEEN)                        \
973                 Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN ");                      \
974                                                                             \
975             if (RExC_seen & REG_UNFOLDED_MULTI_SEEN)                        \
976                 Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN ");                      \
977                                                                             \
978             if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)                  \
979                 Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN ");                \
980                                                                             \
981             Perl_re_printf( aTHX_ "\n");                                                \
982         });
983 
984 #define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
985   if ((flags) & flag) Perl_re_printf( aTHX_  "%s ", #flag)
986 
987 #define DEBUG_SHOW_STUDY_FLAGS(flags,open_str,close_str)                    \
988     if ( ( flags ) ) {                                                      \
989         Perl_re_printf( aTHX_  "%s", open_str);                                         \
990         DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_SEOL);                     \
991         DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_MEOL);                     \
992         DEBUG_SHOW_STUDY_FLAG(flags,SF_IS_INF);                             \
993         DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_PAR);                            \
994         DEBUG_SHOW_STUDY_FLAG(flags,SF_IN_PAR);                             \
995         DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_EVAL);                           \
996         DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_SUBSTR);                         \
997         DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_AND);                    \
998         DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_OR);                     \
999         DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS);                        \
1000         DEBUG_SHOW_STUDY_FLAG(flags,SCF_WHILEM_VISITED_POS);                \
1001         DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_RESTUDY);                      \
1002         DEBUG_SHOW_STUDY_FLAG(flags,SCF_SEEN_ACCEPT);                       \
1003         DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_DOING_RESTUDY);                \
1004         DEBUG_SHOW_STUDY_FLAG(flags,SCF_IN_DEFINE);                         \
1005         Perl_re_printf( aTHX_  "%s", close_str);                                        \
1006     }
1007 
1008 
1009 #define DEBUG_STUDYDATA(str,data,depth)                              \
1010 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
1011     Perl_re_indentf( aTHX_  "" str "Pos:%"IVdf"/%"IVdf                           \
1012         " Flags: 0x%"UVXf,                                           \
1013         depth,                                                       \
1014         (IV)((data)->pos_min),                                       \
1015         (IV)((data)->pos_delta),                                     \
1016         (UV)((data)->flags)                                          \
1017     );                                                               \
1018     DEBUG_SHOW_STUDY_FLAGS((data)->flags," [ ","]");                 \
1019     Perl_re_printf( aTHX_                                                        \
1020         " Whilem_c: %"IVdf" Lcp: %"IVdf" %s",                        \
1021         (IV)((data)->whilem_c),                                      \
1022         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
1023         is_inf ? "INF " : ""                                         \
1024     );                                                               \
1025     if ((data)->last_found)                                          \
1026         Perl_re_printf( aTHX_                                                    \
1027             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
1028             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
1029             SvPVX_const((data)->last_found),                         \
1030             (IV)((data)->last_end),                                  \
1031             (IV)((data)->last_start_min),                            \
1032             (IV)((data)->last_start_max),                            \
1033             ((data)->longest &&                                      \
1034              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
1035             SvPVX_const((data)->longest_fixed),                      \
1036             (IV)((data)->offset_fixed),                              \
1037             ((data)->longest &&                                      \
1038              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
1039             SvPVX_const((data)->longest_float),                      \
1040             (IV)((data)->offset_float_min),                          \
1041             (IV)((data)->offset_float_max)                           \
1042         );                                                           \
1043     Perl_re_printf( aTHX_ "\n");                                                 \
1044 });
1045 
1046 
1047 /* =========================================================
1048  * BEGIN edit_distance stuff.
1049  *
1050  * This calculates how many single character changes of any type are needed to
1051  * transform a string into another one.  It is taken from version 3.1 of
1052  *
1053  * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
1054  */
1055 
1056 /* Our unsorted dictionary linked list.   */
1057 /* Note we use UVs, not chars. */
1058 
1059 struct dictionary{
1060   UV key;
1061   UV value;
1062   struct dictionary* next;
1063 };
1064 typedef struct dictionary item;
1065 
1066 
1067 PERL_STATIC_INLINE item*
1068 push(UV key,item* curr)
1069 {
1070     item* head;
1071     Newxz(head, 1, item);
1072     head->key = key;
1073     head->value = 0;
1074     head->next = curr;
1075     return head;
1076 }
1077 
1078 
1079 PERL_STATIC_INLINE item*
1080 find(item* head, UV key)
1081 {
1082     item* iterator = head;
1083     while (iterator){
1084         if (iterator->key == key){
1085             return iterator;
1086         }
1087         iterator = iterator->next;
1088     }
1089 
1090     return NULL;
1091 }
1092 
1093 PERL_STATIC_INLINE item*
1094 uniquePush(item* head,UV key)
1095 {
1096     item* iterator = head;
1097 
1098     while (iterator){
1099         if (iterator->key == key) {
1100             return head;
1101         }
1102         iterator = iterator->next;
1103     }
1104 
1105     return push(key,head);
1106 }
1107 
1108 PERL_STATIC_INLINE void
1109 dict_free(item* head)
1110 {
1111     item* iterator = head;
1112 
1113     while (iterator) {
1114         item* temp = iterator;
1115         iterator = iterator->next;
1116         Safefree(temp);
1117     }
1118 
1119     head = NULL;
1120 }
1121 
1122 /* End of Dictionary Stuff */
1123 
1124 /* All calculations/work are done here */
1125 STATIC int
1126 S_edit_distance(const UV* src,
1127                 const UV* tgt,
1128                 const STRLEN x,             /* length of src[] */
1129                 const STRLEN y,             /* length of tgt[] */
1130                 const SSize_t maxDistance
1131 )
1132 {
1133     item *head = NULL;
1134     UV swapCount,swapScore,targetCharCount,i,j;
1135     UV *scores;
1136     UV score_ceil = x + y;
1137 
1138     PERL_ARGS_ASSERT_EDIT_DISTANCE;
1139 
1140     /* intialize matrix start values */
1141     Newxz(scores, ( (x + 2) * (y + 2)), UV);
1142     scores[0] = score_ceil;
1143     scores[1 * (y + 2) + 0] = score_ceil;
1144     scores[0 * (y + 2) + 1] = score_ceil;
1145     scores[1 * (y + 2) + 1] = 0;
1146     head = uniquePush(uniquePush(head,src[0]),tgt[0]);
1147 
1148     /* work loops    */
1149     /* i = src index */
1150     /* j = tgt index */
1151     for (i=1;i<=x;i++) {
1152         if (i < x)
1153             head = uniquePush(head,src[i]);
1154         scores[(i+1) * (y + 2) + 1] = i;
1155         scores[(i+1) * (y + 2) + 0] = score_ceil;
1156         swapCount = 0;
1157 
1158         for (j=1;j<=y;j++) {
1159             if (i == 1) {
1160                 if(j < y)
1161                 head = uniquePush(head,tgt[j]);
1162                 scores[1 * (y + 2) + (j + 1)] = j;
1163                 scores[0 * (y + 2) + (j + 1)] = score_ceil;
1164             }
1165 
1166             targetCharCount = find(head,tgt[j-1])->value;
1167             swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
1168 
1169             if (src[i-1] != tgt[j-1]){
1170                 scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
1171             }
1172             else {
1173                 swapCount = j;
1174                 scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
1175             }
1176         }
1177 
1178         find(head,src[i-1])->value = i;
1179     }
1180 
1181     {
1182         IV score = scores[(x+1) * (y + 2) + (y + 1)];
1183         dict_free(head);
1184         Safefree(scores);
1185         return (maxDistance != 0 && maxDistance < score)?(-1):score;
1186     }
1187 }
1188 
1189 /* END of edit_distance() stuff
1190  * ========================================================= */
1191 
1192 /* is c a control character for which we have a mnemonic? */
1193 #define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
1194 
1195 STATIC const char *
1196 S_cntrl_to_mnemonic(const U8 c)
1197 {
1198     /* Returns the mnemonic string that represents character 'c', if one
1199      * exists; NULL otherwise.  The only ones that exist for the purposes of
1200      * this routine are a few control characters */
1201 
1202     switch (c) {
1203         case '\a':       return "\\a";
1204         case '\b':       return "\\b";
1205         case ESC_NATIVE: return "\\e";
1206         case '\f':       return "\\f";
1207         case '\n':       return "\\n";
1208         case '\r':       return "\\r";
1209         case '\t':       return "\\t";
1210     }
1211 
1212     return NULL;
1213 }
1214 
1215 /* Mark that we cannot extend a found fixed substring at this point.
1216    Update the longest found anchored substring and the longest found
1217    floating substrings if needed. */
1218 
1219 STATIC void
1220 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
1221                     SSize_t *minlenp, int is_inf)
1222 {
1223     const STRLEN l = CHR_SVLEN(data->last_found);
1224     const STRLEN old_l = CHR_SVLEN(*data->longest);
1225     GET_RE_DEBUG_FLAGS_DECL;
1226 
1227     PERL_ARGS_ASSERT_SCAN_COMMIT;
1228 
1229     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
1230 	SvSetMagicSV(*data->longest, data->last_found);
1231 	if (*data->longest == data->longest_fixed) {
1232 	    data->offset_fixed = l ? data->last_start_min : data->pos_min;
1233 	    if (data->flags & SF_BEFORE_EOL)
1234 		data->flags
1235 		    |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
1236 	    else
1237 		data->flags &= ~SF_FIX_BEFORE_EOL;
1238 	    data->minlen_fixed=minlenp;
1239 	    data->lookbehind_fixed=0;
1240 	}
1241 	else { /* *data->longest == data->longest_float */
1242 	    data->offset_float_min = l ? data->last_start_min : data->pos_min;
1243 	    data->offset_float_max = (l
1244                           ? data->last_start_max
1245                           : (data->pos_delta > SSize_t_MAX - data->pos_min
1246 					 ? SSize_t_MAX
1247 					 : data->pos_min + data->pos_delta));
1248 	    if (is_inf
1249 		 || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
1250 		data->offset_float_max = SSize_t_MAX;
1251 	    if (data->flags & SF_BEFORE_EOL)
1252 		data->flags
1253 		    |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
1254 	    else
1255 		data->flags &= ~SF_FL_BEFORE_EOL;
1256             data->minlen_float=minlenp;
1257             data->lookbehind_float=0;
1258 	}
1259     }
1260     SvCUR_set(data->last_found, 0);
1261     {
1262 	SV * const sv = data->last_found;
1263 	if (SvUTF8(sv) && SvMAGICAL(sv)) {
1264 	    MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
1265 	    if (mg)
1266 		mg->mg_len = 0;
1267 	}
1268     }
1269     data->last_end = -1;
1270     data->flags &= ~SF_BEFORE_EOL;
1271     DEBUG_STUDYDATA("commit: ",data,0);
1272 }
1273 
1274 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
1275  * list that describes which code points it matches */
1276 
1277 STATIC void
1278 S_ssc_anything(pTHX_ regnode_ssc *ssc)
1279 {
1280     /* Set the SSC 'ssc' to match an empty string or any code point */
1281 
1282     PERL_ARGS_ASSERT_SSC_ANYTHING;
1283 
1284     assert(is_ANYOF_SYNTHETIC(ssc));
1285 
1286     ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
1287     _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
1288     ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING;  /* Plus matches empty */
1289 }
1290 
1291 STATIC int
1292 S_ssc_is_anything(const regnode_ssc *ssc)
1293 {
1294     /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
1295      * point; FALSE otherwise.  Thus, this is used to see if using 'ssc' buys
1296      * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
1297      * in any way, so there's no point in using it */
1298 
1299     UV start, end;
1300     bool ret;
1301 
1302     PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
1303 
1304     assert(is_ANYOF_SYNTHETIC(ssc));
1305 
1306     if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
1307         return FALSE;
1308     }
1309 
1310     /* See if the list consists solely of the range 0 - Infinity */
1311     invlist_iterinit(ssc->invlist);
1312     ret = invlist_iternext(ssc->invlist, &start, &end)
1313           && start == 0
1314           && end == UV_MAX;
1315 
1316     invlist_iterfinish(ssc->invlist);
1317 
1318     if (ret) {
1319         return TRUE;
1320     }
1321 
1322     /* If e.g., both \w and \W are set, matches everything */
1323     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1324         int i;
1325         for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1326             if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1327                 return TRUE;
1328             }
1329         }
1330     }
1331 
1332     return FALSE;
1333 }
1334 
1335 STATIC void
1336 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
1337 {
1338     /* Initializes the SSC 'ssc'.  This includes setting it to match an empty
1339      * string, any code point, or any posix class under locale */
1340 
1341     PERL_ARGS_ASSERT_SSC_INIT;
1342 
1343     Zero(ssc, 1, regnode_ssc);
1344     set_ANYOF_SYNTHETIC(ssc);
1345     ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
1346     ssc_anything(ssc);
1347 
1348     /* If any portion of the regex is to operate under locale rules that aren't
1349      * fully known at compile time, initialization includes it.  The reason
1350      * this isn't done for all regexes is that the optimizer was written under
1351      * the assumption that locale was all-or-nothing.  Given the complexity and
1352      * lack of documentation in the optimizer, and that there are inadequate
1353      * test cases for locale, many parts of it may not work properly, it is
1354      * safest to avoid locale unless necessary. */
1355     if (RExC_contains_locale) {
1356 	ANYOF_POSIXL_SETALL(ssc);
1357     }
1358     else {
1359 	ANYOF_POSIXL_ZERO(ssc);
1360     }
1361 }
1362 
1363 STATIC int
1364 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1365                         const regnode_ssc *ssc)
1366 {
1367     /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1368      * to the list of code points matched, and locale posix classes; hence does
1369      * not check its flags) */
1370 
1371     UV start, end;
1372     bool ret;
1373 
1374     PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1375 
1376     assert(is_ANYOF_SYNTHETIC(ssc));
1377 
1378     invlist_iterinit(ssc->invlist);
1379     ret = invlist_iternext(ssc->invlist, &start, &end)
1380           && start == 0
1381           && end == UV_MAX;
1382 
1383     invlist_iterfinish(ssc->invlist);
1384 
1385     if (! ret) {
1386         return FALSE;
1387     }
1388 
1389     if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
1390         return FALSE;
1391     }
1392 
1393     return TRUE;
1394 }
1395 
1396 STATIC SV*
1397 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1398                                const regnode_charclass* const node)
1399 {
1400     /* Returns a mortal inversion list defining which code points are matched
1401      * by 'node', which is of type ANYOF.  Handles complementing the result if
1402      * appropriate.  If some code points aren't knowable at this time, the
1403      * returned list must, and will, contain every code point that is a
1404      * possibility. */
1405 
1406     SV* invlist = NULL;
1407     SV* only_utf8_locale_invlist = NULL;
1408     unsigned int i;
1409     const U32 n = ARG(node);
1410     bool new_node_has_latin1 = FALSE;
1411 
1412     PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1413 
1414     /* Look at the data structure created by S_set_ANYOF_arg() */
1415     if (n != ANYOF_ONLY_HAS_BITMAP) {
1416         SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1417         AV * const av = MUTABLE_AV(SvRV(rv));
1418         SV **const ary = AvARRAY(av);
1419         assert(RExC_rxi->data->what[n] == 's');
1420 
1421         if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1422             invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1423         }
1424         else if (ary[0] && ary[0] != &PL_sv_undef) {
1425 
1426             /* Here, no compile-time swash, and there are things that won't be
1427              * known until runtime -- we have to assume it could be anything */
1428             invlist = sv_2mortal(_new_invlist(1));
1429             return _add_range_to_invlist(invlist, 0, UV_MAX);
1430         }
1431         else if (ary[3] && ary[3] != &PL_sv_undef) {
1432 
1433             /* Here no compile-time swash, and no run-time only data.  Use the
1434              * node's inversion list */
1435             invlist = sv_2mortal(invlist_clone(ary[3]));
1436         }
1437 
1438         /* Get the code points valid only under UTF-8 locales */
1439         if ((ANYOF_FLAGS(node) & ANYOFL_FOLD)
1440             && ary[2] && ary[2] != &PL_sv_undef)
1441         {
1442             only_utf8_locale_invlist = ary[2];
1443         }
1444     }
1445 
1446     if (! invlist) {
1447         invlist = sv_2mortal(_new_invlist(0));
1448     }
1449 
1450     /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1451      * code points, and an inversion list for the others, but if there are code
1452      * points that should match only conditionally on the target string being
1453      * UTF-8, those are placed in the inversion list, and not the bitmap.
1454      * Since there are circumstances under which they could match, they are
1455      * included in the SSC.  But if the ANYOF node is to be inverted, we have
1456      * to exclude them here, so that when we invert below, the end result
1457      * actually does include them.  (Think about "\xe0" =~ /[^\xc0]/di;).  We
1458      * have to do this here before we add the unconditionally matched code
1459      * points */
1460     if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1461         _invlist_intersection_complement_2nd(invlist,
1462                                              PL_UpperLatin1,
1463                                              &invlist);
1464     }
1465 
1466     /* Add in the points from the bit map */
1467     for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1468         if (ANYOF_BITMAP_TEST(node, i)) {
1469             unsigned int start = i++;
1470 
1471             for (; i < NUM_ANYOF_CODE_POINTS && ANYOF_BITMAP_TEST(node, i); ++i) {
1472                 /* empty */
1473             }
1474             invlist = _add_range_to_invlist(invlist, start, i-1);
1475             new_node_has_latin1 = TRUE;
1476         }
1477     }
1478 
1479     /* If this can match all upper Latin1 code points, have to add them
1480      * as well.  But don't add them if inverting, as when that gets done below,
1481      * it would exclude all these characters, including the ones it shouldn't
1482      * that were added just above */
1483     if (! (ANYOF_FLAGS(node) & ANYOF_INVERT) && OP(node) == ANYOFD
1484         && (ANYOF_FLAGS(node) & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
1485     {
1486         _invlist_union(invlist, PL_UpperLatin1, &invlist);
1487     }
1488 
1489     /* Similarly for these */
1490     if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
1491         _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
1492     }
1493 
1494     if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1495         _invlist_invert(invlist);
1496     }
1497     else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOFL_FOLD) {
1498 
1499         /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1500          * locale.  We can skip this if there are no 0-255 at all. */
1501         _invlist_union(invlist, PL_Latin1, &invlist);
1502     }
1503 
1504     /* Similarly add the UTF-8 locale possible matches.  These have to be
1505      * deferred until after the non-UTF-8 locale ones are taken care of just
1506      * above, or it leads to wrong results under ANYOF_INVERT */
1507     if (only_utf8_locale_invlist) {
1508         _invlist_union_maybe_complement_2nd(invlist,
1509                                             only_utf8_locale_invlist,
1510                                             ANYOF_FLAGS(node) & ANYOF_INVERT,
1511                                             &invlist);
1512     }
1513 
1514     return invlist;
1515 }
1516 
1517 /* These two functions currently do the exact same thing */
1518 #define ssc_init_zero		ssc_init
1519 
1520 #define ssc_add_cp(ssc, cp)   ssc_add_range((ssc), (cp), (cp))
1521 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1522 
1523 /* 'AND' a given class with another one.  Can create false positives.  'ssc'
1524  * should not be inverted.  'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1525  * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1526 
1527 STATIC void
1528 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1529                 const regnode_charclass *and_with)
1530 {
1531     /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1532      * another SSC or a regular ANYOF class.  Can create false positives. */
1533 
1534     SV* anded_cp_list;
1535     U8  anded_flags;
1536 
1537     PERL_ARGS_ASSERT_SSC_AND;
1538 
1539     assert(is_ANYOF_SYNTHETIC(ssc));
1540 
1541     /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1542      * the code point inversion list and just the relevant flags */
1543     if (is_ANYOF_SYNTHETIC(and_with)) {
1544         anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1545         anded_flags = ANYOF_FLAGS(and_with);
1546 
1547         /* XXX This is a kludge around what appears to be deficiencies in the
1548          * optimizer.  If we make S_ssc_anything() add in the WARN_SUPER flag,
1549          * there are paths through the optimizer where it doesn't get weeded
1550          * out when it should.  And if we don't make some extra provision for
1551          * it like the code just below, it doesn't get added when it should.
1552          * This solution is to add it only when AND'ing, which is here, and
1553          * only when what is being AND'ed is the pristine, original node
1554          * matching anything.  Thus it is like adding it to ssc_anything() but
1555          * only when the result is to be AND'ed.  Probably the same solution
1556          * could be adopted for the same problem we have with /l matching,
1557          * which is solved differently in S_ssc_init(), and that would lead to
1558          * fewer false positives than that solution has.  But if this solution
1559          * creates bugs, the consequences are only that a warning isn't raised
1560          * that should be; while the consequences for having /l bugs is
1561          * incorrect matches */
1562         if (ssc_is_anything((regnode_ssc *)and_with)) {
1563             anded_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
1564         }
1565     }
1566     else {
1567         anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1568         if (OP(and_with) == ANYOFD) {
1569             anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1570         }
1571         else {
1572             anded_flags = ANYOF_FLAGS(and_with)
1573             &( ANYOF_COMMON_FLAGS
1574               |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1575               |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
1576             if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(and_with))) {
1577                 anded_flags &=
1578                     ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1579             }
1580         }
1581     }
1582 
1583     ANYOF_FLAGS(ssc) &= anded_flags;
1584 
1585     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1586      * C2 is the list of code points in 'and-with'; P2, its posix classes.
1587      * 'and_with' may be inverted.  When not inverted, we have the situation of
1588      * computing:
1589      *  (C1 | P1) & (C2 | P2)
1590      *                     =  (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1591      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1592      *                    <=  ((C1 & C2) |       P2)) | ( P1       | (P1 & P2))
1593      *                    <=  ((C1 & C2) | P1 | P2)
1594      * Alternatively, the last few steps could be:
1595      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1596      *                    <=  ((C1 & C2) |  C1      ) | (      C2  | (P1 & P2))
1597      *                    <=  (C1 | C2 | (P1 & P2))
1598      * We favor the second approach if either P1 or P2 is non-empty.  This is
1599      * because these components are a barrier to doing optimizations, as what
1600      * they match cannot be known until the moment of matching as they are
1601      * dependent on the current locale, 'AND"ing them likely will reduce or
1602      * eliminate them.
1603      * But we can do better if we know that C1,P1 are in their initial state (a
1604      * frequent occurrence), each matching everything:
1605      *  (<everything>) & (C2 | P2) =  C2 | P2
1606      * Similarly, if C2,P2 are in their initial state (again a frequent
1607      * occurrence), the result is a no-op
1608      *  (C1 | P1) & (<everything>) =  C1 | P1
1609      *
1610      * Inverted, we have
1611      *  (C1 | P1) & ~(C2 | P2)  =  (C1 | P1) & (~C2 & ~P2)
1612      *                          =  (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1613      *                         <=  (C1 & ~C2) | (P1 & ~P2)
1614      * */
1615 
1616     if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1617         && ! is_ANYOF_SYNTHETIC(and_with))
1618     {
1619         unsigned int i;
1620 
1621         ssc_intersection(ssc,
1622                          anded_cp_list,
1623                          FALSE /* Has already been inverted */
1624                          );
1625 
1626         /* If either P1 or P2 is empty, the intersection will be also; can skip
1627          * the loop */
1628         if (! (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL)) {
1629             ANYOF_POSIXL_ZERO(ssc);
1630         }
1631         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1632 
1633             /* Note that the Posix class component P from 'and_with' actually
1634              * looks like:
1635              *      P = Pa | Pb | ... | Pn
1636              * where each component is one posix class, such as in [\w\s].
1637              * Thus
1638              *      ~P = ~(Pa | Pb | ... | Pn)
1639              *         = ~Pa & ~Pb & ... & ~Pn
1640              *        <= ~Pa | ~Pb | ... | ~Pn
1641              * The last is something we can easily calculate, but unfortunately
1642              * is likely to have many false positives.  We could do better
1643              * in some (but certainly not all) instances if two classes in
1644              * P have known relationships.  For example
1645              *      :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1646              * So
1647              *      :lower: & :print: = :lower:
1648              * And similarly for classes that must be disjoint.  For example,
1649              * since \s and \w can have no elements in common based on rules in
1650              * the POSIX standard,
1651              *      \w & ^\S = nothing
1652              * Unfortunately, some vendor locales do not meet the Posix
1653              * standard, in particular almost everything by Microsoft.
1654              * The loop below just changes e.g., \w into \W and vice versa */
1655 
1656             regnode_charclass_posixl temp;
1657             int add = 1;    /* To calculate the index of the complement */
1658 
1659             ANYOF_POSIXL_ZERO(&temp);
1660             for (i = 0; i < ANYOF_MAX; i++) {
1661                 assert(i % 2 != 0
1662                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1663                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1664 
1665                 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1666                     ANYOF_POSIXL_SET(&temp, i + add);
1667                 }
1668                 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1669             }
1670             ANYOF_POSIXL_AND(&temp, ssc);
1671 
1672         } /* else ssc already has no posixes */
1673     } /* else: Not inverted.  This routine is a no-op if 'and_with' is an SSC
1674          in its initial state */
1675     else if (! is_ANYOF_SYNTHETIC(and_with)
1676              || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1677     {
1678         /* But if 'ssc' is in its initial state, the result is just 'and_with';
1679          * copy it over 'ssc' */
1680         if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1681             if (is_ANYOF_SYNTHETIC(and_with)) {
1682                 StructCopy(and_with, ssc, regnode_ssc);
1683             }
1684             else {
1685                 ssc->invlist = anded_cp_list;
1686                 ANYOF_POSIXL_ZERO(ssc);
1687                 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1688                     ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1689                 }
1690             }
1691         }
1692         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1693                  || (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL))
1694         {
1695             /* One or the other of P1, P2 is non-empty. */
1696             if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1697                 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1698             }
1699             ssc_union(ssc, anded_cp_list, FALSE);
1700         }
1701         else { /* P1 = P2 = empty */
1702             ssc_intersection(ssc, anded_cp_list, FALSE);
1703         }
1704     }
1705 }
1706 
1707 STATIC void
1708 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1709                const regnode_charclass *or_with)
1710 {
1711     /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1712      * another SSC or a regular ANYOF class.  Can create false positives if
1713      * 'or_with' is to be inverted. */
1714 
1715     SV* ored_cp_list;
1716     U8 ored_flags;
1717 
1718     PERL_ARGS_ASSERT_SSC_OR;
1719 
1720     assert(is_ANYOF_SYNTHETIC(ssc));
1721 
1722     /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1723      * the code point inversion list and just the relevant flags */
1724     if (is_ANYOF_SYNTHETIC(or_with)) {
1725         ored_cp_list = ((regnode_ssc*) or_with)->invlist;
1726         ored_flags = ANYOF_FLAGS(or_with);
1727     }
1728     else {
1729         ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
1730         ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
1731         if (OP(or_with) != ANYOFD) {
1732             ored_flags
1733             |= ANYOF_FLAGS(or_with)
1734              & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1735                 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
1736             if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(or_with))) {
1737                 ored_flags |=
1738                     ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1739             }
1740         }
1741     }
1742 
1743     ANYOF_FLAGS(ssc) |= ored_flags;
1744 
1745     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1746      * C2 is the list of code points in 'or-with'; P2, its posix classes.
1747      * 'or_with' may be inverted.  When not inverted, we have the simple
1748      * situation of computing:
1749      *  (C1 | P1) | (C2 | P2)  =  (C1 | C2) | (P1 | P2)
1750      * If P1|P2 yields a situation with both a class and its complement are
1751      * set, like having both \w and \W, this matches all code points, and we
1752      * can delete these from the P component of the ssc going forward.  XXX We
1753      * might be able to delete all the P components, but I (khw) am not certain
1754      * about this, and it is better to be safe.
1755      *
1756      * Inverted, we have
1757      *  (C1 | P1) | ~(C2 | P2)  =  (C1 | P1) | (~C2 & ~P2)
1758      *                         <=  (C1 | P1) | ~C2
1759      *                         <=  (C1 | ~C2) | P1
1760      * (which results in actually simpler code than the non-inverted case)
1761      * */
1762 
1763     if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1764         && ! is_ANYOF_SYNTHETIC(or_with))
1765     {
1766         /* We ignore P2, leaving P1 going forward */
1767     }   /* else  Not inverted */
1768     else if (ANYOF_FLAGS(or_with) & ANYOF_MATCHES_POSIXL) {
1769         ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
1770         if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1771             unsigned int i;
1772             for (i = 0; i < ANYOF_MAX; i += 2) {
1773                 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1774                 {
1775                     ssc_match_all_cp(ssc);
1776                     ANYOF_POSIXL_CLEAR(ssc, i);
1777                     ANYOF_POSIXL_CLEAR(ssc, i+1);
1778                 }
1779             }
1780         }
1781     }
1782 
1783     ssc_union(ssc,
1784               ored_cp_list,
1785               FALSE /* Already has been inverted */
1786               );
1787 }
1788 
1789 PERL_STATIC_INLINE void
1790 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1791 {
1792     PERL_ARGS_ASSERT_SSC_UNION;
1793 
1794     assert(is_ANYOF_SYNTHETIC(ssc));
1795 
1796     _invlist_union_maybe_complement_2nd(ssc->invlist,
1797                                         invlist,
1798                                         invert2nd,
1799                                         &ssc->invlist);
1800 }
1801 
1802 PERL_STATIC_INLINE void
1803 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1804                          SV* const invlist,
1805                          const bool invert2nd)
1806 {
1807     PERL_ARGS_ASSERT_SSC_INTERSECTION;
1808 
1809     assert(is_ANYOF_SYNTHETIC(ssc));
1810 
1811     _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1812                                                invlist,
1813                                                invert2nd,
1814                                                &ssc->invlist);
1815 }
1816 
1817 PERL_STATIC_INLINE void
1818 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1819 {
1820     PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1821 
1822     assert(is_ANYOF_SYNTHETIC(ssc));
1823 
1824     ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1825 }
1826 
1827 PERL_STATIC_INLINE void
1828 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1829 {
1830     /* AND just the single code point 'cp' into the SSC 'ssc' */
1831 
1832     SV* cp_list = _new_invlist(2);
1833 
1834     PERL_ARGS_ASSERT_SSC_CP_AND;
1835 
1836     assert(is_ANYOF_SYNTHETIC(ssc));
1837 
1838     cp_list = add_cp_to_invlist(cp_list, cp);
1839     ssc_intersection(ssc, cp_list,
1840                      FALSE /* Not inverted */
1841                      );
1842     SvREFCNT_dec_NN(cp_list);
1843 }
1844 
1845 PERL_STATIC_INLINE void
1846 S_ssc_clear_locale(regnode_ssc *ssc)
1847 {
1848     /* Set the SSC 'ssc' to not match any locale things */
1849     PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1850 
1851     assert(is_ANYOF_SYNTHETIC(ssc));
1852 
1853     ANYOF_POSIXL_ZERO(ssc);
1854     ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1855 }
1856 
1857 #define NON_OTHER_COUNT   NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
1858 
1859 STATIC bool
1860 S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
1861 {
1862     /* The synthetic start class is used to hopefully quickly winnow down
1863      * places where a pattern could start a match in the target string.  If it
1864      * doesn't really narrow things down that much, there isn't much point to
1865      * having the overhead of using it.  This function uses some very crude
1866      * heuristics to decide if to use the ssc or not.
1867      *
1868      * It returns TRUE if 'ssc' rules out more than half what it considers to
1869      * be the "likely" possible matches, but of course it doesn't know what the
1870      * actual things being matched are going to be; these are only guesses
1871      *
1872      * For /l matches, it assumes that the only likely matches are going to be
1873      *      in the 0-255 range, uniformly distributed, so half of that is 127
1874      * For /a and /d matches, it assumes that the likely matches will be just
1875      *      the ASCII range, so half of that is 63
1876      * For /u and there isn't anything matching above the Latin1 range, it
1877      *      assumes that that is the only range likely to be matched, and uses
1878      *      half that as the cut-off: 127.  If anything matches above Latin1,
1879      *      it assumes that all of Unicode could match (uniformly), except for
1880      *      non-Unicode code points and things in the General Category "Other"
1881      *      (unassigned, private use, surrogates, controls and formats).  This
1882      *      is a much large number. */
1883 
1884     U32 count = 0;      /* Running total of number of code points matched by
1885                            'ssc' */
1886     UV start, end;      /* Start and end points of current range in inversion
1887                            list */
1888     const U32 max_code_points = (LOC)
1889                                 ?  256
1890                                 : ((   ! UNI_SEMANTICS
1891                                      || invlist_highest(ssc->invlist) < 256)
1892                                   ? 128
1893                                   : NON_OTHER_COUNT);
1894     const U32 max_match = max_code_points / 2;
1895 
1896     PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
1897 
1898     invlist_iterinit(ssc->invlist);
1899     while (invlist_iternext(ssc->invlist, &start, &end)) {
1900         if (start >= max_code_points) {
1901             break;
1902         }
1903         end = MIN(end, max_code_points - 1);
1904         count += end - start + 1;
1905         if (count >= max_match) {
1906             invlist_iterfinish(ssc->invlist);
1907             return FALSE;
1908         }
1909     }
1910 
1911     return TRUE;
1912 }
1913 
1914 
1915 STATIC void
1916 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1917 {
1918     /* The inversion list in the SSC is marked mortal; now we need a more
1919      * permanent copy, which is stored the same way that is done in a regular
1920      * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
1921      * map */
1922 
1923     SV* invlist = invlist_clone(ssc->invlist);
1924 
1925     PERL_ARGS_ASSERT_SSC_FINALIZE;
1926 
1927     assert(is_ANYOF_SYNTHETIC(ssc));
1928 
1929     /* The code in this file assumes that all but these flags aren't relevant
1930      * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
1931      * by the time we reach here */
1932     assert(! (ANYOF_FLAGS(ssc)
1933         & ~( ANYOF_COMMON_FLAGS
1934             |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1935             |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)));
1936 
1937     populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1938 
1939     set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
1940                                 NULL, NULL, NULL, FALSE);
1941 
1942     /* Make sure is clone-safe */
1943     ssc->invlist = NULL;
1944 
1945     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1946         ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
1947     }
1948 
1949     if (RExC_contains_locale) {
1950         OP(ssc) = ANYOFL;
1951     }
1952 
1953     assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
1954 }
1955 
1956 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1957 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
1958 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1959 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list         \
1960                                ? (TRIE_LIST_CUR( idx ) - 1)           \
1961                                : 0 )
1962 
1963 
1964 #ifdef DEBUGGING
1965 /*
1966    dump_trie(trie,widecharmap,revcharmap)
1967    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1968    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1969 
1970    These routines dump out a trie in a somewhat readable format.
1971    The _interim_ variants are used for debugging the interim
1972    tables that are used to generate the final compressed
1973    representation which is what dump_trie expects.
1974 
1975    Part of the reason for their existence is to provide a form
1976    of documentation as to how the different representations function.
1977 
1978 */
1979 
1980 /*
1981   Dumps the final compressed table form of the trie to Perl_debug_log.
1982   Used for debugging make_trie().
1983 */
1984 
1985 STATIC void
1986 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1987 	    AV *revcharmap, U32 depth)
1988 {
1989     U32 state;
1990     SV *sv=sv_newmortal();
1991     int colwidth= widecharmap ? 6 : 4;
1992     U16 word;
1993     GET_RE_DEBUG_FLAGS_DECL;
1994 
1995     PERL_ARGS_ASSERT_DUMP_TRIE;
1996 
1997     Perl_re_indentf( aTHX_  "Char : %-6s%-6s%-4s ",
1998         depth+1, "Match","Base","Ofs" );
1999 
2000     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2001 	SV ** const tmp = av_fetch( revcharmap, state, 0);
2002         if ( tmp ) {
2003             Perl_re_printf( aTHX_  "%*s",
2004                 colwidth,
2005                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
2006 	                    PL_colors[0], PL_colors[1],
2007 	                    (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2008 	                    PERL_PV_ESCAPE_FIRSTCHAR
2009                 )
2010             );
2011         }
2012     }
2013     Perl_re_printf( aTHX_  "\n");
2014     Perl_re_indentf( aTHX_ "State|-----------------------", depth+1);
2015 
2016     for( state = 0 ; state < trie->uniquecharcount ; state++ )
2017         Perl_re_printf( aTHX_  "%.*s", colwidth, "--------");
2018     Perl_re_printf( aTHX_  "\n");
2019 
2020     for( state = 1 ; state < trie->statecount ; state++ ) {
2021 	const U32 base = trie->states[ state ].trans.base;
2022 
2023         Perl_re_indentf( aTHX_  "#%4"UVXf"|", depth+1, (UV)state);
2024 
2025         if ( trie->states[ state ].wordnum ) {
2026             Perl_re_printf( aTHX_  " W%4X", trie->states[ state ].wordnum );
2027         } else {
2028             Perl_re_printf( aTHX_  "%6s", "" );
2029         }
2030 
2031         Perl_re_printf( aTHX_  " @%4"UVXf" ", (UV)base );
2032 
2033         if ( base ) {
2034             U32 ofs = 0;
2035 
2036             while( ( base + ofs  < trie->uniquecharcount ) ||
2037                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
2038                      && trie->trans[ base + ofs - trie->uniquecharcount ].check
2039                                                                     != state))
2040                     ofs++;
2041 
2042             Perl_re_printf( aTHX_  "+%2"UVXf"[ ", (UV)ofs);
2043 
2044             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2045                 if ( ( base + ofs >= trie->uniquecharcount )
2046                         && ( base + ofs - trie->uniquecharcount
2047                                                         < trie->lasttrans )
2048                         && trie->trans[ base + ofs
2049                                     - trie->uniquecharcount ].check == state )
2050                 {
2051                    Perl_re_printf( aTHX_  "%*"UVXf, colwidth,
2052                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next
2053                    );
2054                 } else {
2055                     Perl_re_printf( aTHX_  "%*s",colwidth,"   ." );
2056                 }
2057             }
2058 
2059             Perl_re_printf( aTHX_  "]");
2060 
2061         }
2062         Perl_re_printf( aTHX_  "\n" );
2063     }
2064     Perl_re_indentf( aTHX_  "word_info N:(prev,len)=",
2065                                 depth);
2066     for (word=1; word <= trie->wordcount; word++) {
2067         Perl_re_printf( aTHX_  " %d:(%d,%d)",
2068 	    (int)word, (int)(trie->wordinfo[word].prev),
2069 	    (int)(trie->wordinfo[word].len));
2070     }
2071     Perl_re_printf( aTHX_  "\n" );
2072 }
2073 /*
2074   Dumps a fully constructed but uncompressed trie in list form.
2075   List tries normally only are used for construction when the number of
2076   possible chars (trie->uniquecharcount) is very high.
2077   Used for debugging make_trie().
2078 */
2079 STATIC void
2080 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2081 			 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2082 			 U32 depth)
2083 {
2084     U32 state;
2085     SV *sv=sv_newmortal();
2086     int colwidth= widecharmap ? 6 : 4;
2087     GET_RE_DEBUG_FLAGS_DECL;
2088 
2089     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
2090 
2091     /* print out the table precompression.  */
2092     Perl_re_indentf( aTHX_  "State :Word | Transition Data\n",
2093             depth+1 );
2094     Perl_re_indentf( aTHX_  "%s",
2095             depth+1, "------:-----+-----------------\n" );
2096 
2097     for( state=1 ; state < next_alloc ; state ++ ) {
2098         U16 charid;
2099 
2100         Perl_re_indentf( aTHX_  " %4"UVXf" :",
2101             depth+1, (UV)state  );
2102         if ( ! trie->states[ state ].wordnum ) {
2103             Perl_re_printf( aTHX_  "%5s| ","");
2104         } else {
2105             Perl_re_printf( aTHX_  "W%4x| ",
2106                 trie->states[ state ].wordnum
2107             );
2108         }
2109         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2110 	    SV ** const tmp = av_fetch( revcharmap,
2111                                         TRIE_LIST_ITEM(state,charid).forid, 0);
2112 	    if ( tmp ) {
2113                 Perl_re_printf( aTHX_  "%*s:%3X=%4"UVXf" | ",
2114                     colwidth,
2115                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
2116                               colwidth,
2117                               PL_colors[0], PL_colors[1],
2118                               (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
2119                               | PERL_PV_ESCAPE_FIRSTCHAR
2120                     ) ,
2121                     TRIE_LIST_ITEM(state,charid).forid,
2122                     (UV)TRIE_LIST_ITEM(state,charid).newstate
2123                 );
2124                 if (!(charid % 10))
2125                     Perl_re_printf( aTHX_  "\n%*s| ",
2126                         (int)((depth * 2) + 14), "");
2127             }
2128         }
2129         Perl_re_printf( aTHX_  "\n");
2130     }
2131 }
2132 
2133 /*
2134   Dumps a fully constructed but uncompressed trie in table form.
2135   This is the normal DFA style state transition table, with a few
2136   twists to facilitate compression later.
2137   Used for debugging make_trie().
2138 */
2139 STATIC void
2140 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2141 			  HV *widecharmap, AV *revcharmap, U32 next_alloc,
2142 			  U32 depth)
2143 {
2144     U32 state;
2145     U16 charid;
2146     SV *sv=sv_newmortal();
2147     int colwidth= widecharmap ? 6 : 4;
2148     GET_RE_DEBUG_FLAGS_DECL;
2149 
2150     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
2151 
2152     /*
2153        print out the table precompression so that we can do a visual check
2154        that they are identical.
2155      */
2156 
2157     Perl_re_indentf( aTHX_  "Char : ", depth+1 );
2158 
2159     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2160 	SV ** const tmp = av_fetch( revcharmap, charid, 0);
2161         if ( tmp ) {
2162             Perl_re_printf( aTHX_  "%*s",
2163                 colwidth,
2164                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
2165 	                    PL_colors[0], PL_colors[1],
2166 	                    (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2167 	                    PERL_PV_ESCAPE_FIRSTCHAR
2168                 )
2169             );
2170         }
2171     }
2172 
2173     Perl_re_printf( aTHX_ "\n");
2174     Perl_re_indentf( aTHX_  "State+-", depth+1 );
2175 
2176     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
2177         Perl_re_printf( aTHX_  "%.*s", colwidth,"--------");
2178     }
2179 
2180     Perl_re_printf( aTHX_  "\n" );
2181 
2182     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
2183 
2184         Perl_re_indentf( aTHX_  "%4"UVXf" : ",
2185             depth+1,
2186             (UV)TRIE_NODENUM( state ) );
2187 
2188         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2189             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
2190             if (v)
2191                 Perl_re_printf( aTHX_  "%*"UVXf, colwidth, v );
2192             else
2193                 Perl_re_printf( aTHX_  "%*s", colwidth, "." );
2194         }
2195         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
2196             Perl_re_printf( aTHX_  " (%4"UVXf")\n",
2197                                             (UV)trie->trans[ state ].check );
2198         } else {
2199             Perl_re_printf( aTHX_  " (%4"UVXf") W%4X\n",
2200                                             (UV)trie->trans[ state ].check,
2201             trie->states[ TRIE_NODENUM( state ) ].wordnum );
2202         }
2203     }
2204 }
2205 
2206 #endif
2207 
2208 
2209 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
2210   startbranch: the first branch in the whole branch sequence
2211   first      : start branch of sequence of branch-exact nodes.
2212 	       May be the same as startbranch
2213   last       : Thing following the last branch.
2214 	       May be the same as tail.
2215   tail       : item following the branch sequence
2216   count      : words in the sequence
2217   flags      : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
2218   depth      : indent depth
2219 
2220 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
2221 
2222 A trie is an N'ary tree where the branches are determined by digital
2223 decomposition of the key. IE, at the root node you look up the 1st character and
2224 follow that branch repeat until you find the end of the branches. Nodes can be
2225 marked as "accepting" meaning they represent a complete word. Eg:
2226 
2227   /he|she|his|hers/
2228 
2229 would convert into the following structure. Numbers represent states, letters
2230 following numbers represent valid transitions on the letter from that state, if
2231 the number is in square brackets it represents an accepting state, otherwise it
2232 will be in parenthesis.
2233 
2234       +-h->+-e->[3]-+-r->(8)-+-s->[9]
2235       |    |
2236       |   (2)
2237       |    |
2238      (1)   +-i->(6)-+-s->[7]
2239       |
2240       +-s->(3)-+-h->(4)-+-e->[5]
2241 
2242       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
2243 
2244 This shows that when matching against the string 'hers' we will begin at state 1
2245 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
2246 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
2247 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
2248 single traverse. We store a mapping from accepting to state to which word was
2249 matched, and then when we have multiple possibilities we try to complete the
2250 rest of the regex in the order in which they occurred in the alternation.
2251 
2252 The only prior NFA like behaviour that would be changed by the TRIE support is
2253 the silent ignoring of duplicate alternations which are of the form:
2254 
2255  / (DUPE|DUPE) X? (?{ ... }) Y /x
2256 
2257 Thus EVAL blocks following a trie may be called a different number of times with
2258 and without the optimisation. With the optimisations dupes will be silently
2259 ignored. This inconsistent behaviour of EVAL type nodes is well established as
2260 the following demonstrates:
2261 
2262  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
2263 
2264 which prints out 'word' three times, but
2265 
2266  'words'=~/(word|word|word)(?{ print $1 })S/
2267 
2268 which doesnt print it out at all. This is due to other optimisations kicking in.
2269 
2270 Example of what happens on a structural level:
2271 
2272 The regexp /(ac|ad|ab)+/ will produce the following debug output:
2273 
2274    1: CURLYM[1] {1,32767}(18)
2275    5:   BRANCH(8)
2276    6:     EXACT <ac>(16)
2277    8:   BRANCH(11)
2278    9:     EXACT <ad>(16)
2279   11:   BRANCH(14)
2280   12:     EXACT <ab>(16)
2281   16:   SUCCEED(0)
2282   17:   NOTHING(18)
2283   18: END(0)
2284 
2285 This would be optimizable with startbranch=5, first=5, last=16, tail=16
2286 and should turn into:
2287 
2288    1: CURLYM[1] {1,32767}(18)
2289    5:   TRIE(16)
2290 	[Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
2291 	  <ac>
2292 	  <ad>
2293 	  <ab>
2294   16:   SUCCEED(0)
2295   17:   NOTHING(18)
2296   18: END(0)
2297 
2298 Cases where tail != last would be like /(?foo|bar)baz/:
2299 
2300    1: BRANCH(4)
2301    2:   EXACT <foo>(8)
2302    4: BRANCH(7)
2303    5:   EXACT <bar>(8)
2304    7: TAIL(8)
2305    8: EXACT <baz>(10)
2306   10: END(0)
2307 
2308 which would be optimizable with startbranch=1, first=1, last=7, tail=8
2309 and would end up looking like:
2310 
2311     1: TRIE(8)
2312       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
2313 	<foo>
2314 	<bar>
2315    7: TAIL(8)
2316    8: EXACT <baz>(10)
2317   10: END(0)
2318 
2319     d = uvchr_to_utf8_flags(d, uv, 0);
2320 
2321 is the recommended Unicode-aware way of saying
2322 
2323     *(d++) = uv;
2324 */
2325 
2326 #define TRIE_STORE_REVCHAR(val)                                            \
2327     STMT_START {                                                           \
2328 	if (UTF) {							   \
2329             SV *zlopp = newSV(UTF8_MAXBYTES);				   \
2330 	    unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);	   \
2331             unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
2332 	    SvCUR_set(zlopp, kapow - flrbbbbb);				   \
2333 	    SvPOK_on(zlopp);						   \
2334 	    SvUTF8_on(zlopp);						   \
2335 	    av_push(revcharmap, zlopp);					   \
2336 	} else {							   \
2337             char ooooff = (char)val;                                           \
2338 	    av_push(revcharmap, newSVpvn(&ooooff, 1));			   \
2339 	}								   \
2340         } STMT_END
2341 
2342 /* This gets the next character from the input, folding it if not already
2343  * folded. */
2344 #define TRIE_READ_CHAR STMT_START {                                           \
2345     wordlen++;                                                                \
2346     if ( UTF ) {                                                              \
2347         /* if it is UTF then it is either already folded, or does not need    \
2348          * folding */                                                         \
2349         uvc = valid_utf8_to_uvchr( (const U8*) uc, &len);                     \
2350     }                                                                         \
2351     else if (folder == PL_fold_latin1) {                                      \
2352         /* This folder implies Unicode rules, which in the range expressible  \
2353          *  by not UTF is the lower case, with the two exceptions, one of     \
2354          *  which should have been taken care of before calling this */       \
2355         assert(*uc != LATIN_SMALL_LETTER_SHARP_S);                            \
2356         uvc = toLOWER_L1(*uc);                                                \
2357         if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU;         \
2358         len = 1;                                                              \
2359     } else {                                                                  \
2360         /* raw data, will be folded later if needed */                        \
2361         uvc = (U32)*uc;                                                       \
2362         len = 1;                                                              \
2363     }                                                                         \
2364 } STMT_END
2365 
2366 
2367 
2368 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
2369     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
2370 	U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
2371 	Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
2372     }                                                           \
2373     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
2374     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
2375     TRIE_LIST_CUR( state )++;                                   \
2376 } STMT_END
2377 
2378 #define TRIE_LIST_NEW(state) STMT_START {                       \
2379     Newxz( trie->states[ state ].trans.list,               \
2380 	4, reg_trie_trans_le );                                 \
2381      TRIE_LIST_CUR( state ) = 1;                                \
2382      TRIE_LIST_LEN( state ) = 4;                                \
2383 } STMT_END
2384 
2385 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
2386     U16 dupe= trie->states[ state ].wordnum;                    \
2387     regnode * const noper_next = regnext( noper );              \
2388                                                                 \
2389     DEBUG_r({                                                   \
2390         /* store the word for dumping */                        \
2391         SV* tmp;                                                \
2392         if (OP(noper) != NOTHING)                               \
2393             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);	\
2394         else                                                    \
2395             tmp = newSVpvn_utf8( "", 0, UTF );			\
2396         av_push( trie_words, tmp );                             \
2397     });                                                         \
2398                                                                 \
2399     curword++;                                                  \
2400     trie->wordinfo[curword].prev   = 0;                         \
2401     trie->wordinfo[curword].len    = wordlen;                   \
2402     trie->wordinfo[curword].accept = state;                     \
2403                                                                 \
2404     if ( noper_next < tail ) {                                  \
2405         if (!trie->jump)                                        \
2406             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2407                                                  sizeof(U16) ); \
2408         trie->jump[curword] = (U16)(noper_next - convert);      \
2409         if (!jumper)                                            \
2410             jumper = noper_next;                                \
2411         if (!nextbranch)                                        \
2412             nextbranch= regnext(cur);                           \
2413     }                                                           \
2414                                                                 \
2415     if ( dupe ) {                                               \
2416         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
2417         /* chain, so that when the bits of chain are later    */\
2418         /* linked together, the dups appear in the chain      */\
2419 	trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2420 	trie->wordinfo[dupe].prev = curword;                    \
2421     } else {                                                    \
2422         /* we haven't inserted this word yet.                */ \
2423         trie->states[ state ].wordnum = curword;                \
2424     }                                                           \
2425 } STMT_END
2426 
2427 
2428 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)		\
2429      ( ( base + charid >=  ucharcount					\
2430          && base + charid < ubound					\
2431          && state == trie->trans[ base - ucharcount + charid ].check	\
2432          && trie->trans[ base - ucharcount + charid ].next )		\
2433            ? trie->trans[ base - ucharcount + charid ].next		\
2434            : ( state==1 ? special : 0 )					\
2435       )
2436 
2437 #define MADE_TRIE       1
2438 #define MADE_JUMP_TRIE  2
2439 #define MADE_EXACT_TRIE 4
2440 
2441 STATIC I32
2442 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2443                   regnode *first, regnode *last, regnode *tail,
2444                   U32 word_count, U32 flags, U32 depth)
2445 {
2446     /* first pass, loop through and scan words */
2447     reg_trie_data *trie;
2448     HV *widecharmap = NULL;
2449     AV *revcharmap = newAV();
2450     regnode *cur;
2451     STRLEN len = 0;
2452     UV uvc = 0;
2453     U16 curword = 0;
2454     U32 next_alloc = 0;
2455     regnode *jumper = NULL;
2456     regnode *nextbranch = NULL;
2457     regnode *convert = NULL;
2458     U32 *prev_states; /* temp array mapping each state to previous one */
2459     /* we just use folder as a flag in utf8 */
2460     const U8 * folder = NULL;
2461 
2462 #ifdef DEBUGGING
2463     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
2464     AV *trie_words = NULL;
2465     /* along with revcharmap, this only used during construction but both are
2466      * useful during debugging so we store them in the struct when debugging.
2467      */
2468 #else
2469     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
2470     STRLEN trie_charcount=0;
2471 #endif
2472     SV *re_trie_maxbuff;
2473     GET_RE_DEBUG_FLAGS_DECL;
2474 
2475     PERL_ARGS_ASSERT_MAKE_TRIE;
2476 #ifndef DEBUGGING
2477     PERL_UNUSED_ARG(depth);
2478 #endif
2479 
2480     switch (flags) {
2481         case EXACT: case EXACTL: break;
2482 	case EXACTFA:
2483         case EXACTFU_SS:
2484 	case EXACTFU:
2485 	case EXACTFLU8: folder = PL_fold_latin1; break;
2486 	case EXACTF:  folder = PL_fold; break;
2487         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
2488     }
2489 
2490     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
2491     trie->refcount = 1;
2492     trie->startstate = 1;
2493     trie->wordcount = word_count;
2494     RExC_rxi->data->data[ data_slot ] = (void*)trie;
2495     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2496     if (flags == EXACT || flags == EXACTL)
2497 	trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2498     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2499                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
2500 
2501     DEBUG_r({
2502         trie_words = newAV();
2503     });
2504 
2505     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2506     assert(re_trie_maxbuff);
2507     if (!SvIOK(re_trie_maxbuff)) {
2508         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2509     }
2510     DEBUG_TRIE_COMPILE_r({
2511         Perl_re_indentf( aTHX_
2512           "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2513           depth+1,
2514           REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2515           REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2516     });
2517 
2518    /* Find the node we are going to overwrite */
2519     if ( first == startbranch && OP( last ) != BRANCH ) {
2520         /* whole branch chain */
2521         convert = first;
2522     } else {
2523         /* branch sub-chain */
2524         convert = NEXTOPER( first );
2525     }
2526 
2527     /*  -- First loop and Setup --
2528 
2529        We first traverse the branches and scan each word to determine if it
2530        contains widechars, and how many unique chars there are, this is
2531        important as we have to build a table with at least as many columns as we
2532        have unique chars.
2533 
2534        We use an array of integers to represent the character codes 0..255
2535        (trie->charmap) and we use a an HV* to store Unicode characters. We use
2536        the native representation of the character value as the key and IV's for
2537        the coded index.
2538 
2539        *TODO* If we keep track of how many times each character is used we can
2540        remap the columns so that the table compression later on is more
2541        efficient in terms of memory by ensuring the most common value is in the
2542        middle and the least common are on the outside.  IMO this would be better
2543        than a most to least common mapping as theres a decent chance the most
2544        common letter will share a node with the least common, meaning the node
2545        will not be compressible. With a middle is most common approach the worst
2546        case is when we have the least common nodes twice.
2547 
2548      */
2549 
2550     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2551         regnode *noper = NEXTOPER( cur );
2552         const U8 *uc;
2553         const U8 *e;
2554         int foldlen = 0;
2555         U32 wordlen      = 0;         /* required init */
2556         STRLEN minchars = 0;
2557         STRLEN maxchars = 0;
2558         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2559                                                bitmap?*/
2560 
2561         if (OP(noper) == NOTHING) {
2562             regnode *noper_next= regnext(noper);
2563             if (noper_next < tail)
2564                 noper= noper_next;
2565         }
2566 
2567         if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) {
2568             uc= (U8*)STRING(noper);
2569             e= uc + STR_LEN(noper);
2570         } else {
2571             trie->minlen= 0;
2572             continue;
2573         }
2574 
2575 
2576         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2577             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2578                                           regardless of encoding */
2579             if (OP( noper ) == EXACTFU_SS) {
2580                 /* false positives are ok, so just set this */
2581                 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2582             }
2583         }
2584         for ( ; uc < e ; uc += len ) {  /* Look at each char in the current
2585                                            branch */
2586             TRIE_CHARCOUNT(trie)++;
2587             TRIE_READ_CHAR;
2588 
2589             /* TRIE_READ_CHAR returns the current character, or its fold if /i
2590              * is in effect.  Under /i, this character can match itself, or
2591              * anything that folds to it.  If not under /i, it can match just
2592              * itself.  Most folds are 1-1, for example k, K, and KELVIN SIGN
2593              * all fold to k, and all are single characters.   But some folds
2594              * expand to more than one character, so for example LATIN SMALL
2595              * LIGATURE FFI folds to the three character sequence 'ffi'.  If
2596              * the string beginning at 'uc' is 'ffi', it could be matched by
2597              * three characters, or just by the one ligature character. (It
2598              * could also be matched by two characters: LATIN SMALL LIGATURE FF
2599              * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2600              * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2601              * match.)  The trie needs to know the minimum and maximum number
2602              * of characters that could match so that it can use size alone to
2603              * quickly reject many match attempts.  The max is simple: it is
2604              * the number of folded characters in this branch (since a fold is
2605              * never shorter than what folds to it. */
2606 
2607             maxchars++;
2608 
2609             /* And the min is equal to the max if not under /i (indicated by
2610              * 'folder' being NULL), or there are no multi-character folds.  If
2611              * there is a multi-character fold, the min is incremented just
2612              * once, for the character that folds to the sequence.  Each
2613              * character in the sequence needs to be added to the list below of
2614              * characters in the trie, but we count only the first towards the
2615              * min number of characters needed.  This is done through the
2616              * variable 'foldlen', which is returned by the macros that look
2617              * for these sequences as the number of bytes the sequence
2618              * occupies.  Each time through the loop, we decrement 'foldlen' by
2619              * how many bytes the current char occupies.  Only when it reaches
2620              * 0 do we increment 'minchars' or look for another multi-character
2621              * sequence. */
2622             if (folder == NULL) {
2623                 minchars++;
2624             }
2625             else if (foldlen > 0) {
2626                 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2627             }
2628             else {
2629                 minchars++;
2630 
2631                 /* See if *uc is the beginning of a multi-character fold.  If
2632                  * so, we decrement the length remaining to look at, to account
2633                  * for the current character this iteration.  (We can use 'uc'
2634                  * instead of the fold returned by TRIE_READ_CHAR because for
2635                  * non-UTF, the latin1_safe macro is smart enough to account
2636                  * for all the unfolded characters, and because for UTF, the
2637                  * string will already have been folded earlier in the
2638                  * compilation process */
2639                 if (UTF) {
2640                     if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2641                         foldlen -= UTF8SKIP(uc);
2642                     }
2643                 }
2644                 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2645                     foldlen--;
2646                 }
2647             }
2648 
2649             /* The current character (and any potential folds) should be added
2650              * to the possible matching characters for this position in this
2651              * branch */
2652             if ( uvc < 256 ) {
2653                 if ( folder ) {
2654                     U8 folded= folder[ (U8) uvc ];
2655                     if ( !trie->charmap[ folded ] ) {
2656                         trie->charmap[ folded ]=( ++trie->uniquecharcount );
2657                         TRIE_STORE_REVCHAR( folded );
2658                     }
2659                 }
2660                 if ( !trie->charmap[ uvc ] ) {
2661                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2662                     TRIE_STORE_REVCHAR( uvc );
2663                 }
2664                 if ( set_bit ) {
2665 		    /* store the codepoint in the bitmap, and its folded
2666 		     * equivalent. */
2667                     TRIE_BITMAP_SET(trie, uvc);
2668 
2669 		    /* store the folded codepoint */
2670                     if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
2671 
2672 		    if ( !UTF ) {
2673 			/* store first byte of utf8 representation of
2674 			   variant codepoints */
2675 			if (! UVCHR_IS_INVARIANT(uvc)) {
2676 			    TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
2677 			}
2678 		    }
2679                     set_bit = 0; /* We've done our bit :-) */
2680                 }
2681             } else {
2682 
2683                 /* XXX We could come up with the list of code points that fold
2684                  * to this using PL_utf8_foldclosures, except not for
2685                  * multi-char folds, as there may be multiple combinations
2686                  * there that could work, which needs to wait until runtime to
2687                  * resolve (The comment about LIGATURE FFI above is such an
2688                  * example */
2689 
2690                 SV** svpp;
2691                 if ( !widecharmap )
2692                     widecharmap = newHV();
2693 
2694                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2695 
2696                 if ( !svpp )
2697                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
2698 
2699                 if ( !SvTRUE( *svpp ) ) {
2700                     sv_setiv( *svpp, ++trie->uniquecharcount );
2701                     TRIE_STORE_REVCHAR(uvc);
2702                 }
2703             }
2704         } /* end loop through characters in this branch of the trie */
2705 
2706         /* We take the min and max for this branch and combine to find the min
2707          * and max for all branches processed so far */
2708         if( cur == first ) {
2709             trie->minlen = minchars;
2710             trie->maxlen = maxchars;
2711         } else if (minchars < trie->minlen) {
2712             trie->minlen = minchars;
2713         } else if (maxchars > trie->maxlen) {
2714             trie->maxlen = maxchars;
2715         }
2716     } /* end first pass */
2717     DEBUG_TRIE_COMPILE_r(
2718         Perl_re_indentf( aTHX_
2719                 "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2720                 depth+1,
2721                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2722 		(int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2723 		(int)trie->minlen, (int)trie->maxlen )
2724     );
2725 
2726     /*
2727         We now know what we are dealing with in terms of unique chars and
2728         string sizes so we can calculate how much memory a naive
2729         representation using a flat table  will take. If it's over a reasonable
2730         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2731         conservative but potentially much slower representation using an array
2732         of lists.
2733 
2734         At the end we convert both representations into the same compressed
2735         form that will be used in regexec.c for matching with. The latter
2736         is a form that cannot be used to construct with but has memory
2737         properties similar to the list form and access properties similar
2738         to the table form making it both suitable for fast searches and
2739         small enough that its feasable to store for the duration of a program.
2740 
2741         See the comment in the code where the compressed table is produced
2742         inplace from the flat tabe representation for an explanation of how
2743         the compression works.
2744 
2745     */
2746 
2747 
2748     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2749     prev_states[1] = 0;
2750 
2751     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2752                                                     > SvIV(re_trie_maxbuff) )
2753     {
2754         /*
2755             Second Pass -- Array Of Lists Representation
2756 
2757             Each state will be represented by a list of charid:state records
2758             (reg_trie_trans_le) the first such element holds the CUR and LEN
2759             points of the allocated array. (See defines above).
2760 
2761             We build the initial structure using the lists, and then convert
2762             it into the compressed table form which allows faster lookups
2763             (but cant be modified once converted).
2764         */
2765 
2766         STRLEN transcount = 1;
2767 
2768         DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_  "Compiling trie using list compiler\n",
2769             depth+1));
2770 
2771 	trie->states = (reg_trie_state *)
2772 	    PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2773 				  sizeof(reg_trie_state) );
2774         TRIE_LIST_NEW(1);
2775         next_alloc = 2;
2776 
2777         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2778 
2779             regnode *noper   = NEXTOPER( cur );
2780 	    U32 state        = 1;         /* required init */
2781 	    U16 charid       = 0;         /* sanity init */
2782             U32 wordlen      = 0;         /* required init */
2783 
2784             if (OP(noper) == NOTHING) {
2785                 regnode *noper_next= regnext(noper);
2786                 if (noper_next < tail)
2787                     noper= noper_next;
2788             }
2789 
2790             if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) {
2791                 const U8 *uc= (U8*)STRING(noper);
2792                 const U8 *e= uc + STR_LEN(noper);
2793 
2794                 for ( ; uc < e ; uc += len ) {
2795 
2796                     TRIE_READ_CHAR;
2797 
2798                     if ( uvc < 256 ) {
2799                         charid = trie->charmap[ uvc ];
2800 		    } else {
2801                         SV** const svpp = hv_fetch( widecharmap,
2802                                                     (char*)&uvc,
2803                                                     sizeof( UV ),
2804                                                     0);
2805                         if ( !svpp ) {
2806                             charid = 0;
2807                         } else {
2808                             charid=(U16)SvIV( *svpp );
2809                         }
2810 		    }
2811                     /* charid is now 0 if we dont know the char read, or
2812                      * nonzero if we do */
2813                     if ( charid ) {
2814 
2815                         U16 check;
2816                         U32 newstate = 0;
2817 
2818                         charid--;
2819                         if ( !trie->states[ state ].trans.list ) {
2820                             TRIE_LIST_NEW( state );
2821 			}
2822                         for ( check = 1;
2823                               check <= TRIE_LIST_USED( state );
2824                               check++ )
2825                         {
2826                             if ( TRIE_LIST_ITEM( state, check ).forid
2827                                                                     == charid )
2828                             {
2829                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2830                                 break;
2831                             }
2832                         }
2833                         if ( ! newstate ) {
2834                             newstate = next_alloc++;
2835 			    prev_states[newstate] = state;
2836                             TRIE_LIST_PUSH( state, charid, newstate );
2837                             transcount++;
2838                         }
2839                         state = newstate;
2840                     } else {
2841                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2842 		    }
2843 		}
2844 	    }
2845             TRIE_HANDLE_WORD(state);
2846 
2847         } /* end second pass */
2848 
2849         /* next alloc is the NEXT state to be allocated */
2850         trie->statecount = next_alloc;
2851         trie->states = (reg_trie_state *)
2852 	    PerlMemShared_realloc( trie->states,
2853 				   next_alloc
2854 				   * sizeof(reg_trie_state) );
2855 
2856         /* and now dump it out before we compress it */
2857         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2858 							 revcharmap, next_alloc,
2859 							 depth+1)
2860         );
2861 
2862         trie->trans = (reg_trie_trans *)
2863 	    PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
2864         {
2865             U32 state;
2866             U32 tp = 0;
2867             U32 zp = 0;
2868 
2869 
2870             for( state=1 ; state < next_alloc ; state ++ ) {
2871                 U32 base=0;
2872 
2873                 /*
2874                 DEBUG_TRIE_COMPILE_MORE_r(
2875                     Perl_re_printf( aTHX_  "tp: %d zp: %d ",tp,zp)
2876                 );
2877                 */
2878 
2879                 if (trie->states[state].trans.list) {
2880                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2881                     U16 maxid=minid;
2882 		    U16 idx;
2883 
2884                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2885 			const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2886 			if ( forid < minid ) {
2887 			    minid=forid;
2888 			} else if ( forid > maxid ) {
2889 			    maxid=forid;
2890 			}
2891                     }
2892                     if ( transcount < tp + maxid - minid + 1) {
2893                         transcount *= 2;
2894 			trie->trans = (reg_trie_trans *)
2895 			    PerlMemShared_realloc( trie->trans,
2896 						     transcount
2897 						     * sizeof(reg_trie_trans) );
2898                         Zero( trie->trans + (transcount / 2),
2899                               transcount / 2,
2900                               reg_trie_trans );
2901                     }
2902                     base = trie->uniquecharcount + tp - minid;
2903                     if ( maxid == minid ) {
2904                         U32 set = 0;
2905                         for ( ; zp < tp ; zp++ ) {
2906                             if ( ! trie->trans[ zp ].next ) {
2907                                 base = trie->uniquecharcount + zp - minid;
2908                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2909                                                                    1).newstate;
2910                                 trie->trans[ zp ].check = state;
2911                                 set = 1;
2912                                 break;
2913                             }
2914                         }
2915                         if ( !set ) {
2916                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2917                                                                    1).newstate;
2918                             trie->trans[ tp ].check = state;
2919                             tp++;
2920                             zp = tp;
2921                         }
2922                     } else {
2923                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2924                             const U32 tid = base
2925                                            - trie->uniquecharcount
2926                                            + TRIE_LIST_ITEM( state, idx ).forid;
2927                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2928                                                                 idx ).newstate;
2929                             trie->trans[ tid ].check = state;
2930                         }
2931                         tp += ( maxid - minid + 1 );
2932                     }
2933                     Safefree(trie->states[ state ].trans.list);
2934                 }
2935                 /*
2936                 DEBUG_TRIE_COMPILE_MORE_r(
2937                     Perl_re_printf( aTHX_  " base: %d\n",base);
2938                 );
2939                 */
2940                 trie->states[ state ].trans.base=base;
2941             }
2942             trie->lasttrans = tp + 1;
2943         }
2944     } else {
2945         /*
2946            Second Pass -- Flat Table Representation.
2947 
2948            we dont use the 0 slot of either trans[] or states[] so we add 1 to
2949            each.  We know that we will need Charcount+1 trans at most to store
2950            the data (one row per char at worst case) So we preallocate both
2951            structures assuming worst case.
2952 
2953            We then construct the trie using only the .next slots of the entry
2954            structs.
2955 
2956            We use the .check field of the first entry of the node temporarily
2957            to make compression both faster and easier by keeping track of how
2958            many non zero fields are in the node.
2959 
2960            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2961            transition.
2962 
2963            There are two terms at use here: state as a TRIE_NODEIDX() which is
2964            a number representing the first entry of the node, and state as a
2965            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2966            and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2967            if there are 2 entrys per node. eg:
2968 
2969              A B       A B
2970           1. 2 4    1. 3 7
2971           2. 0 3    3. 0 5
2972           3. 0 0    5. 0 0
2973           4. 0 0    7. 0 0
2974 
2975            The table is internally in the right hand, idx form. However as we
2976            also have to deal with the states array which is indexed by nodenum
2977            we have to use TRIE_NODENUM() to convert.
2978 
2979         */
2980         DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_  "Compiling trie using table compiler\n",
2981             depth+1));
2982 
2983 	trie->trans = (reg_trie_trans *)
2984 	    PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2985 				  * trie->uniquecharcount + 1,
2986 				  sizeof(reg_trie_trans) );
2987         trie->states = (reg_trie_state *)
2988 	    PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2989 				  sizeof(reg_trie_state) );
2990         next_alloc = trie->uniquecharcount + 1;
2991 
2992 
2993         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2994 
2995             regnode *noper   = NEXTOPER( cur );
2996 
2997             U32 state        = 1;         /* required init */
2998 
2999             U16 charid       = 0;         /* sanity init */
3000             U32 accept_state = 0;         /* sanity init */
3001 
3002             U32 wordlen      = 0;         /* required init */
3003 
3004             if (OP(noper) == NOTHING) {
3005                 regnode *noper_next= regnext(noper);
3006                 if (noper_next < tail)
3007                     noper= noper_next;
3008             }
3009 
3010             if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) {
3011                 const U8 *uc= (U8*)STRING(noper);
3012                 const U8 *e= uc + STR_LEN(noper);
3013 
3014                 for ( ; uc < e ; uc += len ) {
3015 
3016                     TRIE_READ_CHAR;
3017 
3018                     if ( uvc < 256 ) {
3019                         charid = trie->charmap[ uvc ];
3020                     } else {
3021                         SV* const * const svpp = hv_fetch( widecharmap,
3022                                                            (char*)&uvc,
3023                                                            sizeof( UV ),
3024                                                            0);
3025                         charid = svpp ? (U16)SvIV(*svpp) : 0;
3026                     }
3027                     if ( charid ) {
3028                         charid--;
3029                         if ( !trie->trans[ state + charid ].next ) {
3030                             trie->trans[ state + charid ].next = next_alloc;
3031                             trie->trans[ state ].check++;
3032 			    prev_states[TRIE_NODENUM(next_alloc)]
3033 				    = TRIE_NODENUM(state);
3034                             next_alloc += trie->uniquecharcount;
3035                         }
3036                         state = trie->trans[ state + charid ].next;
3037                     } else {
3038                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
3039                     }
3040                     /* charid is now 0 if we dont know the char read, or
3041                      * nonzero if we do */
3042                 }
3043             }
3044             accept_state = TRIE_NODENUM( state );
3045             TRIE_HANDLE_WORD(accept_state);
3046 
3047         } /* end second pass */
3048 
3049         /* and now dump it out before we compress it */
3050         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
3051 							  revcharmap,
3052 							  next_alloc, depth+1));
3053 
3054         {
3055         /*
3056            * Inplace compress the table.*
3057 
3058            For sparse data sets the table constructed by the trie algorithm will
3059            be mostly 0/FAIL transitions or to put it another way mostly empty.
3060            (Note that leaf nodes will not contain any transitions.)
3061 
3062            This algorithm compresses the tables by eliminating most such
3063            transitions, at the cost of a modest bit of extra work during lookup:
3064 
3065            - Each states[] entry contains a .base field which indicates the
3066            index in the state[] array wheres its transition data is stored.
3067 
3068            - If .base is 0 there are no valid transitions from that node.
3069 
3070            - If .base is nonzero then charid is added to it to find an entry in
3071            the trans array.
3072 
3073            -If trans[states[state].base+charid].check!=state then the
3074            transition is taken to be a 0/Fail transition. Thus if there are fail
3075            transitions at the front of the node then the .base offset will point
3076            somewhere inside the previous nodes data (or maybe even into a node
3077            even earlier), but the .check field determines if the transition is
3078            valid.
3079 
3080            XXX - wrong maybe?
3081            The following process inplace converts the table to the compressed
3082            table: We first do not compress the root node 1,and mark all its
3083            .check pointers as 1 and set its .base pointer as 1 as well. This
3084            allows us to do a DFA construction from the compressed table later,
3085            and ensures that any .base pointers we calculate later are greater
3086            than 0.
3087 
3088            - We set 'pos' to indicate the first entry of the second node.
3089 
3090            - We then iterate over the columns of the node, finding the first and
3091            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
3092            and set the .check pointers accordingly, and advance pos
3093            appropriately and repreat for the next node. Note that when we copy
3094            the next pointers we have to convert them from the original
3095            NODEIDX form to NODENUM form as the former is not valid post
3096            compression.
3097 
3098            - If a node has no transitions used we mark its base as 0 and do not
3099            advance the pos pointer.
3100 
3101            - If a node only has one transition we use a second pointer into the
3102            structure to fill in allocated fail transitions from other states.
3103            This pointer is independent of the main pointer and scans forward
3104            looking for null transitions that are allocated to a state. When it
3105            finds one it writes the single transition into the "hole".  If the
3106            pointer doesnt find one the single transition is appended as normal.
3107 
3108            - Once compressed we can Renew/realloc the structures to release the
3109            excess space.
3110 
3111            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
3112            specifically Fig 3.47 and the associated pseudocode.
3113 
3114            demq
3115         */
3116         const U32 laststate = TRIE_NODENUM( next_alloc );
3117 	U32 state, charid;
3118         U32 pos = 0, zp=0;
3119         trie->statecount = laststate;
3120 
3121         for ( state = 1 ; state < laststate ; state++ ) {
3122             U8 flag = 0;
3123 	    const U32 stateidx = TRIE_NODEIDX( state );
3124 	    const U32 o_used = trie->trans[ stateidx ].check;
3125 	    U32 used = trie->trans[ stateidx ].check;
3126             trie->trans[ stateidx ].check = 0;
3127 
3128             for ( charid = 0;
3129                   used && charid < trie->uniquecharcount;
3130                   charid++ )
3131             {
3132                 if ( flag || trie->trans[ stateidx + charid ].next ) {
3133                     if ( trie->trans[ stateidx + charid ].next ) {
3134                         if (o_used == 1) {
3135                             for ( ; zp < pos ; zp++ ) {
3136                                 if ( ! trie->trans[ zp ].next ) {
3137                                     break;
3138                                 }
3139                             }
3140                             trie->states[ state ].trans.base
3141                                                     = zp
3142                                                       + trie->uniquecharcount
3143                                                       - charid ;
3144                             trie->trans[ zp ].next
3145                                 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
3146                                                              + charid ].next );
3147                             trie->trans[ zp ].check = state;
3148                             if ( ++zp > pos ) pos = zp;
3149                             break;
3150                         }
3151                         used--;
3152                     }
3153                     if ( !flag ) {
3154                         flag = 1;
3155                         trie->states[ state ].trans.base
3156                                        = pos + trie->uniquecharcount - charid ;
3157                     }
3158                     trie->trans[ pos ].next
3159                         = SAFE_TRIE_NODENUM(
3160                                        trie->trans[ stateidx + charid ].next );
3161                     trie->trans[ pos ].check = state;
3162                     pos++;
3163                 }
3164             }
3165         }
3166         trie->lasttrans = pos + 1;
3167         trie->states = (reg_trie_state *)
3168 	    PerlMemShared_realloc( trie->states, laststate
3169 				   * sizeof(reg_trie_state) );
3170         DEBUG_TRIE_COMPILE_MORE_r(
3171             Perl_re_indentf( aTHX_  "Alloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
3172                 depth+1,
3173                 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
3174                        + 1 ),
3175                 (IV)next_alloc,
3176                 (IV)pos,
3177                 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
3178             );
3179 
3180         } /* end table compress */
3181     }
3182     DEBUG_TRIE_COMPILE_MORE_r(
3183             Perl_re_indentf( aTHX_  "Statecount:%"UVxf" Lasttrans:%"UVxf"\n",
3184                 depth+1,
3185                 (UV)trie->statecount,
3186                 (UV)trie->lasttrans)
3187     );
3188     /* resize the trans array to remove unused space */
3189     trie->trans = (reg_trie_trans *)
3190 	PerlMemShared_realloc( trie->trans, trie->lasttrans
3191 			       * sizeof(reg_trie_trans) );
3192 
3193     {   /* Modify the program and insert the new TRIE node */
3194         U8 nodetype =(U8)(flags & 0xFF);
3195         char *str=NULL;
3196 
3197 #ifdef DEBUGGING
3198         regnode *optimize = NULL;
3199 #ifdef RE_TRACK_PATTERN_OFFSETS
3200 
3201         U32 mjd_offset = 0;
3202         U32 mjd_nodelen = 0;
3203 #endif /* RE_TRACK_PATTERN_OFFSETS */
3204 #endif /* DEBUGGING */
3205         /*
3206            This means we convert either the first branch or the first Exact,
3207            depending on whether the thing following (in 'last') is a branch
3208            or not and whther first is the startbranch (ie is it a sub part of
3209            the alternation or is it the whole thing.)
3210            Assuming its a sub part we convert the EXACT otherwise we convert
3211            the whole branch sequence, including the first.
3212          */
3213         /* Find the node we are going to overwrite */
3214         if ( first != startbranch || OP( last ) == BRANCH ) {
3215             /* branch sub-chain */
3216             NEXT_OFF( first ) = (U16)(last - first);
3217 #ifdef RE_TRACK_PATTERN_OFFSETS
3218             DEBUG_r({
3219                 mjd_offset= Node_Offset((convert));
3220                 mjd_nodelen= Node_Length((convert));
3221             });
3222 #endif
3223             /* whole branch chain */
3224         }
3225 #ifdef RE_TRACK_PATTERN_OFFSETS
3226         else {
3227             DEBUG_r({
3228                 const  regnode *nop = NEXTOPER( convert );
3229                 mjd_offset= Node_Offset((nop));
3230                 mjd_nodelen= Node_Length((nop));
3231             });
3232         }
3233         DEBUG_OPTIMISE_r(
3234             Perl_re_indentf( aTHX_  "MJD offset:%"UVuf" MJD length:%"UVuf"\n",
3235                 depth+1,
3236                 (UV)mjd_offset, (UV)mjd_nodelen)
3237         );
3238 #endif
3239         /* But first we check to see if there is a common prefix we can
3240            split out as an EXACT and put in front of the TRIE node.  */
3241         trie->startstate= 1;
3242         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
3243             U32 state;
3244             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
3245                 U32 ofs = 0;
3246                 I32 idx = -1;
3247                 U32 count = 0;
3248                 const U32 base = trie->states[ state ].trans.base;
3249 
3250                 if ( trie->states[state].wordnum )
3251                         count = 1;
3252 
3253                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
3254                     if ( ( base + ofs >= trie->uniquecharcount ) &&
3255                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
3256                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
3257                     {
3258                         if ( ++count > 1 ) {
3259                             SV **tmp = av_fetch( revcharmap, ofs, 0);
3260 			    const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
3261                             if ( state == 1 ) break;
3262                             if ( count == 2 ) {
3263                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
3264                                 DEBUG_OPTIMISE_r(
3265                                     Perl_re_indentf( aTHX_  "New Start State=%"UVuf" Class: [",
3266                                         depth+1,
3267                                         (UV)state));
3268 				if (idx >= 0) {
3269 				    SV ** const tmp = av_fetch( revcharmap, idx, 0);
3270 				    const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
3271 
3272                                     TRIE_BITMAP_SET(trie,*ch);
3273                                     if ( folder )
3274                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
3275                                     DEBUG_OPTIMISE_r(
3276                                         Perl_re_printf( aTHX_  "%s", (char*)ch)
3277                                     );
3278 				}
3279 			    }
3280 			    TRIE_BITMAP_SET(trie,*ch);
3281 			    if ( folder )
3282 				TRIE_BITMAP_SET(trie,folder[ *ch ]);
3283                             DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch));
3284 			}
3285                         idx = ofs;
3286 		    }
3287                 }
3288                 if ( count == 1 ) {
3289                     SV **tmp = av_fetch( revcharmap, idx, 0);
3290                     STRLEN len;
3291                     char *ch = SvPV( *tmp, len );
3292                     DEBUG_OPTIMISE_r({
3293                         SV *sv=sv_newmortal();
3294                         Perl_re_indentf( aTHX_  "Prefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
3295                             depth+1,
3296                             (UV)state, (UV)idx,
3297                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
3298 	                        PL_colors[0], PL_colors[1],
3299 	                        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
3300 	                        PERL_PV_ESCAPE_FIRSTCHAR
3301                             )
3302                         );
3303                     });
3304                     if ( state==1 ) {
3305                         OP( convert ) = nodetype;
3306                         str=STRING(convert);
3307                         STR_LEN(convert)=0;
3308                     }
3309                     STR_LEN(convert) += len;
3310                     while (len--)
3311                         *str++ = *ch++;
3312 		} else {
3313 #ifdef DEBUGGING
3314 		    if (state>1)
3315                         DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "]\n"));
3316 #endif
3317 		    break;
3318 		}
3319 	    }
3320 	    trie->prefixlen = (state-1);
3321             if (str) {
3322                 regnode *n = convert+NODE_SZ_STR(convert);
3323                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
3324                 trie->startstate = state;
3325                 trie->minlen -= (state - 1);
3326                 trie->maxlen -= (state - 1);
3327 #ifdef DEBUGGING
3328                /* At least the UNICOS C compiler choked on this
3329                 * being argument to DEBUG_r(), so let's just have
3330                 * it right here. */
3331                if (
3332 #ifdef PERL_EXT_RE_BUILD
3333                    1
3334 #else
3335                    DEBUG_r_TEST
3336 #endif
3337                    ) {
3338                    regnode *fix = convert;
3339                    U32 word = trie->wordcount;
3340                    mjd_nodelen++;
3341                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
3342                    while( ++fix < n ) {
3343                        Set_Node_Offset_Length(fix, 0, 0);
3344                    }
3345                    while (word--) {
3346                        SV ** const tmp = av_fetch( trie_words, word, 0 );
3347                        if (tmp) {
3348                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
3349                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
3350                            else
3351                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
3352                        }
3353                    }
3354                }
3355 #endif
3356                 if (trie->maxlen) {
3357                     convert = n;
3358 		} else {
3359                     NEXT_OFF(convert) = (U16)(tail - convert);
3360                     DEBUG_r(optimize= n);
3361                 }
3362             }
3363         }
3364         if (!jumper)
3365             jumper = last;
3366         if ( trie->maxlen ) {
3367 	    NEXT_OFF( convert ) = (U16)(tail - convert);
3368 	    ARG_SET( convert, data_slot );
3369 	    /* Store the offset to the first unabsorbed branch in
3370 	       jump[0], which is otherwise unused by the jump logic.
3371 	       We use this when dumping a trie and during optimisation. */
3372 	    if (trie->jump)
3373 	        trie->jump[0] = (U16)(nextbranch - convert);
3374 
3375             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
3376 	     *   and there is a bitmap
3377 	     *   and the first "jump target" node we found leaves enough room
3378 	     * then convert the TRIE node into a TRIEC node, with the bitmap
3379 	     * embedded inline in the opcode - this is hypothetically faster.
3380 	     */
3381             if ( !trie->states[trie->startstate].wordnum
3382 		 && trie->bitmap
3383 		 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
3384             {
3385                 OP( convert ) = TRIEC;
3386                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
3387                 PerlMemShared_free(trie->bitmap);
3388                 trie->bitmap= NULL;
3389             } else
3390                 OP( convert ) = TRIE;
3391 
3392             /* store the type in the flags */
3393             convert->flags = nodetype;
3394             DEBUG_r({
3395             optimize = convert
3396                       + NODE_STEP_REGNODE
3397                       + regarglen[ OP( convert ) ];
3398             });
3399             /* XXX We really should free up the resource in trie now,
3400                    as we won't use them - (which resources?) dmq */
3401         }
3402         /* needed for dumping*/
3403         DEBUG_r(if (optimize) {
3404             regnode *opt = convert;
3405 
3406             while ( ++opt < optimize) {
3407                 Set_Node_Offset_Length(opt,0,0);
3408             }
3409             /*
3410                 Try to clean up some of the debris left after the
3411                 optimisation.
3412              */
3413             while( optimize < jumper ) {
3414                 mjd_nodelen += Node_Length((optimize));
3415                 OP( optimize ) = OPTIMIZED;
3416                 Set_Node_Offset_Length(optimize,0,0);
3417                 optimize++;
3418             }
3419             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
3420         });
3421     } /* end node insert */
3422 
3423     /*  Finish populating the prev field of the wordinfo array.  Walk back
3424      *  from each accept state until we find another accept state, and if
3425      *  so, point the first word's .prev field at the second word. If the
3426      *  second already has a .prev field set, stop now. This will be the
3427      *  case either if we've already processed that word's accept state,
3428      *  or that state had multiple words, and the overspill words were
3429      *  already linked up earlier.
3430      */
3431     {
3432 	U16 word;
3433 	U32 state;
3434 	U16 prev;
3435 
3436 	for (word=1; word <= trie->wordcount; word++) {
3437 	    prev = 0;
3438 	    if (trie->wordinfo[word].prev)
3439 		continue;
3440 	    state = trie->wordinfo[word].accept;
3441 	    while (state) {
3442 		state = prev_states[state];
3443 		if (!state)
3444 		    break;
3445 		prev = trie->states[state].wordnum;
3446 		if (prev)
3447 		    break;
3448 	    }
3449 	    trie->wordinfo[word].prev = prev;
3450 	}
3451 	Safefree(prev_states);
3452     }
3453 
3454 
3455     /* and now dump out the compressed format */
3456     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
3457 
3458     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
3459 #ifdef DEBUGGING
3460     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
3461     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
3462 #else
3463     SvREFCNT_dec_NN(revcharmap);
3464 #endif
3465     return trie->jump
3466            ? MADE_JUMP_TRIE
3467            : trie->startstate>1
3468              ? MADE_EXACT_TRIE
3469              : MADE_TRIE;
3470 }
3471 
3472 STATIC regnode *
3473 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
3474 {
3475 /* The Trie is constructed and compressed now so we can build a fail array if
3476  * it's needed
3477 
3478    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
3479    3.32 in the
3480    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
3481    Ullman 1985/88
3482    ISBN 0-201-10088-6
3483 
3484    We find the fail state for each state in the trie, this state is the longest
3485    proper suffix of the current state's 'word' that is also a proper prefix of
3486    another word in our trie. State 1 represents the word '' and is thus the
3487    default fail state. This allows the DFA not to have to restart after its
3488    tried and failed a word at a given point, it simply continues as though it
3489    had been matching the other word in the first place.
3490    Consider
3491       'abcdgu'=~/abcdefg|cdgu/
3492    When we get to 'd' we are still matching the first word, we would encounter
3493    'g' which would fail, which would bring us to the state representing 'd' in
3494    the second word where we would try 'g' and succeed, proceeding to match
3495    'cdgu'.
3496  */
3497  /* add a fail transition */
3498     const U32 trie_offset = ARG(source);
3499     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3500     U32 *q;
3501     const U32 ucharcount = trie->uniquecharcount;
3502     const U32 numstates = trie->statecount;
3503     const U32 ubound = trie->lasttrans + ucharcount;
3504     U32 q_read = 0;
3505     U32 q_write = 0;
3506     U32 charid;
3507     U32 base = trie->states[ 1 ].trans.base;
3508     U32 *fail;
3509     reg_ac_data *aho;
3510     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3511     regnode *stclass;
3512     GET_RE_DEBUG_FLAGS_DECL;
3513 
3514     PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
3515     PERL_UNUSED_CONTEXT;
3516 #ifndef DEBUGGING
3517     PERL_UNUSED_ARG(depth);
3518 #endif
3519 
3520     if ( OP(source) == TRIE ) {
3521         struct regnode_1 *op = (struct regnode_1 *)
3522             PerlMemShared_calloc(1, sizeof(struct regnode_1));
3523         StructCopy(source,op,struct regnode_1);
3524         stclass = (regnode *)op;
3525     } else {
3526         struct regnode_charclass *op = (struct regnode_charclass *)
3527             PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3528         StructCopy(source,op,struct regnode_charclass);
3529         stclass = (regnode *)op;
3530     }
3531     OP(stclass)+=2; /* convert the TRIE type to its AHO-CORASICK equivalent */
3532 
3533     ARG_SET( stclass, data_slot );
3534     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3535     RExC_rxi->data->data[ data_slot ] = (void*)aho;
3536     aho->trie=trie_offset;
3537     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3538     Copy( trie->states, aho->states, numstates, reg_trie_state );
3539     Newxz( q, numstates, U32);
3540     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3541     aho->refcount = 1;
3542     fail = aho->fail;
3543     /* initialize fail[0..1] to be 1 so that we always have
3544        a valid final fail state */
3545     fail[ 0 ] = fail[ 1 ] = 1;
3546 
3547     for ( charid = 0; charid < ucharcount ; charid++ ) {
3548 	const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3549 	if ( newstate ) {
3550             q[ q_write ] = newstate;
3551             /* set to point at the root */
3552             fail[ q[ q_write++ ] ]=1;
3553         }
3554     }
3555     while ( q_read < q_write) {
3556 	const U32 cur = q[ q_read++ % numstates ];
3557         base = trie->states[ cur ].trans.base;
3558 
3559         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3560 	    const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3561 	    if (ch_state) {
3562                 U32 fail_state = cur;
3563                 U32 fail_base;
3564                 do {
3565                     fail_state = fail[ fail_state ];
3566                     fail_base = aho->states[ fail_state ].trans.base;
3567                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3568 
3569                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3570                 fail[ ch_state ] = fail_state;
3571                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3572                 {
3573                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
3574                 }
3575                 q[ q_write++ % numstates] = ch_state;
3576             }
3577         }
3578     }
3579     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3580        when we fail in state 1, this allows us to use the
3581        charclass scan to find a valid start char. This is based on the principle
3582        that theres a good chance the string being searched contains lots of stuff
3583        that cant be a start char.
3584      */
3585     fail[ 0 ] = fail[ 1 ] = 0;
3586     DEBUG_TRIE_COMPILE_r({
3587         Perl_re_indentf( aTHX_  "Stclass Failtable (%"UVuf" states): 0",
3588                       depth, (UV)numstates
3589         );
3590         for( q_read=1; q_read<numstates; q_read++ ) {
3591             Perl_re_printf( aTHX_  ", %"UVuf, (UV)fail[q_read]);
3592         }
3593         Perl_re_printf( aTHX_  "\n");
3594     });
3595     Safefree(q);
3596     /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3597     return stclass;
3598 }
3599 
3600 
3601 #define DEBUG_PEEP(str,scan,depth)         \
3602     DEBUG_OPTIMISE_r({if (scan){           \
3603        regnode *Next = regnext(scan);      \
3604        regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);\
3605        Perl_re_indentf( aTHX_  "" str ">%3d: %s (%d)", \
3606            depth, REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),\
3607            Next ? (REG_NODE_NUM(Next)) : 0 );\
3608        DEBUG_SHOW_STUDY_FLAGS(flags," [ ","]");\
3609        Perl_re_printf( aTHX_  "\n");                   \
3610    }});
3611 
3612 /* The below joins as many adjacent EXACTish nodes as possible into a single
3613  * one.  The regop may be changed if the node(s) contain certain sequences that
3614  * require special handling.  The joining is only done if:
3615  * 1) there is room in the current conglomerated node to entirely contain the
3616  *    next one.
3617  * 2) they are the exact same node type
3618  *
3619  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3620  * these get optimized out
3621  *
3622  * XXX khw thinks this should be enhanced to fill EXACT (at least) nodes as full
3623  * as possible, even if that means splitting an existing node so that its first
3624  * part is moved to the preceeding node.  This would maximise the efficiency of
3625  * memEQ during matching.  Elsewhere in this file, khw proposes splitting
3626  * EXACTFish nodes into portions that don't change under folding vs those that
3627  * do.  Those portions that don't change may be the only things in the pattern that
3628  * could be used to find fixed and floating strings.
3629  *
3630  * If a node is to match under /i (folded), the number of characters it matches
3631  * can be different than its character length if it contains a multi-character
3632  * fold.  *min_subtract is set to the total delta number of characters of the
3633  * input nodes.
3634  *
3635  * And *unfolded_multi_char is set to indicate whether or not the node contains
3636  * an unfolded multi-char fold.  This happens when whether the fold is valid or
3637  * not won't be known until runtime; namely for EXACTF nodes that contain LATIN
3638  * SMALL LETTER SHARP S, as only if the target string being matched against
3639  * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose
3640  * folding rules depend on the locale in force at runtime.  (Multi-char folds
3641  * whose components are all above the Latin1 range are not run-time locale
3642  * dependent, and have already been folded by the time this function is
3643  * called.)
3644  *
3645  * This is as good a place as any to discuss the design of handling these
3646  * multi-character fold sequences.  It's been wrong in Perl for a very long
3647  * time.  There are three code points in Unicode whose multi-character folds
3648  * were long ago discovered to mess things up.  The previous designs for
3649  * dealing with these involved assigning a special node for them.  This
3650  * approach doesn't always work, as evidenced by this example:
3651  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
3652  * Both sides fold to "sss", but if the pattern is parsed to create a node that
3653  * would match just the \xDF, it won't be able to handle the case where a
3654  * successful match would have to cross the node's boundary.  The new approach
3655  * that hopefully generally solves the problem generates an EXACTFU_SS node
3656  * that is "sss" in this case.
3657  *
3658  * It turns out that there are problems with all multi-character folds, and not
3659  * just these three.  Now the code is general, for all such cases.  The
3660  * approach taken is:
3661  * 1)   This routine examines each EXACTFish node that could contain multi-
3662  *      character folded sequences.  Since a single character can fold into
3663  *      such a sequence, the minimum match length for this node is less than
3664  *      the number of characters in the node.  This routine returns in
3665  *      *min_subtract how many characters to subtract from the the actual
3666  *      length of the string to get a real minimum match length; it is 0 if
3667  *      there are no multi-char foldeds.  This delta is used by the caller to
3668  *      adjust the min length of the match, and the delta between min and max,
3669  *      so that the optimizer doesn't reject these possibilities based on size
3670  *      constraints.
3671  * 2)   For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
3672  *      is used for an EXACTFU node that contains at least one "ss" sequence in
3673  *      it.  For non-UTF-8 patterns and strings, this is the only case where
3674  *      there is a possible fold length change.  That means that a regular
3675  *      EXACTFU node without UTF-8 involvement doesn't have to concern itself
3676  *      with length changes, and so can be processed faster.  regexec.c takes
3677  *      advantage of this.  Generally, an EXACTFish node that is in UTF-8 is
3678  *      pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't
3679  *      known until runtime).  This saves effort in regex matching.  However,
3680  *      the pre-folding isn't done for non-UTF8 patterns because the fold of
3681  *      the MICRO SIGN requires UTF-8, and we don't want to slow things down by
3682  *      forcing the pattern into UTF8 unless necessary.  Also what EXACTF (and,
3683  *      again, EXACTFL) nodes fold to isn't known until runtime.  The fold
3684  *      possibilities for the non-UTF8 patterns are quite simple, except for
3685  *      the sharp s.  All the ones that don't involve a UTF-8 target string are
3686  *      members of a fold-pair, and arrays are set up for all of them so that
3687  *      the other member of the pair can be found quickly.  Code elsewhere in
3688  *      this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3689  *      'ss', even if the pattern isn't UTF-8.  This avoids the issues
3690  *      described in the next item.
3691  * 3)   A problem remains for unfolded multi-char folds. (These occur when the
3692  *      validity of the fold won't be known until runtime, and so must remain
3693  *      unfolded for now.  This happens for the sharp s in EXACTF and EXACTFA
3694  *      nodes when the pattern isn't in UTF-8.  (Note, BTW, that there cannot
3695  *      be an EXACTF node with a UTF-8 pattern.)  They also occur for various
3696  *      folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3697  *      The reason this is a problem is that the optimizer part of regexec.c
3698  *      (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3699  *      that a character in the pattern corresponds to at most a single
3700  *      character in the target string.  (And I do mean character, and not byte
3701  *      here, unlike other parts of the documentation that have never been
3702  *      updated to account for multibyte Unicode.)  sharp s in EXACTF and
3703  *      EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes
3704  *      it can match "\x{17F}\x{17F}".  These, along with other ones in EXACTFL
3705  *      nodes, violate the assumption, and they are the only instances where it
3706  *      is violated.  I'm reluctant to try to change the assumption, as the
3707  *      code involved is impenetrable to me (khw), so instead the code here
3708  *      punts.  This routine examines EXACTFL nodes, and (when the pattern
3709  *      isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a
3710  *      boolean indicating whether or not the node contains such a fold.  When
3711  *      it is true, the caller sets a flag that later causes the optimizer in
3712  *      this file to not set values for the floating and fixed string lengths,
3713  *      and thus avoids the optimizer code in regexec.c that makes the invalid
3714  *      assumption.  Thus, there is no optimization based on string lengths for
3715  *      EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3716  *      EXACTF and EXACTFA nodes that contain the sharp s.  (The reason the
3717  *      assumption is wrong only in these cases is that all other non-UTF-8
3718  *      folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3719  *      their expanded versions.  (Again, we can't prefold sharp s to 'ss' in
3720  *      EXACTF nodes because we don't know at compile time if it actually
3721  *      matches 'ss' or not.  For EXACTF nodes it will match iff the target
3722  *      string is in UTF-8.  This is in contrast to EXACTFU nodes, where it
3723  *      always matches; and EXACTFA where it never does.  In an EXACTFA node in
3724  *      a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
3725  *      problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3726  *      string would require the pattern to be forced into UTF-8, the overhead
3727  *      of which we want to avoid.  Similarly the unfolded multi-char folds in
3728  *      EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3729  *      locale.)
3730  *
3731  *      Similarly, the code that generates tries doesn't currently handle
3732  *      not-already-folded multi-char folds, and it looks like a pain to change
3733  *      that.  Therefore, trie generation of EXACTFA nodes with the sharp s
3734  *      doesn't work.  Instead, such an EXACTFA is turned into a new regnode,
3735  *      EXACTFA_NO_TRIE, which the trie code knows not to handle.  Most people
3736  *      using /iaa matching will be doing so almost entirely with ASCII
3737  *      strings, so this should rarely be encountered in practice */
3738 
3739 #define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
3740     if (PL_regkind[OP(scan)] == EXACT) \
3741         join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1)
3742 
3743 STATIC U32
3744 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
3745                    UV *min_subtract, bool *unfolded_multi_char,
3746                    U32 flags,regnode *val, U32 depth)
3747 {
3748     /* Merge several consecutive EXACTish nodes into one. */
3749     regnode *n = regnext(scan);
3750     U32 stringok = 1;
3751     regnode *next = scan + NODE_SZ_STR(scan);
3752     U32 merged = 0;
3753     U32 stopnow = 0;
3754 #ifdef DEBUGGING
3755     regnode *stop = scan;
3756     GET_RE_DEBUG_FLAGS_DECL;
3757 #else
3758     PERL_UNUSED_ARG(depth);
3759 #endif
3760 
3761     PERL_ARGS_ASSERT_JOIN_EXACT;
3762 #ifndef EXPERIMENTAL_INPLACESCAN
3763     PERL_UNUSED_ARG(flags);
3764     PERL_UNUSED_ARG(val);
3765 #endif
3766     DEBUG_PEEP("join",scan,depth);
3767 
3768     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
3769      * EXACT ones that are mergeable to the current one. */
3770     while (n
3771            && (PL_regkind[OP(n)] == NOTHING
3772                || (stringok && OP(n) == OP(scan)))
3773            && NEXT_OFF(n)
3774            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3775     {
3776 
3777         if (OP(n) == TAIL || n > next)
3778             stringok = 0;
3779         if (PL_regkind[OP(n)] == NOTHING) {
3780             DEBUG_PEEP("skip:",n,depth);
3781             NEXT_OFF(scan) += NEXT_OFF(n);
3782             next = n + NODE_STEP_REGNODE;
3783 #ifdef DEBUGGING
3784             if (stringok)
3785                 stop = n;
3786 #endif
3787             n = regnext(n);
3788         }
3789         else if (stringok) {
3790             const unsigned int oldl = STR_LEN(scan);
3791             regnode * const nnext = regnext(n);
3792 
3793             /* XXX I (khw) kind of doubt that this works on platforms (should
3794              * Perl ever run on one) where U8_MAX is above 255 because of lots
3795              * of other assumptions */
3796             /* Don't join if the sum can't fit into a single node */
3797             if (oldl + STR_LEN(n) > U8_MAX)
3798                 break;
3799 
3800             DEBUG_PEEP("merg",n,depth);
3801             merged++;
3802 
3803             NEXT_OFF(scan) += NEXT_OFF(n);
3804             STR_LEN(scan) += STR_LEN(n);
3805             next = n + NODE_SZ_STR(n);
3806             /* Now we can overwrite *n : */
3807             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3808 #ifdef DEBUGGING
3809             stop = next - 1;
3810 #endif
3811             n = nnext;
3812             if (stopnow) break;
3813         }
3814 
3815 #ifdef EXPERIMENTAL_INPLACESCAN
3816 	if (flags && !NEXT_OFF(n)) {
3817 	    DEBUG_PEEP("atch", val, depth);
3818 	    if (reg_off_by_arg[OP(n)]) {
3819 		ARG_SET(n, val - n);
3820 	    }
3821 	    else {
3822 		NEXT_OFF(n) = val - n;
3823 	    }
3824 	    stopnow = 1;
3825 	}
3826 #endif
3827     }
3828 
3829     *min_subtract = 0;
3830     *unfolded_multi_char = FALSE;
3831 
3832     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
3833      * can now analyze for sequences of problematic code points.  (Prior to
3834      * this final joining, sequences could have been split over boundaries, and
3835      * hence missed).  The sequences only happen in folding, hence for any
3836      * non-EXACT EXACTish node */
3837     if (OP(scan) != EXACT && OP(scan) != EXACTL) {
3838         U8* s0 = (U8*) STRING(scan);
3839         U8* s = s0;
3840         U8* s_end = s0 + STR_LEN(scan);
3841 
3842         int total_count_delta = 0;  /* Total delta number of characters that
3843                                        multi-char folds expand to */
3844 
3845 	/* One pass is made over the node's string looking for all the
3846 	 * possibilities.  To avoid some tests in the loop, there are two main
3847 	 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3848 	 * non-UTF-8 */
3849 	if (UTF) {
3850             U8* folded = NULL;
3851 
3852             if (OP(scan) == EXACTFL) {
3853                 U8 *d;
3854 
3855                 /* An EXACTFL node would already have been changed to another
3856                  * node type unless there is at least one character in it that
3857                  * is problematic; likely a character whose fold definition
3858                  * won't be known until runtime, and so has yet to be folded.
3859                  * For all but the UTF-8 locale, folds are 1-1 in length, but
3860                  * to handle the UTF-8 case, we need to create a temporary
3861                  * folded copy using UTF-8 locale rules in order to analyze it.
3862                  * This is because our macros that look to see if a sequence is
3863                  * a multi-char fold assume everything is folded (otherwise the
3864                  * tests in those macros would be too complicated and slow).
3865                  * Note that here, the non-problematic folds will have already
3866                  * been done, so we can just copy such characters.  We actually
3867                  * don't completely fold the EXACTFL string.  We skip the
3868                  * unfolded multi-char folds, as that would just create work
3869                  * below to figure out the size they already are */
3870 
3871                 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
3872                 d = folded;
3873                 while (s < s_end) {
3874                     STRLEN s_len = UTF8SKIP(s);
3875                     if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
3876                         Copy(s, d, s_len, U8);
3877                         d += s_len;
3878                     }
3879                     else if (is_FOLDS_TO_MULTI_utf8(s)) {
3880                         *unfolded_multi_char = TRUE;
3881                         Copy(s, d, s_len, U8);
3882                         d += s_len;
3883                     }
3884                     else if (isASCII(*s)) {
3885                         *(d++) = toFOLD(*s);
3886                     }
3887                     else {
3888                         STRLEN len;
3889                         _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL);
3890                         d += len;
3891                     }
3892                     s += s_len;
3893                 }
3894 
3895                 /* Point the remainder of the routine to look at our temporary
3896                  * folded copy */
3897                 s = folded;
3898                 s_end = d;
3899             } /* End of creating folded copy of EXACTFL string */
3900 
3901             /* Examine the string for a multi-character fold sequence.  UTF-8
3902              * patterns have all characters pre-folded by the time this code is
3903              * executed */
3904             while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3905                                      length sequence we are looking for is 2 */
3906 	    {
3907                 int count = 0;  /* How many characters in a multi-char fold */
3908                 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
3909                 if (! len) {    /* Not a multi-char fold: get next char */
3910                     s += UTF8SKIP(s);
3911                     continue;
3912                 }
3913 
3914                 /* Nodes with 'ss' require special handling, except for
3915                  * EXACTFA-ish for which there is no multi-char fold to this */
3916                 if (len == 2 && *s == 's' && *(s+1) == 's'
3917                     && OP(scan) != EXACTFA
3918                     && OP(scan) != EXACTFA_NO_TRIE)
3919                 {
3920                     count = 2;
3921                     if (OP(scan) != EXACTFL) {
3922                         OP(scan) = EXACTFU_SS;
3923                     }
3924                     s += 2;
3925                 }
3926                 else { /* Here is a generic multi-char fold. */
3927                     U8* multi_end  = s + len;
3928 
3929                     /* Count how many characters are in it.  In the case of
3930                      * /aa, no folds which contain ASCII code points are
3931                      * allowed, so check for those, and skip if found. */
3932                     if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) {
3933                         count = utf8_length(s, multi_end);
3934                         s = multi_end;
3935                     }
3936                     else {
3937                         while (s < multi_end) {
3938                             if (isASCII(*s)) {
3939                                 s++;
3940                                 goto next_iteration;
3941                             }
3942                             else {
3943                                 s += UTF8SKIP(s);
3944                             }
3945                             count++;
3946                         }
3947                     }
3948                 }
3949 
3950                 /* The delta is how long the sequence is minus 1 (1 is how long
3951                  * the character that folds to the sequence is) */
3952                 total_count_delta += count - 1;
3953               next_iteration: ;
3954 	    }
3955 
3956             /* We created a temporary folded copy of the string in EXACTFL
3957              * nodes.  Therefore we need to be sure it doesn't go below zero,
3958              * as the real string could be shorter */
3959             if (OP(scan) == EXACTFL) {
3960                 int total_chars = utf8_length((U8*) STRING(scan),
3961                                            (U8*) STRING(scan) + STR_LEN(scan));
3962                 if (total_count_delta > total_chars) {
3963                     total_count_delta = total_chars;
3964                 }
3965             }
3966 
3967             *min_subtract += total_count_delta;
3968             Safefree(folded);
3969 	}
3970 	else if (OP(scan) == EXACTFA) {
3971 
3972             /* Non-UTF-8 pattern, EXACTFA node.  There can't be a multi-char
3973              * fold to the ASCII range (and there are no existing ones in the
3974              * upper latin1 range).  But, as outlined in the comments preceding
3975              * this function, we need to flag any occurrences of the sharp s.
3976              * This character forbids trie formation (because of added
3977              * complexity) */
3978 #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
3979    || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
3980                                       || UNICODE_DOT_DOT_VERSION > 0)
3981 	    while (s < s_end) {
3982                 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
3983                     OP(scan) = EXACTFA_NO_TRIE;
3984                     *unfolded_multi_char = TRUE;
3985                     break;
3986                 }
3987                 s++;
3988             }
3989         }
3990 	else {
3991 
3992             /* Non-UTF-8 pattern, not EXACTFA node.  Look for the multi-char
3993              * folds that are all Latin1.  As explained in the comments
3994              * preceding this function, we look also for the sharp s in EXACTF
3995              * and EXACTFL nodes; it can be in the final position.  Otherwise
3996              * we can stop looking 1 byte earlier because have to find at least
3997              * two characters for a multi-fold */
3998 	    const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
3999                               ? s_end
4000                               : s_end -1;
4001 
4002 	    while (s < upper) {
4003                 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
4004                 if (! len) {    /* Not a multi-char fold. */
4005                     if (*s == LATIN_SMALL_LETTER_SHARP_S
4006                         && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
4007                     {
4008                         *unfolded_multi_char = TRUE;
4009                     }
4010                     s++;
4011                     continue;
4012                 }
4013 
4014                 if (len == 2
4015                     && isALPHA_FOLD_EQ(*s, 's')
4016                     && isALPHA_FOLD_EQ(*(s+1), 's'))
4017                 {
4018 
4019                     /* EXACTF nodes need to know that the minimum length
4020                      * changed so that a sharp s in the string can match this
4021                      * ss in the pattern, but they remain EXACTF nodes, as they
4022                      * won't match this unless the target string is is UTF-8,
4023                      * which we don't know until runtime.  EXACTFL nodes can't
4024                      * transform into EXACTFU nodes */
4025                     if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
4026                         OP(scan) = EXACTFU_SS;
4027                     }
4028 		}
4029 
4030                 *min_subtract += len - 1;
4031                 s += len;
4032 	    }
4033 #endif
4034 	}
4035     }
4036 
4037 #ifdef DEBUGGING
4038     /* Allow dumping but overwriting the collection of skipped
4039      * ops and/or strings with fake optimized ops */
4040     n = scan + NODE_SZ_STR(scan);
4041     while (n <= stop) {
4042 	OP(n) = OPTIMIZED;
4043 	FLAGS(n) = 0;
4044 	NEXT_OFF(n) = 0;
4045         n++;
4046     }
4047 #endif
4048     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
4049     return stopnow;
4050 }
4051 
4052 /* REx optimizer.  Converts nodes into quicker variants "in place".
4053    Finds fixed substrings.  */
4054 
4055 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
4056    to the position after last scanned or to NULL. */
4057 
4058 #define INIT_AND_WITHP \
4059     assert(!and_withp); \
4060     Newx(and_withp,1, regnode_ssc); \
4061     SAVEFREEPV(and_withp)
4062 
4063 
4064 static void
4065 S_unwind_scan_frames(pTHX_ const void *p)
4066 {
4067     scan_frame *f= (scan_frame *)p;
4068     do {
4069         scan_frame *n= f->next_frame;
4070         Safefree(f);
4071         f= n;
4072     } while (f);
4073 }
4074 
4075 
4076 STATIC SSize_t
4077 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
4078                         SSize_t *minlenp, SSize_t *deltap,
4079 			regnode *last,
4080 			scan_data_t *data,
4081 			I32 stopparen,
4082                         U32 recursed_depth,
4083 			regnode_ssc *and_withp,
4084 			U32 flags, U32 depth)
4085 			/* scanp: Start here (read-write). */
4086 			/* deltap: Write maxlen-minlen here. */
4087 			/* last: Stop before this one. */
4088 			/* data: string data about the pattern */
4089 			/* stopparen: treat close N as END */
4090 			/* recursed: which subroutines have we recursed into */
4091 			/* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
4092 {
4093     /* There must be at least this number of characters to match */
4094     SSize_t min = 0;
4095     I32 pars = 0, code;
4096     regnode *scan = *scanp, *next;
4097     SSize_t delta = 0;
4098     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
4099     int is_inf_internal = 0;		/* The studied chunk is infinite */
4100     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
4101     scan_data_t data_fake;
4102     SV *re_trie_maxbuff = NULL;
4103     regnode *first_non_open = scan;
4104     SSize_t stopmin = SSize_t_MAX;
4105     scan_frame *frame = NULL;
4106     GET_RE_DEBUG_FLAGS_DECL;
4107 
4108     PERL_ARGS_ASSERT_STUDY_CHUNK;
4109     RExC_study_started= 1;
4110 
4111 
4112     if ( depth == 0 ) {
4113         while (first_non_open && OP(first_non_open) == OPEN)
4114             first_non_open=regnext(first_non_open);
4115     }
4116 
4117 
4118   fake_study_recurse:
4119     DEBUG_r(
4120         RExC_study_chunk_recursed_count++;
4121     );
4122     DEBUG_OPTIMISE_MORE_r(
4123     {
4124         Perl_re_indentf( aTHX_  "study_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p",
4125             depth, (long)stopparen,
4126             (unsigned long)RExC_study_chunk_recursed_count,
4127             (unsigned long)depth, (unsigned long)recursed_depth,
4128             scan,
4129             last);
4130         if (recursed_depth) {
4131             U32 i;
4132             U32 j;
4133             for ( j = 0 ; j < recursed_depth ; j++ ) {
4134                 for ( i = 0 ; i < (U32)RExC_npar ; i++ ) {
4135                     if (
4136                         PAREN_TEST(RExC_study_chunk_recursed +
4137                                    ( j * RExC_study_chunk_recursed_bytes), i )
4138                         && (
4139                             !j ||
4140                             !PAREN_TEST(RExC_study_chunk_recursed +
4141                                    (( j - 1 ) * RExC_study_chunk_recursed_bytes), i)
4142                         )
4143                     ) {
4144                         Perl_re_printf( aTHX_ " %d",(int)i);
4145                         break;
4146                     }
4147                 }
4148                 if ( j + 1 < recursed_depth ) {
4149                     Perl_re_printf( aTHX_  ",");
4150                 }
4151             }
4152         }
4153         Perl_re_printf( aTHX_ "\n");
4154     }
4155     );
4156     while ( scan && OP(scan) != END && scan < last ){
4157         UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
4158                                    node length to get a real minimum (because
4159                                    the folded version may be shorter) */
4160 	bool unfolded_multi_char = FALSE;
4161 	/* Peephole optimizer: */
4162         DEBUG_STUDYDATA("Peep:", data, depth);
4163         DEBUG_PEEP("Peep", scan, depth);
4164 
4165 
4166         /* The reason we do this here is that we need to deal with things like
4167          * /(?:f)(?:o)(?:o)/ which cant be dealt with by the normal EXACT
4168          * parsing code, as each (?:..) is handled by a different invocation of
4169          * reg() -- Yves
4170          */
4171         JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
4172 
4173 	/* Follow the next-chain of the current node and optimize
4174 	   away all the NOTHINGs from it.  */
4175 	if (OP(scan) != CURLYX) {
4176 	    const int max = (reg_off_by_arg[OP(scan)]
4177 		       ? I32_MAX
4178 		       /* I32 may be smaller than U16 on CRAYs! */
4179 		       : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
4180 	    int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
4181 	    int noff;
4182 	    regnode *n = scan;
4183 
4184 	    /* Skip NOTHING and LONGJMP. */
4185 	    while ((n = regnext(n))
4186 		   && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
4187 		       || ((OP(n) == LONGJMP) && (noff = ARG(n))))
4188 		   && off + noff < max)
4189 		off += noff;
4190 	    if (reg_off_by_arg[OP(scan)])
4191 		ARG(scan) = off;
4192 	    else
4193 		NEXT_OFF(scan) = off;
4194 	}
4195 
4196 	/* The principal pseudo-switch.  Cannot be a switch, since we
4197 	   look into several different things.  */
4198         if ( OP(scan) == DEFINEP ) {
4199             SSize_t minlen = 0;
4200             SSize_t deltanext = 0;
4201             SSize_t fake_last_close = 0;
4202             I32 f = SCF_IN_DEFINE;
4203 
4204             StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4205             scan = regnext(scan);
4206             assert( OP(scan) == IFTHEN );
4207             DEBUG_PEEP("expect IFTHEN", scan, depth);
4208 
4209             data_fake.last_closep= &fake_last_close;
4210             minlen = *minlenp;
4211             next = regnext(scan);
4212             scan = NEXTOPER(NEXTOPER(scan));
4213             DEBUG_PEEP("scan", scan, depth);
4214             DEBUG_PEEP("next", next, depth);
4215 
4216             /* we suppose the run is continuous, last=next...
4217              * NOTE we dont use the return here! */
4218             (void)study_chunk(pRExC_state, &scan, &minlen,
4219                               &deltanext, next, &data_fake, stopparen,
4220                               recursed_depth, NULL, f, depth+1);
4221 
4222             scan = next;
4223         } else
4224         if (
4225             OP(scan) == BRANCH  ||
4226             OP(scan) == BRANCHJ ||
4227             OP(scan) == IFTHEN
4228         ) {
4229 	    next = regnext(scan);
4230 	    code = OP(scan);
4231 
4232             /* The op(next)==code check below is to see if we
4233              * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN"
4234              * IFTHEN is special as it might not appear in pairs.
4235              * Not sure whether BRANCH-BRANCHJ is possible, regardless
4236              * we dont handle it cleanly. */
4237 	    if (OP(next) == code || code == IFTHEN) {
4238                 /* NOTE - There is similar code to this block below for
4239                  * handling TRIE nodes on a re-study.  If you change stuff here
4240                  * check there too. */
4241 		SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
4242 		regnode_ssc accum;
4243 		regnode * const startbranch=scan;
4244 
4245                 if (flags & SCF_DO_SUBSTR) {
4246                     /* Cannot merge strings after this. */
4247                     scan_commit(pRExC_state, data, minlenp, is_inf);
4248                 }
4249 
4250                 if (flags & SCF_DO_STCLASS)
4251 		    ssc_init_zero(pRExC_state, &accum);
4252 
4253 		while (OP(scan) == code) {
4254 		    SSize_t deltanext, minnext, fake;
4255 		    I32 f = 0;
4256 		    regnode_ssc this_class;
4257 
4258                     DEBUG_PEEP("Branch", scan, depth);
4259 
4260 		    num++;
4261                     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4262 		    if (data) {
4263 			data_fake.whilem_c = data->whilem_c;
4264 			data_fake.last_closep = data->last_closep;
4265 		    }
4266 		    else
4267 			data_fake.last_closep = &fake;
4268 
4269 		    data_fake.pos_delta = delta;
4270 		    next = regnext(scan);
4271 
4272                     scan = NEXTOPER(scan); /* everything */
4273                     if (code != BRANCH)    /* everything but BRANCH */
4274 			scan = NEXTOPER(scan);
4275 
4276 		    if (flags & SCF_DO_STCLASS) {
4277 			ssc_init(pRExC_state, &this_class);
4278 			data_fake.start_class = &this_class;
4279 			f = SCF_DO_STCLASS_AND;
4280 		    }
4281 		    if (flags & SCF_WHILEM_VISITED_POS)
4282 			f |= SCF_WHILEM_VISITED_POS;
4283 
4284 		    /* we suppose the run is continuous, last=next...*/
4285 		    minnext = study_chunk(pRExC_state, &scan, minlenp,
4286                                       &deltanext, next, &data_fake, stopparen,
4287                                       recursed_depth, NULL, f,depth+1);
4288 
4289 		    if (min1 > minnext)
4290 			min1 = minnext;
4291 		    if (deltanext == SSize_t_MAX) {
4292 			is_inf = is_inf_internal = 1;
4293 			max1 = SSize_t_MAX;
4294 		    } else if (max1 < minnext + deltanext)
4295 			max1 = minnext + deltanext;
4296 		    scan = next;
4297 		    if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4298 			pars++;
4299 	            if (data_fake.flags & SCF_SEEN_ACCEPT) {
4300 	                if ( stopmin > minnext)
4301 	                    stopmin = min + min1;
4302 	                flags &= ~SCF_DO_SUBSTR;
4303 	                if (data)
4304 	                    data->flags |= SCF_SEEN_ACCEPT;
4305 	            }
4306 		    if (data) {
4307 			if (data_fake.flags & SF_HAS_EVAL)
4308 			    data->flags |= SF_HAS_EVAL;
4309 			data->whilem_c = data_fake.whilem_c;
4310 		    }
4311 		    if (flags & SCF_DO_STCLASS)
4312 			ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
4313 		}
4314 		if (code == IFTHEN && num < 2) /* Empty ELSE branch */
4315 		    min1 = 0;
4316 		if (flags & SCF_DO_SUBSTR) {
4317 		    data->pos_min += min1;
4318 		    if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
4319 		        data->pos_delta = SSize_t_MAX;
4320 		    else
4321 		        data->pos_delta += max1 - min1;
4322 		    if (max1 != min1 || is_inf)
4323 			data->longest = &(data->longest_float);
4324 		}
4325 		min += min1;
4326 		if (delta == SSize_t_MAX
4327 		 || SSize_t_MAX - delta - (max1 - min1) < 0)
4328 		    delta = SSize_t_MAX;
4329 		else
4330 		    delta += max1 - min1;
4331 		if (flags & SCF_DO_STCLASS_OR) {
4332 		    ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
4333 		    if (min1) {
4334 			ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4335 			flags &= ~SCF_DO_STCLASS;
4336 		    }
4337 		}
4338 		else if (flags & SCF_DO_STCLASS_AND) {
4339 		    if (min1) {
4340 			ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
4341 			flags &= ~SCF_DO_STCLASS;
4342 		    }
4343 		    else {
4344 			/* Switch to OR mode: cache the old value of
4345 			 * data->start_class */
4346 			INIT_AND_WITHP;
4347 			StructCopy(data->start_class, and_withp, regnode_ssc);
4348 			flags &= ~SCF_DO_STCLASS_AND;
4349 			StructCopy(&accum, data->start_class, regnode_ssc);
4350 			flags |= SCF_DO_STCLASS_OR;
4351 		    }
4352 		}
4353 
4354                 if (PERL_ENABLE_TRIE_OPTIMISATION &&
4355                         OP( startbranch ) == BRANCH )
4356                 {
4357 		/* demq.
4358 
4359                    Assuming this was/is a branch we are dealing with: 'scan'
4360                    now points at the item that follows the branch sequence,
4361                    whatever it is. We now start at the beginning of the
4362                    sequence and look for subsequences of
4363 
4364 		   BRANCH->EXACT=>x1
4365 		   BRANCH->EXACT=>x2
4366 		   tail
4367 
4368                    which would be constructed from a pattern like
4369                    /A|LIST|OF|WORDS/
4370 
4371 		   If we can find such a subsequence we need to turn the first
4372 		   element into a trie and then add the subsequent branch exact
4373 		   strings to the trie.
4374 
4375 		   We have two cases
4376 
4377                      1. patterns where the whole set of branches can be
4378                         converted.
4379 
4380 		     2. patterns where only a subset can be converted.
4381 
4382 		   In case 1 we can replace the whole set with a single regop
4383 		   for the trie. In case 2 we need to keep the start and end
4384 		   branches so
4385 
4386 		     'BRANCH EXACT; BRANCH EXACT; BRANCH X'
4387 		     becomes BRANCH TRIE; BRANCH X;
4388 
4389 		  There is an additional case, that being where there is a
4390 		  common prefix, which gets split out into an EXACT like node
4391 		  preceding the TRIE node.
4392 
4393 		  If x(1..n)==tail then we can do a simple trie, if not we make
4394 		  a "jump" trie, such that when we match the appropriate word
4395 		  we "jump" to the appropriate tail node. Essentially we turn
4396 		  a nested if into a case structure of sorts.
4397 
4398 		*/
4399 
4400 		    int made=0;
4401 		    if (!re_trie_maxbuff) {
4402 			re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
4403 			if (!SvIOK(re_trie_maxbuff))
4404 			    sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
4405 		    }
4406                     if ( SvIV(re_trie_maxbuff)>=0  ) {
4407                         regnode *cur;
4408                         regnode *first = (regnode *)NULL;
4409                         regnode *last = (regnode *)NULL;
4410                         regnode *tail = scan;
4411                         U8 trietype = 0;
4412                         U32 count=0;
4413 
4414                         /* var tail is used because there may be a TAIL
4415                            regop in the way. Ie, the exacts will point to the
4416                            thing following the TAIL, but the last branch will
4417                            point at the TAIL. So we advance tail. If we
4418                            have nested (?:) we may have to move through several
4419                            tails.
4420                          */
4421 
4422                         while ( OP( tail ) == TAIL ) {
4423                             /* this is the TAIL generated by (?:) */
4424                             tail = regnext( tail );
4425                         }
4426 
4427 
4428                         DEBUG_TRIE_COMPILE_r({
4429                             regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state);
4430                             Perl_re_indentf( aTHX_  "%s %"UVuf":%s\n",
4431                               depth+1,
4432                               "Looking for TRIE'able sequences. Tail node is ",
4433                               (UV)(tail - RExC_emit_start),
4434                               SvPV_nolen_const( RExC_mysv )
4435                             );
4436                         });
4437 
4438                         /*
4439 
4440                             Step through the branches
4441                                 cur represents each branch,
4442                                 noper is the first thing to be matched as part
4443                                       of that branch
4444                                 noper_next is the regnext() of that node.
4445 
4446                             We normally handle a case like this
4447                             /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
4448                             support building with NOJUMPTRIE, which restricts
4449                             the trie logic to structures like /FOO|BAR/.
4450 
4451                             If noper is a trieable nodetype then the branch is
4452                             a possible optimization target. If we are building
4453                             under NOJUMPTRIE then we require that noper_next is
4454                             the same as scan (our current position in the regex
4455                             program).
4456 
4457                             Once we have two or more consecutive such branches
4458                             we can create a trie of the EXACT's contents and
4459                             stitch it in place into the program.
4460 
4461                             If the sequence represents all of the branches in
4462                             the alternation we replace the entire thing with a
4463                             single TRIE node.
4464 
4465                             Otherwise when it is a subsequence we need to
4466                             stitch it in place and replace only the relevant
4467                             branches. This means the first branch has to remain
4468                             as it is used by the alternation logic, and its
4469                             next pointer, and needs to be repointed at the item
4470                             on the branch chain following the last branch we
4471                             have optimized away.
4472 
4473                             This could be either a BRANCH, in which case the
4474                             subsequence is internal, or it could be the item
4475                             following the branch sequence in which case the
4476                             subsequence is at the end (which does not
4477                             necessarily mean the first node is the start of the
4478                             alternation).
4479 
4480                             TRIE_TYPE(X) is a define which maps the optype to a
4481                             trietype.
4482 
4483                                 optype          |  trietype
4484                                 ----------------+-----------
4485                                 NOTHING         | NOTHING
4486                                 EXACT           | EXACT
4487                                 EXACTFU         | EXACTFU
4488                                 EXACTFU_SS      | EXACTFU
4489                                 EXACTFA         | EXACTFA
4490                                 EXACTL          | EXACTL
4491                                 EXACTFLU8       | EXACTFLU8
4492 
4493 
4494                         */
4495 #define TRIE_TYPE(X) ( ( NOTHING == (X) )                                   \
4496                        ? NOTHING                                            \
4497                        : ( EXACT == (X) )                                   \
4498                          ? EXACT                                            \
4499                          : ( EXACTFU == (X) || EXACTFU_SS == (X) )          \
4500                            ? EXACTFU                                        \
4501                            : ( EXACTFA == (X) )                             \
4502                              ? EXACTFA                                      \
4503                              : ( EXACTL == (X) )                            \
4504                                ? EXACTL                                     \
4505                                : ( EXACTFLU8 == (X) )                        \
4506                                  ? EXACTFLU8                                 \
4507                                  : 0 )
4508 
4509                         /* dont use tail as the end marker for this traverse */
4510                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
4511                             regnode * const noper = NEXTOPER( cur );
4512                             U8 noper_type = OP( noper );
4513                             U8 noper_trietype = TRIE_TYPE( noper_type );
4514 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
4515                             regnode * const noper_next = regnext( noper );
4516                             U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
4517                             U8 noper_next_trietype = (noper_next && noper_next < tail) ? TRIE_TYPE( noper_next_type ) :0;
4518 #endif
4519 
4520                             DEBUG_TRIE_COMPILE_r({
4521                                 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4522                                 Perl_re_indentf( aTHX_  "- %d:%s (%d)",
4523                                    depth+1,
4524                                    REG_NODE_NUM(cur), SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) );
4525 
4526                                 regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state);
4527                                 Perl_re_printf( aTHX_  " -> %d:%s",
4528                                     REG_NODE_NUM(noper), SvPV_nolen_const(RExC_mysv));
4529 
4530                                 if ( noper_next ) {
4531                                   regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state);
4532                                   Perl_re_printf( aTHX_ "\t=> %d:%s\t",
4533                                     REG_NODE_NUM(noper_next), SvPV_nolen_const(RExC_mysv));
4534                                 }
4535                                 Perl_re_printf( aTHX_  "(First==%d,Last==%d,Cur==%d,tt==%s,ntt==%s,nntt==%s)\n",
4536                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
4537 				   PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
4538 				);
4539                             });
4540 
4541                             /* Is noper a trieable nodetype that can be merged
4542                              * with the current trie (if there is one)? */
4543                             if ( noper_trietype
4544                                   &&
4545                                   (
4546                                         ( noper_trietype == NOTHING )
4547                                         || ( trietype == NOTHING )
4548                                         || ( trietype == noper_trietype )
4549                                   )
4550 #ifdef NOJUMPTRIE
4551                                   && noper_next >= tail
4552 #endif
4553                                   && count < U16_MAX)
4554                             {
4555                                 /* Handle mergable triable node Either we are
4556                                  * the first node in a new trieable sequence,
4557                                  * in which case we do some bookkeeping,
4558                                  * otherwise we update the end pointer. */
4559                                 if ( !first ) {
4560                                     first = cur;
4561 				    if ( noper_trietype == NOTHING ) {
4562 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
4563 					regnode * const noper_next = regnext( noper );
4564                                         U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
4565 					U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
4566 #endif
4567 
4568                                         if ( noper_next_trietype ) {
4569 					    trietype = noper_next_trietype;
4570                                         } else if (noper_next_type)  {
4571                                             /* a NOTHING regop is 1 regop wide.
4572                                              * We need at least two for a trie
4573                                              * so we can't merge this in */
4574                                             first = NULL;
4575                                         }
4576                                     } else {
4577                                         trietype = noper_trietype;
4578                                     }
4579                                 } else {
4580                                     if ( trietype == NOTHING )
4581                                         trietype = noper_trietype;
4582                                     last = cur;
4583                                 }
4584 				if (first)
4585 				    count++;
4586                             } /* end handle mergable triable node */
4587                             else {
4588                                 /* handle unmergable node -
4589                                  * noper may either be a triable node which can
4590                                  * not be tried together with the current trie,
4591                                  * or a non triable node */
4592                                 if ( last ) {
4593                                     /* If last is set and trietype is not
4594                                      * NOTHING then we have found at least two
4595                                      * triable branch sequences in a row of a
4596                                      * similar trietype so we can turn them
4597                                      * into a trie. If/when we allow NOTHING to
4598                                      * start a trie sequence this condition
4599                                      * will be required, and it isn't expensive
4600                                      * so we leave it in for now. */
4601                                     if ( trietype && trietype != NOTHING )
4602                                         make_trie( pRExC_state,
4603                                                 startbranch, first, cur, tail,
4604                                                 count, trietype, depth+1 );
4605                                     last = NULL; /* note: we clear/update
4606                                                     first, trietype etc below,
4607                                                     so we dont do it here */
4608                                 }
4609                                 if ( noper_trietype
4610 #ifdef NOJUMPTRIE
4611                                      && noper_next >= tail
4612 #endif
4613                                 ){
4614                                     /* noper is triable, so we can start a new
4615                                      * trie sequence */
4616                                     count = 1;
4617                                     first = cur;
4618                                     trietype = noper_trietype;
4619                                 } else if (first) {
4620                                     /* if we already saw a first but the
4621                                      * current node is not triable then we have
4622                                      * to reset the first information. */
4623                                     count = 0;
4624                                     first = NULL;
4625                                     trietype = 0;
4626                                 }
4627                             } /* end handle unmergable node */
4628                         } /* loop over branches */
4629                         DEBUG_TRIE_COMPILE_r({
4630                             regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4631                             Perl_re_indentf( aTHX_  "- %s (%d) <SCAN FINISHED> ",
4632                               depth+1, SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur));
4633                             Perl_re_printf( aTHX_  "(First==%d, Last==%d, Cur==%d, tt==%s)\n",
4634                                REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
4635                                PL_reg_name[trietype]
4636                             );
4637 
4638                         });
4639                         if ( last && trietype ) {
4640                             if ( trietype != NOTHING ) {
4641                                 /* the last branch of the sequence was part of
4642                                  * a trie, so we have to construct it here
4643                                  * outside of the loop */
4644                                 made= make_trie( pRExC_state, startbranch,
4645                                                  first, scan, tail, count,
4646                                                  trietype, depth+1 );
4647 #ifdef TRIE_STUDY_OPT
4648                                 if ( ((made == MADE_EXACT_TRIE &&
4649                                      startbranch == first)
4650                                      || ( first_non_open == first )) &&
4651                                      depth==0 ) {
4652                                     flags |= SCF_TRIE_RESTUDY;
4653                                     if ( startbranch == first
4654                                          && scan >= tail )
4655                                     {
4656                                         RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
4657                                     }
4658                                 }
4659 #endif
4660                             } else {
4661                                 /* at this point we know whatever we have is a
4662                                  * NOTHING sequence/branch AND if 'startbranch'
4663                                  * is 'first' then we can turn the whole thing
4664                                  * into a NOTHING
4665                                  */
4666                                 if ( startbranch == first ) {
4667                                     regnode *opt;
4668                                     /* the entire thing is a NOTHING sequence,
4669                                      * something like this: (?:|) So we can
4670                                      * turn it into a plain NOTHING op. */
4671                                     DEBUG_TRIE_COMPILE_r({
4672                                         regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4673                                         Perl_re_indentf( aTHX_  "- %s (%d) <NOTHING BRANCH SEQUENCE>\n",
4674                                           depth+1,
4675                                           SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur));
4676 
4677                                     });
4678                                     OP(startbranch)= NOTHING;
4679                                     NEXT_OFF(startbranch)= tail - startbranch;
4680                                     for ( opt= startbranch + 1; opt < tail ; opt++ )
4681                                         OP(opt)= OPTIMIZED;
4682                                 }
4683                             }
4684                         } /* end if ( last) */
4685                     } /* TRIE_MAXBUF is non zero */
4686 
4687                 } /* do trie */
4688 
4689 	    }
4690 	    else if ( code == BRANCHJ ) {  /* single branch is optimized. */
4691 		scan = NEXTOPER(NEXTOPER(scan));
4692 	    } else			/* single branch is optimized. */
4693 		scan = NEXTOPER(scan);
4694 	    continue;
4695         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB) {
4696             I32 paren = 0;
4697             regnode *start = NULL;
4698             regnode *end = NULL;
4699             U32 my_recursed_depth= recursed_depth;
4700 
4701             if (OP(scan) != SUSPEND) { /* GOSUB */
4702                 /* Do setup, note this code has side effects beyond
4703                  * the rest of this block. Specifically setting
4704                  * RExC_recurse[] must happen at least once during
4705                  * study_chunk(). */
4706                 paren = ARG(scan);
4707                 RExC_recurse[ARG2L(scan)] = scan;
4708                 start = RExC_open_parens[paren];
4709                 end   = RExC_close_parens[paren];
4710 
4711                 /* NOTE we MUST always execute the above code, even
4712                  * if we do nothing with a GOSUB */
4713                 if (
4714                     ( flags & SCF_IN_DEFINE )
4715                     ||
4716                     (
4717                         (is_inf_internal || is_inf || (data && data->flags & SF_IS_INF))
4718                         &&
4719                         ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 )
4720                     )
4721                 ) {
4722                     /* no need to do anything here if we are in a define. */
4723                     /* or we are after some kind of infinite construct
4724                      * so we can skip recursing into this item.
4725                      * Since it is infinite we will not change the maxlen
4726                      * or delta, and if we miss something that might raise
4727                      * the minlen it will merely pessimise a little.
4728                      *
4729                      * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/
4730                      * might result in a minlen of 1 and not of 4,
4731                      * but this doesn't make us mismatch, just try a bit
4732                      * harder than we should.
4733                      * */
4734                     scan= regnext(scan);
4735                     continue;
4736                 }
4737 
4738                 if (
4739                     !recursed_depth
4740                     ||
4741                     !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
4742                 ) {
4743                     /* it is quite possible that there are more efficient ways
4744                      * to do this. We maintain a bitmap per level of recursion
4745                      * of which patterns we have entered so we can detect if a
4746                      * pattern creates a possible infinite loop. When we
4747                      * recurse down a level we copy the previous levels bitmap
4748                      * down. When we are at recursion level 0 we zero the top
4749                      * level bitmap. It would be nice to implement a different
4750                      * more efficient way of doing this. In particular the top
4751                      * level bitmap may be unnecessary.
4752                      */
4753                     if (!recursed_depth) {
4754                         Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
4755                     } else {
4756                         Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
4757                              RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
4758                              RExC_study_chunk_recursed_bytes, U8);
4759                     }
4760                     /* we havent recursed into this paren yet, so recurse into it */
4761                     DEBUG_STUDYDATA("gosub-set:", data,depth);
4762                     PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
4763                     my_recursed_depth= recursed_depth + 1;
4764                 } else {
4765                     DEBUG_STUDYDATA("gosub-inf:", data,depth);
4766                     /* some form of infinite recursion, assume infinite length
4767                      * */
4768                     if (flags & SCF_DO_SUBSTR) {
4769                         scan_commit(pRExC_state, data, minlenp, is_inf);
4770                         data->longest = &(data->longest_float);
4771                     }
4772                     is_inf = is_inf_internal = 1;
4773                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4774                         ssc_anything(data->start_class);
4775                     flags &= ~SCF_DO_STCLASS;
4776 
4777                     start= NULL; /* reset start so we dont recurse later on. */
4778 	        }
4779             } else {
4780 	        paren = stopparen;
4781                 start = scan + 2;
4782 	        end = regnext(scan);
4783 	    }
4784             if (start) {
4785                 scan_frame *newframe;
4786                 assert(end);
4787                 if (!RExC_frame_last) {
4788                     Newxz(newframe, 1, scan_frame);
4789                     SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe);
4790                     RExC_frame_head= newframe;
4791                     RExC_frame_count++;
4792                 } else if (!RExC_frame_last->next_frame) {
4793                     Newxz(newframe,1,scan_frame);
4794                     RExC_frame_last->next_frame= newframe;
4795                     newframe->prev_frame= RExC_frame_last;
4796                     RExC_frame_count++;
4797                 } else {
4798                     newframe= RExC_frame_last->next_frame;
4799                 }
4800                 RExC_frame_last= newframe;
4801 
4802                 newframe->next_regnode = regnext(scan);
4803                 newframe->last_regnode = last;
4804                 newframe->stopparen = stopparen;
4805                 newframe->prev_recursed_depth = recursed_depth;
4806                 newframe->this_prev_frame= frame;
4807 
4808                 DEBUG_STUDYDATA("frame-new:",data,depth);
4809                 DEBUG_PEEP("fnew", scan, depth);
4810 
4811 	        frame = newframe;
4812 	        scan =  start;
4813 	        stopparen = paren;
4814 	        last = end;
4815                 depth = depth + 1;
4816                 recursed_depth= my_recursed_depth;
4817 
4818 	        continue;
4819 	    }
4820 	}
4821 	else if (OP(scan) == EXACT || OP(scan) == EXACTL) {
4822 	    SSize_t l = STR_LEN(scan);
4823 	    UV uc;
4824 	    if (UTF) {
4825 		const U8 * const s = (U8*)STRING(scan);
4826 		uc = utf8_to_uvchr_buf(s, s + l, NULL);
4827 		l = utf8_length(s, s + l);
4828 	    } else {
4829 		uc = *((U8*)STRING(scan));
4830 	    }
4831 	    min += l;
4832 	    if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
4833 		/* The code below prefers earlier match for fixed
4834 		   offset, later match for variable offset.  */
4835 		if (data->last_end == -1) { /* Update the start info. */
4836 		    data->last_start_min = data->pos_min;
4837  		    data->last_start_max = is_inf
4838  			? SSize_t_MAX : data->pos_min + data->pos_delta;
4839 		}
4840 		sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
4841 		if (UTF)
4842 		    SvUTF8_on(data->last_found);
4843 		{
4844 		    SV * const sv = data->last_found;
4845 		    MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4846 			mg_find(sv, PERL_MAGIC_utf8) : NULL;
4847 		    if (mg && mg->mg_len >= 0)
4848 			mg->mg_len += utf8_length((U8*)STRING(scan),
4849                                               (U8*)STRING(scan)+STR_LEN(scan));
4850 		}
4851 		data->last_end = data->pos_min + l;
4852 		data->pos_min += l; /* As in the first entry. */
4853 		data->flags &= ~SF_BEFORE_EOL;
4854 	    }
4855 
4856             /* ANDing the code point leaves at most it, and not in locale, and
4857              * can't match null string */
4858 	    if (flags & SCF_DO_STCLASS_AND) {
4859                 ssc_cp_and(data->start_class, uc);
4860                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4861                 ssc_clear_locale(data->start_class);
4862 	    }
4863 	    else if (flags & SCF_DO_STCLASS_OR) {
4864                 ssc_add_cp(data->start_class, uc);
4865 		ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4866 
4867                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4868                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4869 	    }
4870 	    flags &= ~SCF_DO_STCLASS;
4871 	}
4872         else if (PL_regkind[OP(scan)] == EXACT) {
4873             /* But OP != EXACT!, so is EXACTFish */
4874 	    SSize_t l = STR_LEN(scan);
4875             const U8 * s = (U8*)STRING(scan);
4876 
4877 	    /* Search for fixed substrings supports EXACT only. */
4878 	    if (flags & SCF_DO_SUBSTR) {
4879 		assert(data);
4880                 scan_commit(pRExC_state, data, minlenp, is_inf);
4881 	    }
4882 	    if (UTF) {
4883 		l = utf8_length(s, s + l);
4884 	    }
4885 	    if (unfolded_multi_char) {
4886                 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
4887 	    }
4888 	    min += l - min_subtract;
4889             assert (min >= 0);
4890             delta += min_subtract;
4891 	    if (flags & SCF_DO_SUBSTR) {
4892 		data->pos_min += l - min_subtract;
4893 		if (data->pos_min < 0) {
4894                     data->pos_min = 0;
4895                 }
4896                 data->pos_delta += min_subtract;
4897 		if (min_subtract) {
4898 		    data->longest = &(data->longest_float);
4899 		}
4900 	    }
4901 
4902             if (flags & SCF_DO_STCLASS) {
4903                 SV* EXACTF_invlist = _make_exactf_invlist(pRExC_state, scan);
4904 
4905                 assert(EXACTF_invlist);
4906                 if (flags & SCF_DO_STCLASS_AND) {
4907                     if (OP(scan) != EXACTFL)
4908                         ssc_clear_locale(data->start_class);
4909                     ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4910                     ANYOF_POSIXL_ZERO(data->start_class);
4911                     ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
4912                 }
4913                 else {  /* SCF_DO_STCLASS_OR */
4914                     ssc_union(data->start_class, EXACTF_invlist, FALSE);
4915                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4916 
4917                     /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4918                     ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4919                 }
4920                 flags &= ~SCF_DO_STCLASS;
4921                 SvREFCNT_dec(EXACTF_invlist);
4922             }
4923 	}
4924 	else if (REGNODE_VARIES(OP(scan))) {
4925 	    SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
4926 	    I32 fl = 0, f = flags;
4927 	    regnode * const oscan = scan;
4928 	    regnode_ssc this_class;
4929 	    regnode_ssc *oclass = NULL;
4930 	    I32 next_is_eval = 0;
4931 
4932 	    switch (PL_regkind[OP(scan)]) {
4933 	    case WHILEM:		/* End of (?:...)* . */
4934 		scan = NEXTOPER(scan);
4935 		goto finish;
4936 	    case PLUS:
4937 		if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
4938 		    next = NEXTOPER(scan);
4939 		    if (OP(next) == EXACT
4940                         || OP(next) == EXACTL
4941                         || (flags & SCF_DO_STCLASS))
4942                     {
4943 			mincount = 1;
4944 			maxcount = REG_INFTY;
4945 			next = regnext(scan);
4946 			scan = NEXTOPER(scan);
4947 			goto do_curly;
4948 		    }
4949 		}
4950 		if (flags & SCF_DO_SUBSTR)
4951 		    data->pos_min++;
4952 		min++;
4953 		/* FALLTHROUGH */
4954 	    case STAR:
4955 		if (flags & SCF_DO_STCLASS) {
4956 		    mincount = 0;
4957 		    maxcount = REG_INFTY;
4958 		    next = regnext(scan);
4959 		    scan = NEXTOPER(scan);
4960 		    goto do_curly;
4961 		}
4962 		if (flags & SCF_DO_SUBSTR) {
4963                     scan_commit(pRExC_state, data, minlenp, is_inf);
4964                     /* Cannot extend fixed substrings */
4965 		    data->longest = &(data->longest_float);
4966 		}
4967                 is_inf = is_inf_internal = 1;
4968                 scan = regnext(scan);
4969 		goto optimize_curly_tail;
4970 	    case CURLY:
4971 	        if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
4972 	            && (scan->flags == stopparen))
4973 		{
4974 		    mincount = 1;
4975 		    maxcount = 1;
4976 		} else {
4977 		    mincount = ARG1(scan);
4978 		    maxcount = ARG2(scan);
4979 		}
4980 		next = regnext(scan);
4981 		if (OP(scan) == CURLYX) {
4982 		    I32 lp = (data ? *(data->last_closep) : 0);
4983 		    scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
4984 		}
4985 		scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4986 		next_is_eval = (OP(scan) == EVAL);
4987 	      do_curly:
4988 		if (flags & SCF_DO_SUBSTR) {
4989                     if (mincount == 0)
4990                         scan_commit(pRExC_state, data, minlenp, is_inf);
4991                     /* Cannot extend fixed substrings */
4992 		    pos_before = data->pos_min;
4993 		}
4994 		if (data) {
4995 		    fl = data->flags;
4996 		    data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
4997 		    if (is_inf)
4998 			data->flags |= SF_IS_INF;
4999 		}
5000 		if (flags & SCF_DO_STCLASS) {
5001 		    ssc_init(pRExC_state, &this_class);
5002 		    oclass = data->start_class;
5003 		    data->start_class = &this_class;
5004 		    f |= SCF_DO_STCLASS_AND;
5005 		    f &= ~SCF_DO_STCLASS_OR;
5006 		}
5007 	        /* Exclude from super-linear cache processing any {n,m}
5008 		   regops for which the combination of input pos and regex
5009 		   pos is not enough information to determine if a match
5010 		   will be possible.
5011 
5012 		   For example, in the regex /foo(bar\s*){4,8}baz/ with the
5013 		   regex pos at the \s*, the prospects for a match depend not
5014 		   only on the input position but also on how many (bar\s*)
5015 		   repeats into the {4,8} we are. */
5016                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
5017 		    f &= ~SCF_WHILEM_VISITED_POS;
5018 
5019 		/* This will finish on WHILEM, setting scan, or on NULL: */
5020 		minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
5021                                   last, data, stopparen, recursed_depth, NULL,
5022                                   (mincount == 0
5023                                    ? (f & ~SCF_DO_SUBSTR)
5024                                    : f)
5025                                   ,depth+1);
5026 
5027 		if (flags & SCF_DO_STCLASS)
5028 		    data->start_class = oclass;
5029 		if (mincount == 0 || minnext == 0) {
5030 		    if (flags & SCF_DO_STCLASS_OR) {
5031 			ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5032 		    }
5033 		    else if (flags & SCF_DO_STCLASS_AND) {
5034 			/* Switch to OR mode: cache the old value of
5035 			 * data->start_class */
5036 			INIT_AND_WITHP;
5037 			StructCopy(data->start_class, and_withp, regnode_ssc);
5038 			flags &= ~SCF_DO_STCLASS_AND;
5039 			StructCopy(&this_class, data->start_class, regnode_ssc);
5040 			flags |= SCF_DO_STCLASS_OR;
5041                         ANYOF_FLAGS(data->start_class)
5042                                                 |= SSC_MATCHES_EMPTY_STRING;
5043 		    }
5044 		} else {		/* Non-zero len */
5045 		    if (flags & SCF_DO_STCLASS_OR) {
5046 			ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5047 			ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5048 		    }
5049 		    else if (flags & SCF_DO_STCLASS_AND)
5050 			ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5051 		    flags &= ~SCF_DO_STCLASS;
5052 		}
5053 		if (!scan) 		/* It was not CURLYX, but CURLY. */
5054 		    scan = next;
5055 		if (!(flags & SCF_TRIE_DOING_RESTUDY)
5056 		    /* ? quantifier ok, except for (?{ ... }) */
5057 		    && (next_is_eval || !(mincount == 0 && maxcount == 1))
5058 		    && (minnext == 0) && (deltanext == 0)
5059 		    && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
5060                     && maxcount <= REG_INFTY/3) /* Complement check for big
5061                                                    count */
5062 		{
5063 		    /* Fatal warnings may leak the regexp without this: */
5064 		    SAVEFREESV(RExC_rx_sv);
5065 		    Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
5066 			"Quantifier unexpected on zero-length expression "
5067 			"in regex m/%"UTF8f"/",
5068 			 UTF8fARG(UTF, RExC_precomp_end - RExC_precomp,
5069 				  RExC_precomp));
5070 		    (void)ReREFCNT_inc(RExC_rx_sv);
5071 		}
5072 
5073 		min += minnext * mincount;
5074 		is_inf_internal |= deltanext == SSize_t_MAX
5075                          || (maxcount == REG_INFTY && minnext + deltanext > 0);
5076 		is_inf |= is_inf_internal;
5077                 if (is_inf) {
5078 		    delta = SSize_t_MAX;
5079                 } else {
5080 		    delta += (minnext + deltanext) * maxcount
5081                              - minnext * mincount;
5082                 }
5083 		/* Try powerful optimization CURLYX => CURLYN. */
5084 		if (  OP(oscan) == CURLYX && data
5085 		      && data->flags & SF_IN_PAR
5086 		      && !(data->flags & SF_HAS_EVAL)
5087 		      && !deltanext && minnext == 1 ) {
5088 		    /* Try to optimize to CURLYN.  */
5089 		    regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
5090 		    regnode * const nxt1 = nxt;
5091 #ifdef DEBUGGING
5092 		    regnode *nxt2;
5093 #endif
5094 
5095 		    /* Skip open. */
5096 		    nxt = regnext(nxt);
5097 		    if (!REGNODE_SIMPLE(OP(nxt))
5098 			&& !(PL_regkind[OP(nxt)] == EXACT
5099 			     && STR_LEN(nxt) == 1))
5100 			goto nogo;
5101 #ifdef DEBUGGING
5102 		    nxt2 = nxt;
5103 #endif
5104 		    nxt = regnext(nxt);
5105 		    if (OP(nxt) != CLOSE)
5106 			goto nogo;
5107 		    if (RExC_open_parens) {
5108                         RExC_open_parens[ARG(nxt1)]=oscan; /*open->CURLYM*/
5109                         RExC_close_parens[ARG(nxt1)]=nxt+2; /*close->while*/
5110 		    }
5111 		    /* Now we know that nxt2 is the only contents: */
5112 		    oscan->flags = (U8)ARG(nxt);
5113 		    OP(oscan) = CURLYN;
5114 		    OP(nxt1) = NOTHING;	/* was OPEN. */
5115 
5116 #ifdef DEBUGGING
5117 		    OP(nxt1 + 1) = OPTIMIZED; /* was count. */
5118 		    NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
5119 		    NEXT_OFF(nxt2) = 0;	/* just for consistency with CURLY. */
5120 		    OP(nxt) = OPTIMIZED;	/* was CLOSE. */
5121 		    OP(nxt + 1) = OPTIMIZED; /* was count. */
5122 		    NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
5123 #endif
5124 		}
5125 	      nogo:
5126 
5127 		/* Try optimization CURLYX => CURLYM. */
5128 		if (  OP(oscan) == CURLYX && data
5129 		      && !(data->flags & SF_HAS_PAR)
5130 		      && !(data->flags & SF_HAS_EVAL)
5131 		      && !deltanext	/* atom is fixed width */
5132 		      && minnext != 0	/* CURLYM can't handle zero width */
5133 
5134                          /* Nor characters whose fold at run-time may be
5135                           * multi-character */
5136                       && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
5137 		) {
5138 		    /* XXXX How to optimize if data == 0? */
5139 		    /* Optimize to a simpler form.  */
5140 		    regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
5141 		    regnode *nxt2;
5142 
5143 		    OP(oscan) = CURLYM;
5144 		    while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
5145 			    && (OP(nxt2) != WHILEM))
5146 			nxt = nxt2;
5147 		    OP(nxt2)  = SUCCEED; /* Whas WHILEM */
5148 		    /* Need to optimize away parenths. */
5149 		    if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
5150 			/* Set the parenth number.  */
5151 			regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
5152 
5153 			oscan->flags = (U8)ARG(nxt);
5154 			if (RExC_open_parens) {
5155                             RExC_open_parens[ARG(nxt1)]=oscan; /*open->CURLYM*/
5156                             RExC_close_parens[ARG(nxt1)]=nxt2+1; /*close->NOTHING*/
5157 			}
5158 			OP(nxt1) = OPTIMIZED;	/* was OPEN. */
5159 			OP(nxt) = OPTIMIZED;	/* was CLOSE. */
5160 
5161 #ifdef DEBUGGING
5162 			OP(nxt1 + 1) = OPTIMIZED; /* was count. */
5163 			OP(nxt + 1) = OPTIMIZED; /* was count. */
5164 			NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
5165 			NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
5166 #endif
5167 #if 0
5168 			while ( nxt1 && (OP(nxt1) != WHILEM)) {
5169 			    regnode *nnxt = regnext(nxt1);
5170 			    if (nnxt == nxt) {
5171 				if (reg_off_by_arg[OP(nxt1)])
5172 				    ARG_SET(nxt1, nxt2 - nxt1);
5173 				else if (nxt2 - nxt1 < U16_MAX)
5174 				    NEXT_OFF(nxt1) = nxt2 - nxt1;
5175 				else
5176 				    OP(nxt) = NOTHING;	/* Cannot beautify */
5177 			    }
5178 			    nxt1 = nnxt;
5179 			}
5180 #endif
5181 			/* Optimize again: */
5182 			study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
5183                                     NULL, stopparen, recursed_depth, NULL, 0,depth+1);
5184 		    }
5185 		    else
5186 			oscan->flags = 0;
5187 		}
5188 		else if ((OP(oscan) == CURLYX)
5189 			 && (flags & SCF_WHILEM_VISITED_POS)
5190 			 /* See the comment on a similar expression above.
5191 			    However, this time it's not a subexpression
5192 			    we care about, but the expression itself. */
5193 			 && (maxcount == REG_INFTY)
5194 			 && data && ++data->whilem_c < 16) {
5195 		    /* This stays as CURLYX, we can put the count/of pair. */
5196 		    /* Find WHILEM (as in regexec.c) */
5197 		    regnode *nxt = oscan + NEXT_OFF(oscan);
5198 
5199 		    if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
5200 			nxt += ARG(nxt);
5201 		    PREVOPER(nxt)->flags = (U8)(data->whilem_c
5202 			| (RExC_whilem_seen << 4)); /* On WHILEM */
5203 		}
5204 		if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
5205 		    pars++;
5206 		if (flags & SCF_DO_SUBSTR) {
5207 		    SV *last_str = NULL;
5208                     STRLEN last_chrs = 0;
5209 		    int counted = mincount != 0;
5210 
5211                     if (data->last_end > 0 && mincount != 0) { /* Ends with a
5212                                                                   string. */
5213 			SSize_t b = pos_before >= data->last_start_min
5214 			    ? pos_before : data->last_start_min;
5215 			STRLEN l;
5216 			const char * const s = SvPV_const(data->last_found, l);
5217 			SSize_t old = b - data->last_start_min;
5218 
5219 			if (UTF)
5220 			    old = utf8_hop((U8*)s, old) - (U8*)s;
5221 			l -= old;
5222 			/* Get the added string: */
5223 			last_str = newSVpvn_utf8(s  + old, l, UTF);
5224                         last_chrs = UTF ? utf8_length((U8*)(s + old),
5225                                             (U8*)(s + old + l)) : l;
5226 			if (deltanext == 0 && pos_before == b) {
5227 			    /* What was added is a constant string */
5228 			    if (mincount > 1) {
5229 
5230 				SvGROW(last_str, (mincount * l) + 1);
5231 				repeatcpy(SvPVX(last_str) + l,
5232 					  SvPVX_const(last_str), l,
5233                                           mincount - 1);
5234 				SvCUR_set(last_str, SvCUR(last_str) * mincount);
5235 				/* Add additional parts. */
5236 				SvCUR_set(data->last_found,
5237 					  SvCUR(data->last_found) - l);
5238 				sv_catsv(data->last_found, last_str);
5239 				{
5240 				    SV * sv = data->last_found;
5241 				    MAGIC *mg =
5242 					SvUTF8(sv) && SvMAGICAL(sv) ?
5243 					mg_find(sv, PERL_MAGIC_utf8) : NULL;
5244 				    if (mg && mg->mg_len >= 0)
5245 					mg->mg_len += last_chrs * (mincount-1);
5246 				}
5247                                 last_chrs *= mincount;
5248 				data->last_end += l * (mincount - 1);
5249 			    }
5250 			} else {
5251 			    /* start offset must point into the last copy */
5252 			    data->last_start_min += minnext * (mincount - 1);
5253 			    data->last_start_max =
5254                               is_inf
5255                                ? SSize_t_MAX
5256 			       : data->last_start_max +
5257                                  (maxcount - 1) * (minnext + data->pos_delta);
5258 			}
5259 		    }
5260 		    /* It is counted once already... */
5261 		    data->pos_min += minnext * (mincount - counted);
5262 #if 0
5263 Perl_re_printf( aTHX_  "counted=%"UVuf" deltanext=%"UVuf
5264                               " SSize_t_MAX=%"UVuf" minnext=%"UVuf
5265                               " maxcount=%"UVuf" mincount=%"UVuf"\n",
5266     (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
5267     (UV)mincount);
5268 if (deltanext != SSize_t_MAX)
5269 Perl_re_printf( aTHX_  "LHS=%"UVuf" RHS=%"UVuf"\n",
5270     (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
5271           - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
5272 #endif
5273 		    if (deltanext == SSize_t_MAX
5274                         || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
5275 		        data->pos_delta = SSize_t_MAX;
5276 		    else
5277 		        data->pos_delta += - counted * deltanext +
5278 			(minnext + deltanext) * maxcount - minnext * mincount;
5279 		    if (mincount != maxcount) {
5280 			 /* Cannot extend fixed substrings found inside
5281 			    the group.  */
5282                         scan_commit(pRExC_state, data, minlenp, is_inf);
5283 			if (mincount && last_str) {
5284 			    SV * const sv = data->last_found;
5285 			    MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
5286 				mg_find(sv, PERL_MAGIC_utf8) : NULL;
5287 
5288 			    if (mg)
5289 				mg->mg_len = -1;
5290 			    sv_setsv(sv, last_str);
5291 			    data->last_end = data->pos_min;
5292 			    data->last_start_min = data->pos_min - last_chrs;
5293 			    data->last_start_max = is_inf
5294 				? SSize_t_MAX
5295 				: data->pos_min + data->pos_delta - last_chrs;
5296 			}
5297 			data->longest = &(data->longest_float);
5298 		    }
5299 		    SvREFCNT_dec(last_str);
5300 		}
5301 		if (data && (fl & SF_HAS_EVAL))
5302 		    data->flags |= SF_HAS_EVAL;
5303 	      optimize_curly_tail:
5304 		if (OP(oscan) != CURLYX) {
5305 		    while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
5306 			   && NEXT_OFF(next))
5307 			NEXT_OFF(oscan) += NEXT_OFF(next);
5308 		}
5309 		continue;
5310 
5311 	    default:
5312 #ifdef DEBUGGING
5313                 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
5314                                                                     OP(scan));
5315 #endif
5316             case REF:
5317             case CLUMP:
5318 		if (flags & SCF_DO_SUBSTR) {
5319                     /* Cannot expect anything... */
5320                     scan_commit(pRExC_state, data, minlenp, is_inf);
5321 		    data->longest = &(data->longest_float);
5322 		}
5323 		is_inf = is_inf_internal = 1;
5324 		if (flags & SCF_DO_STCLASS_OR) {
5325                     if (OP(scan) == CLUMP) {
5326                         /* Actually is any start char, but very few code points
5327                          * aren't start characters */
5328                         ssc_match_all_cp(data->start_class);
5329                     }
5330                     else {
5331                         ssc_anything(data->start_class);
5332                     }
5333                 }
5334 		flags &= ~SCF_DO_STCLASS;
5335 		break;
5336 	    }
5337 	}
5338 	else if (OP(scan) == LNBREAK) {
5339 	    if (flags & SCF_DO_STCLASS) {
5340     	        if (flags & SCF_DO_STCLASS_AND) {
5341                     ssc_intersection(data->start_class,
5342                                     PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
5343                     ssc_clear_locale(data->start_class);
5344                     ANYOF_FLAGS(data->start_class)
5345                                                 &= ~SSC_MATCHES_EMPTY_STRING;
5346                 }
5347                 else if (flags & SCF_DO_STCLASS_OR) {
5348                     ssc_union(data->start_class,
5349                               PL_XPosix_ptrs[_CC_VERTSPACE],
5350                               FALSE);
5351 		    ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5352 
5353                     /* See commit msg for
5354                      * 749e076fceedeb708a624933726e7989f2302f6a */
5355                     ANYOF_FLAGS(data->start_class)
5356                                                 &= ~SSC_MATCHES_EMPTY_STRING;
5357                 }
5358 		flags &= ~SCF_DO_STCLASS;
5359             }
5360 	    min++;
5361             if (delta != SSize_t_MAX)
5362                 delta++;    /* Because of the 2 char string cr-lf */
5363             if (flags & SCF_DO_SUBSTR) {
5364                 /* Cannot expect anything... */
5365                 scan_commit(pRExC_state, data, minlenp, is_inf);
5366     	        data->pos_min += 1;
5367 	        data->pos_delta += 1;
5368 		data->longest = &(data->longest_float);
5369     	    }
5370 	}
5371 	else if (REGNODE_SIMPLE(OP(scan))) {
5372 
5373 	    if (flags & SCF_DO_SUBSTR) {
5374                 scan_commit(pRExC_state, data, minlenp, is_inf);
5375 		data->pos_min++;
5376 	    }
5377 	    min++;
5378 	    if (flags & SCF_DO_STCLASS) {
5379                 bool invert = 0;
5380                 SV* my_invlist = NULL;
5381                 U8 namedclass;
5382 
5383                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5384                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5385 
5386 		/* Some of the logic below assumes that switching
5387 		   locale on will only add false positives. */
5388 		switch (OP(scan)) {
5389 
5390 		default:
5391 #ifdef DEBUGGING
5392                    Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
5393                                                                      OP(scan));
5394 #endif
5395 		case SANY:
5396 		    if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5397 			ssc_match_all_cp(data->start_class);
5398 		    break;
5399 
5400 		case REG_ANY:
5401                     {
5402                         SV* REG_ANY_invlist = _new_invlist(2);
5403                         REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
5404                                                             '\n');
5405                         if (flags & SCF_DO_STCLASS_OR) {
5406                             ssc_union(data->start_class,
5407                                       REG_ANY_invlist,
5408                                       TRUE /* TRUE => invert, hence all but \n
5409                                             */
5410                                       );
5411                         }
5412                         else if (flags & SCF_DO_STCLASS_AND) {
5413                             ssc_intersection(data->start_class,
5414                                              REG_ANY_invlist,
5415                                              TRUE  /* TRUE => invert */
5416                                              );
5417                             ssc_clear_locale(data->start_class);
5418                         }
5419                         SvREFCNT_dec_NN(REG_ANY_invlist);
5420 		    }
5421 		    break;
5422 
5423                 case ANYOFD:
5424                 case ANYOFL:
5425                 case ANYOF:
5426 		    if (flags & SCF_DO_STCLASS_AND)
5427 			ssc_and(pRExC_state, data->start_class,
5428                                 (regnode_charclass *) scan);
5429 		    else
5430 			ssc_or(pRExC_state, data->start_class,
5431                                                           (regnode_charclass *) scan);
5432 		    break;
5433 
5434 		case NPOSIXL:
5435                     invert = 1;
5436                     /* FALLTHROUGH */
5437 
5438 		case POSIXL:
5439                     namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
5440                     if (flags & SCF_DO_STCLASS_AND) {
5441                         bool was_there = cBOOL(
5442                                           ANYOF_POSIXL_TEST(data->start_class,
5443                                                                  namedclass));
5444                         ANYOF_POSIXL_ZERO(data->start_class);
5445                         if (was_there) {    /* Do an AND */
5446                             ANYOF_POSIXL_SET(data->start_class, namedclass);
5447                         }
5448                         /* No individual code points can now match */
5449                         data->start_class->invlist
5450                                                 = sv_2mortal(_new_invlist(0));
5451                     }
5452                     else {
5453                         int complement = namedclass + ((invert) ? -1 : 1);
5454 
5455                         assert(flags & SCF_DO_STCLASS_OR);
5456 
5457                         /* If the complement of this class was already there,
5458                          * the result is that they match all code points,
5459                          * (\d + \D == everything).  Remove the classes from
5460                          * future consideration.  Locale is not relevant in
5461                          * this case */
5462                         if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
5463                             ssc_match_all_cp(data->start_class);
5464                             ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
5465                             ANYOF_POSIXL_CLEAR(data->start_class, complement);
5466                         }
5467                         else {  /* The usual case; just add this class to the
5468                                    existing set */
5469                             ANYOF_POSIXL_SET(data->start_class, namedclass);
5470                         }
5471                     }
5472                     break;
5473 
5474                 case NPOSIXA:   /* For these, we always know the exact set of
5475                                    what's matched */
5476                     invert = 1;
5477                     /* FALLTHROUGH */
5478 		case POSIXA:
5479                     if (FLAGS(scan) == _CC_ASCII) {
5480                         my_invlist = invlist_clone(PL_XPosix_ptrs[_CC_ASCII]);
5481                     }
5482                     else {
5483                         _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)],
5484                                               PL_XPosix_ptrs[_CC_ASCII],
5485                                               &my_invlist);
5486                     }
5487                     goto join_posix;
5488 
5489 		case NPOSIXD:
5490 		case NPOSIXU:
5491                     invert = 1;
5492                     /* FALLTHROUGH */
5493 		case POSIXD:
5494 		case POSIXU:
5495                     my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]);
5496 
5497                     /* NPOSIXD matches all upper Latin1 code points unless the
5498                      * target string being matched is UTF-8, which is
5499                      * unknowable until match time.  Since we are going to
5500                      * invert, we want to get rid of all of them so that the
5501                      * inversion will match all */
5502                     if (OP(scan) == NPOSIXD) {
5503                         _invlist_subtract(my_invlist, PL_UpperLatin1,
5504                                           &my_invlist);
5505                     }
5506 
5507                   join_posix:
5508 
5509                     if (flags & SCF_DO_STCLASS_AND) {
5510                         ssc_intersection(data->start_class, my_invlist, invert);
5511                         ssc_clear_locale(data->start_class);
5512                     }
5513                     else {
5514                         assert(flags & SCF_DO_STCLASS_OR);
5515                         ssc_union(data->start_class, my_invlist, invert);
5516                     }
5517                     SvREFCNT_dec(my_invlist);
5518 		}
5519 		if (flags & SCF_DO_STCLASS_OR)
5520 		    ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5521 		flags &= ~SCF_DO_STCLASS;
5522 	    }
5523 	}
5524 	else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
5525 	    data->flags |= (OP(scan) == MEOL
5526 			    ? SF_BEFORE_MEOL
5527 			    : SF_BEFORE_SEOL);
5528             scan_commit(pRExC_state, data, minlenp, is_inf);
5529 
5530 	}
5531 	else if (  PL_regkind[OP(scan)] == BRANCHJ
5532 		 /* Lookbehind, or need to calculate parens/evals/stclass: */
5533 		   && (scan->flags || data || (flags & SCF_DO_STCLASS))
5534 		   && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
5535         {
5536             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5537                 || OP(scan) == UNLESSM )
5538             {
5539                 /* Negative Lookahead/lookbehind
5540                    In this case we can't do fixed string optimisation.
5541                 */
5542 
5543                 SSize_t deltanext, minnext, fake = 0;
5544                 regnode *nscan;
5545                 regnode_ssc intrnl;
5546                 int f = 0;
5547 
5548                 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
5549                 if (data) {
5550                     data_fake.whilem_c = data->whilem_c;
5551                     data_fake.last_closep = data->last_closep;
5552 		}
5553                 else
5554                     data_fake.last_closep = &fake;
5555 		data_fake.pos_delta = delta;
5556                 if ( flags & SCF_DO_STCLASS && !scan->flags
5557                      && OP(scan) == IFMATCH ) { /* Lookahead */
5558                     ssc_init(pRExC_state, &intrnl);
5559                     data_fake.start_class = &intrnl;
5560                     f |= SCF_DO_STCLASS_AND;
5561 		}
5562                 if (flags & SCF_WHILEM_VISITED_POS)
5563                     f |= SCF_WHILEM_VISITED_POS;
5564                 next = regnext(scan);
5565                 nscan = NEXTOPER(NEXTOPER(scan));
5566                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
5567                                       last, &data_fake, stopparen,
5568                                       recursed_depth, NULL, f, depth+1);
5569                 if (scan->flags) {
5570                     if (deltanext) {
5571 			FAIL("Variable length lookbehind not implemented");
5572                     }
5573                     else if (minnext > (I32)U8_MAX) {
5574 			FAIL2("Lookbehind longer than %"UVuf" not implemented",
5575                               (UV)U8_MAX);
5576                     }
5577                     scan->flags = (U8)minnext;
5578                 }
5579                 if (data) {
5580                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5581                         pars++;
5582                     if (data_fake.flags & SF_HAS_EVAL)
5583                         data->flags |= SF_HAS_EVAL;
5584                     data->whilem_c = data_fake.whilem_c;
5585                 }
5586                 if (f & SCF_DO_STCLASS_AND) {
5587 		    if (flags & SCF_DO_STCLASS_OR) {
5588 			/* OR before, AND after: ideally we would recurse with
5589 			 * data_fake to get the AND applied by study of the
5590 			 * remainder of the pattern, and then derecurse;
5591 			 * *** HACK *** for now just treat as "no information".
5592 			 * See [perl #56690].
5593 			 */
5594 			ssc_init(pRExC_state, data->start_class);
5595 		    }  else {
5596                         /* AND before and after: combine and continue.  These
5597                          * assertions are zero-length, so can match an EMPTY
5598                          * string */
5599 			ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5600                         ANYOF_FLAGS(data->start_class)
5601                                                    |= SSC_MATCHES_EMPTY_STRING;
5602 		    }
5603                 }
5604 	    }
5605 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5606             else {
5607                 /* Positive Lookahead/lookbehind
5608                    In this case we can do fixed string optimisation,
5609                    but we must be careful about it. Note in the case of
5610                    lookbehind the positions will be offset by the minimum
5611                    length of the pattern, something we won't know about
5612                    until after the recurse.
5613                 */
5614                 SSize_t deltanext, fake = 0;
5615                 regnode *nscan;
5616                 regnode_ssc intrnl;
5617                 int f = 0;
5618                 /* We use SAVEFREEPV so that when the full compile
5619                     is finished perl will clean up the allocated
5620                     minlens when it's all done. This way we don't
5621                     have to worry about freeing them when we know
5622                     they wont be used, which would be a pain.
5623                  */
5624                 SSize_t *minnextp;
5625                 Newx( minnextp, 1, SSize_t );
5626                 SAVEFREEPV(minnextp);
5627 
5628                 if (data) {
5629                     StructCopy(data, &data_fake, scan_data_t);
5630                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
5631                         f |= SCF_DO_SUBSTR;
5632                         if (scan->flags)
5633                             scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
5634                         data_fake.last_found=newSVsv(data->last_found);
5635                     }
5636                 }
5637                 else
5638                     data_fake.last_closep = &fake;
5639                 data_fake.flags = 0;
5640 		data_fake.pos_delta = delta;
5641                 if (is_inf)
5642 	            data_fake.flags |= SF_IS_INF;
5643                 if ( flags & SCF_DO_STCLASS && !scan->flags
5644                      && OP(scan) == IFMATCH ) { /* Lookahead */
5645                     ssc_init(pRExC_state, &intrnl);
5646                     data_fake.start_class = &intrnl;
5647                     f |= SCF_DO_STCLASS_AND;
5648                 }
5649                 if (flags & SCF_WHILEM_VISITED_POS)
5650                     f |= SCF_WHILEM_VISITED_POS;
5651                 next = regnext(scan);
5652                 nscan = NEXTOPER(NEXTOPER(scan));
5653 
5654                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
5655                                         &deltanext, last, &data_fake,
5656                                         stopparen, recursed_depth, NULL,
5657                                         f,depth+1);
5658                 if (scan->flags) {
5659                     if (deltanext) {
5660 			FAIL("Variable length lookbehind not implemented");
5661                     }
5662                     else if (*minnextp > (I32)U8_MAX) {
5663 			FAIL2("Lookbehind longer than %"UVuf" not implemented",
5664                               (UV)U8_MAX);
5665                     }
5666                     scan->flags = (U8)*minnextp;
5667                 }
5668 
5669                 *minnextp += min;
5670 
5671                 if (f & SCF_DO_STCLASS_AND) {
5672                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5673                     ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING;
5674                 }
5675                 if (data) {
5676                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5677                         pars++;
5678                     if (data_fake.flags & SF_HAS_EVAL)
5679                         data->flags |= SF_HAS_EVAL;
5680                     data->whilem_c = data_fake.whilem_c;
5681                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
5682                         if (RExC_rx->minlen<*minnextp)
5683                             RExC_rx->minlen=*minnextp;
5684                         scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
5685                         SvREFCNT_dec_NN(data_fake.last_found);
5686 
5687                         if ( data_fake.minlen_fixed != minlenp )
5688                         {
5689                             data->offset_fixed= data_fake.offset_fixed;
5690                             data->minlen_fixed= data_fake.minlen_fixed;
5691                             data->lookbehind_fixed+= scan->flags;
5692                         }
5693                         if ( data_fake.minlen_float != minlenp )
5694                         {
5695                             data->minlen_float= data_fake.minlen_float;
5696                             data->offset_float_min=data_fake.offset_float_min;
5697                             data->offset_float_max=data_fake.offset_float_max;
5698                             data->lookbehind_float+= scan->flags;
5699                         }
5700                     }
5701                 }
5702 	    }
5703 #endif
5704 	}
5705 	else if (OP(scan) == OPEN) {
5706 	    if (stopparen != (I32)ARG(scan))
5707 	        pars++;
5708 	}
5709 	else if (OP(scan) == CLOSE) {
5710 	    if (stopparen == (I32)ARG(scan)) {
5711 	        break;
5712 	    }
5713 	    if ((I32)ARG(scan) == is_par) {
5714 		next = regnext(scan);
5715 
5716 		if ( next && (OP(next) != WHILEM) && next < last)
5717 		    is_par = 0;		/* Disable optimization */
5718 	    }
5719 	    if (data)
5720 		*(data->last_closep) = ARG(scan);
5721 	}
5722 	else if (OP(scan) == EVAL) {
5723 		if (data)
5724 		    data->flags |= SF_HAS_EVAL;
5725 	}
5726 	else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
5727 	    if (flags & SCF_DO_SUBSTR) {
5728                 scan_commit(pRExC_state, data, minlenp, is_inf);
5729 		flags &= ~SCF_DO_SUBSTR;
5730 	    }
5731 	    if (data && OP(scan)==ACCEPT) {
5732 	        data->flags |= SCF_SEEN_ACCEPT;
5733 	        if (stopmin > min)
5734 	            stopmin = min;
5735 	    }
5736 	}
5737 	else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
5738 	{
5739 		if (flags & SCF_DO_SUBSTR) {
5740                     scan_commit(pRExC_state, data, minlenp, is_inf);
5741 		    data->longest = &(data->longest_float);
5742 		}
5743 		is_inf = is_inf_internal = 1;
5744 		if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5745 		    ssc_anything(data->start_class);
5746 		flags &= ~SCF_DO_STCLASS;
5747 	}
5748 	else if (OP(scan) == GPOS) {
5749             if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
5750 	        !(delta || is_inf || (data && data->pos_delta)))
5751 	    {
5752                 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
5753                     RExC_rx->intflags |= PREGf_ANCH_GPOS;
5754 	        if (RExC_rx->gofs < (STRLEN)min)
5755 		    RExC_rx->gofs = min;
5756             } else {
5757                 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
5758                 RExC_rx->gofs = 0;
5759             }
5760 	}
5761 #ifdef TRIE_STUDY_OPT
5762 #ifdef FULL_TRIE_STUDY
5763         else if (PL_regkind[OP(scan)] == TRIE) {
5764             /* NOTE - There is similar code to this block above for handling
5765                BRANCH nodes on the initial study.  If you change stuff here
5766                check there too. */
5767             regnode *trie_node= scan;
5768             regnode *tail= regnext(scan);
5769             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5770             SSize_t max1 = 0, min1 = SSize_t_MAX;
5771             regnode_ssc accum;
5772 
5773             if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
5774                 /* Cannot merge strings after this. */
5775                 scan_commit(pRExC_state, data, minlenp, is_inf);
5776             }
5777             if (flags & SCF_DO_STCLASS)
5778                 ssc_init_zero(pRExC_state, &accum);
5779 
5780             if (!trie->jump) {
5781                 min1= trie->minlen;
5782                 max1= trie->maxlen;
5783             } else {
5784                 const regnode *nextbranch= NULL;
5785                 U32 word;
5786 
5787                 for ( word=1 ; word <= trie->wordcount ; word++)
5788                 {
5789                     SSize_t deltanext=0, minnext=0, f = 0, fake;
5790                     regnode_ssc this_class;
5791 
5792                     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
5793                     if (data) {
5794                         data_fake.whilem_c = data->whilem_c;
5795                         data_fake.last_closep = data->last_closep;
5796                     }
5797                     else
5798                         data_fake.last_closep = &fake;
5799 		    data_fake.pos_delta = delta;
5800                     if (flags & SCF_DO_STCLASS) {
5801                         ssc_init(pRExC_state, &this_class);
5802                         data_fake.start_class = &this_class;
5803                         f = SCF_DO_STCLASS_AND;
5804                     }
5805                     if (flags & SCF_WHILEM_VISITED_POS)
5806                         f |= SCF_WHILEM_VISITED_POS;
5807 
5808                     if (trie->jump[word]) {
5809                         if (!nextbranch)
5810                             nextbranch = trie_node + trie->jump[0];
5811                         scan= trie_node + trie->jump[word];
5812                         /* We go from the jump point to the branch that follows
5813                            it. Note this means we need the vestigal unused
5814                            branches even though they arent otherwise used. */
5815                         minnext = study_chunk(pRExC_state, &scan, minlenp,
5816                             &deltanext, (regnode *)nextbranch, &data_fake,
5817                             stopparen, recursed_depth, NULL, f,depth+1);
5818                     }
5819                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
5820                         nextbranch= regnext((regnode*)nextbranch);
5821 
5822                     if (min1 > (SSize_t)(minnext + trie->minlen))
5823                         min1 = minnext + trie->minlen;
5824                     if (deltanext == SSize_t_MAX) {
5825                         is_inf = is_inf_internal = 1;
5826                         max1 = SSize_t_MAX;
5827                     } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
5828                         max1 = minnext + deltanext + trie->maxlen;
5829 
5830                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5831                         pars++;
5832                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
5833                         if ( stopmin > min + min1)
5834 	                    stopmin = min + min1;
5835 	                flags &= ~SCF_DO_SUBSTR;
5836 	                if (data)
5837 	                    data->flags |= SCF_SEEN_ACCEPT;
5838 	            }
5839                     if (data) {
5840                         if (data_fake.flags & SF_HAS_EVAL)
5841                             data->flags |= SF_HAS_EVAL;
5842                         data->whilem_c = data_fake.whilem_c;
5843                     }
5844                     if (flags & SCF_DO_STCLASS)
5845                         ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
5846                 }
5847             }
5848             if (flags & SCF_DO_SUBSTR) {
5849                 data->pos_min += min1;
5850                 data->pos_delta += max1 - min1;
5851                 if (max1 != min1 || is_inf)
5852                     data->longest = &(data->longest_float);
5853             }
5854             min += min1;
5855             if (delta != SSize_t_MAX)
5856                 delta += max1 - min1;
5857             if (flags & SCF_DO_STCLASS_OR) {
5858                 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5859                 if (min1) {
5860                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5861                     flags &= ~SCF_DO_STCLASS;
5862                 }
5863             }
5864             else if (flags & SCF_DO_STCLASS_AND) {
5865                 if (min1) {
5866                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5867                     flags &= ~SCF_DO_STCLASS;
5868                 }
5869                 else {
5870                     /* Switch to OR mode: cache the old value of
5871                      * data->start_class */
5872 		    INIT_AND_WITHP;
5873                     StructCopy(data->start_class, and_withp, regnode_ssc);
5874                     flags &= ~SCF_DO_STCLASS_AND;
5875                     StructCopy(&accum, data->start_class, regnode_ssc);
5876                     flags |= SCF_DO_STCLASS_OR;
5877                 }
5878             }
5879             scan= tail;
5880             continue;
5881         }
5882 #else
5883 	else if (PL_regkind[OP(scan)] == TRIE) {
5884 	    reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5885 	    U8*bang=NULL;
5886 
5887 	    min += trie->minlen;
5888 	    delta += (trie->maxlen - trie->minlen);
5889 	    flags &= ~SCF_DO_STCLASS; /* xxx */
5890             if (flags & SCF_DO_SUBSTR) {
5891                 /* Cannot expect anything... */
5892                 scan_commit(pRExC_state, data, minlenp, is_inf);
5893     	        data->pos_min += trie->minlen;
5894     	        data->pos_delta += (trie->maxlen - trie->minlen);
5895 		if (trie->maxlen != trie->minlen)
5896 		    data->longest = &(data->longest_float);
5897     	    }
5898     	    if (trie->jump) /* no more substrings -- for now /grr*/
5899                flags &= ~SCF_DO_SUBSTR;
5900 	}
5901 #endif /* old or new */
5902 #endif /* TRIE_STUDY_OPT */
5903 
5904 	/* Else: zero-length, ignore. */
5905 	scan = regnext(scan);
5906     }
5907 
5908   finish:
5909     if (frame) {
5910         /* we need to unwind recursion. */
5911         depth = depth - 1;
5912 
5913         DEBUG_STUDYDATA("frame-end:",data,depth);
5914         DEBUG_PEEP("fend", scan, depth);
5915 
5916         /* restore previous context */
5917         last = frame->last_regnode;
5918         scan = frame->next_regnode;
5919         stopparen = frame->stopparen;
5920         recursed_depth = frame->prev_recursed_depth;
5921 
5922         RExC_frame_last = frame->prev_frame;
5923         frame = frame->this_prev_frame;
5924         goto fake_study_recurse;
5925     }
5926 
5927     assert(!frame);
5928     DEBUG_STUDYDATA("pre-fin:",data,depth);
5929 
5930     *scanp = scan;
5931     *deltap = is_inf_internal ? SSize_t_MAX : delta;
5932 
5933     if (flags & SCF_DO_SUBSTR && is_inf)
5934 	data->pos_delta = SSize_t_MAX - data->pos_min;
5935     if (is_par > (I32)U8_MAX)
5936 	is_par = 0;
5937     if (is_par && pars==1 && data) {
5938 	data->flags |= SF_IN_PAR;
5939 	data->flags &= ~SF_HAS_PAR;
5940     }
5941     else if (pars && data) {
5942 	data->flags |= SF_HAS_PAR;
5943 	data->flags &= ~SF_IN_PAR;
5944     }
5945     if (flags & SCF_DO_STCLASS_OR)
5946 	ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5947     if (flags & SCF_TRIE_RESTUDY)
5948         data->flags |= 	SCF_TRIE_RESTUDY;
5949 
5950     DEBUG_STUDYDATA("post-fin:",data,depth);
5951 
5952     {
5953         SSize_t final_minlen= min < stopmin ? min : stopmin;
5954 
5955         if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
5956             if (final_minlen > SSize_t_MAX - delta)
5957                 RExC_maxlen = SSize_t_MAX;
5958             else if (RExC_maxlen < final_minlen + delta)
5959                 RExC_maxlen = final_minlen + delta;
5960         }
5961         return final_minlen;
5962     }
5963     NOT_REACHED; /* NOTREACHED */
5964 }
5965 
5966 STATIC U32
5967 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
5968 {
5969     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
5970 
5971     PERL_ARGS_ASSERT_ADD_DATA;
5972 
5973     Renewc(RExC_rxi->data,
5974 	   sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
5975 	   char, struct reg_data);
5976     if(count)
5977 	Renew(RExC_rxi->data->what, count + n, U8);
5978     else
5979 	Newx(RExC_rxi->data->what, n, U8);
5980     RExC_rxi->data->count = count + n;
5981     Copy(s, RExC_rxi->data->what + count, n, U8);
5982     return count;
5983 }
5984 
5985 /*XXX: todo make this not included in a non debugging perl, but appears to be
5986  * used anyway there, in 'use re' */
5987 #ifndef PERL_IN_XSUB_RE
5988 void
5989 Perl_reginitcolors(pTHX)
5990 {
5991     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
5992     if (s) {
5993 	char *t = savepv(s);
5994 	int i = 0;
5995 	PL_colors[0] = t;
5996 	while (++i < 6) {
5997 	    t = strchr(t, '\t');
5998 	    if (t) {
5999 		*t = '\0';
6000 		PL_colors[i] = ++t;
6001 	    }
6002 	    else
6003 		PL_colors[i] = t = (char *)"";
6004 	}
6005     } else {
6006 	int i = 0;
6007 	while (i < 6)
6008 	    PL_colors[i++] = (char *)"";
6009     }
6010     PL_colorset = 1;
6011 }
6012 #endif
6013 
6014 
6015 #ifdef TRIE_STUDY_OPT
6016 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
6017     STMT_START {                                            \
6018         if (                                                \
6019               (data.flags & SCF_TRIE_RESTUDY)               \
6020               && ! restudied++                              \
6021         ) {                                                 \
6022             dOsomething;                                    \
6023             goto reStudy;                                   \
6024         }                                                   \
6025     } STMT_END
6026 #else
6027 #define CHECK_RESTUDY_GOTO_butfirst
6028 #endif
6029 
6030 /*
6031  * pregcomp - compile a regular expression into internal code
6032  *
6033  * Decides which engine's compiler to call based on the hint currently in
6034  * scope
6035  */
6036 
6037 #ifndef PERL_IN_XSUB_RE
6038 
6039 /* return the currently in-scope regex engine (or the default if none)  */
6040 
6041 regexp_engine const *
6042 Perl_current_re_engine(pTHX)
6043 {
6044     if (IN_PERL_COMPILETIME) {
6045 	HV * const table = GvHV(PL_hintgv);
6046 	SV **ptr;
6047 
6048 	if (!table || !(PL_hints & HINT_LOCALIZE_HH))
6049 	    return &PL_core_reg_engine;
6050 	ptr = hv_fetchs(table, "regcomp", FALSE);
6051 	if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
6052 	    return &PL_core_reg_engine;
6053 	return INT2PTR(regexp_engine*,SvIV(*ptr));
6054     }
6055     else {
6056 	SV *ptr;
6057 	if (!PL_curcop->cop_hints_hash)
6058 	    return &PL_core_reg_engine;
6059 	ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
6060 	if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
6061 	    return &PL_core_reg_engine;
6062 	return INT2PTR(regexp_engine*,SvIV(ptr));
6063     }
6064 }
6065 
6066 
6067 REGEXP *
6068 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
6069 {
6070     regexp_engine const *eng = current_re_engine();
6071     GET_RE_DEBUG_FLAGS_DECL;
6072 
6073     PERL_ARGS_ASSERT_PREGCOMP;
6074 
6075     /* Dispatch a request to compile a regexp to correct regexp engine. */
6076     DEBUG_COMPILE_r({
6077         Perl_re_printf( aTHX_  "Using engine %"UVxf"\n",
6078 			PTR2UV(eng));
6079     });
6080     return CALLREGCOMP_ENG(eng, pattern, flags);
6081 }
6082 #endif
6083 
6084 /* public(ish) entry point for the perl core's own regex compiling code.
6085  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
6086  * pattern rather than a list of OPs, and uses the internal engine rather
6087  * than the current one */
6088 
6089 REGEXP *
6090 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
6091 {
6092     SV *pat = pattern; /* defeat constness! */
6093     PERL_ARGS_ASSERT_RE_COMPILE;
6094     return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
6095 #ifdef PERL_IN_XSUB_RE
6096                                 &my_reg_engine,
6097 #else
6098                                 &PL_core_reg_engine,
6099 #endif
6100                                 NULL, NULL, rx_flags, 0);
6101 }
6102 
6103 
6104 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
6105  * blocks, recalculate the indices. Update pat_p and plen_p in-place to
6106  * point to the realloced string and length.
6107  *
6108  * This is essentially a copy of Perl_bytes_to_utf8() with the code index
6109  * stuff added */
6110 
6111 static void
6112 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
6113 		    char **pat_p, STRLEN *plen_p, int num_code_blocks)
6114 {
6115     U8 *const src = (U8*)*pat_p;
6116     U8 *dst, *d;
6117     int n=0;
6118     STRLEN s = 0;
6119     bool do_end = 0;
6120     GET_RE_DEBUG_FLAGS_DECL;
6121 
6122     DEBUG_PARSE_r(Perl_re_printf( aTHX_
6123         "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
6124 
6125     Newx(dst, *plen_p * 2 + 1, U8);
6126     d = dst;
6127 
6128     while (s < *plen_p) {
6129         append_utf8_from_native_byte(src[s], &d);
6130         if (n < num_code_blocks) {
6131             if (!do_end && pRExC_state->code_blocks[n].start == s) {
6132                 pRExC_state->code_blocks[n].start = d - dst - 1;
6133                 assert(*(d - 1) == '(');
6134                 do_end = 1;
6135             }
6136             else if (do_end && pRExC_state->code_blocks[n].end == s) {
6137                 pRExC_state->code_blocks[n].end = d - dst - 1;
6138                 assert(*(d - 1) == ')');
6139                 do_end = 0;
6140                 n++;
6141             }
6142         }
6143         s++;
6144     }
6145     *d = '\0';
6146     *plen_p = d - dst;
6147     *pat_p = (char*) dst;
6148     SAVEFREEPV(*pat_p);
6149     RExC_orig_utf8 = RExC_utf8 = 1;
6150 }
6151 
6152 
6153 
6154 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
6155  * while recording any code block indices, and handling overloading,
6156  * nested qr// objects etc.  If pat is null, it will allocate a new
6157  * string, or just return the first arg, if there's only one.
6158  *
6159  * Returns the malloced/updated pat.
6160  * patternp and pat_count is the array of SVs to be concatted;
6161  * oplist is the optional list of ops that generated the SVs;
6162  * recompile_p is a pointer to a boolean that will be set if
6163  *   the regex will need to be recompiled.
6164  * delim, if non-null is an SV that will be inserted between each element
6165  */
6166 
6167 static SV*
6168 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
6169                 SV *pat, SV ** const patternp, int pat_count,
6170                 OP *oplist, bool *recompile_p, SV *delim)
6171 {
6172     SV **svp;
6173     int n = 0;
6174     bool use_delim = FALSE;
6175     bool alloced = FALSE;
6176 
6177     /* if we know we have at least two args, create an empty string,
6178      * then concatenate args to that. For no args, return an empty string */
6179     if (!pat && pat_count != 1) {
6180         pat = newSVpvs("");
6181         SAVEFREESV(pat);
6182         alloced = TRUE;
6183     }
6184 
6185     for (svp = patternp; svp < patternp + pat_count; svp++) {
6186         SV *sv;
6187         SV *rx  = NULL;
6188         STRLEN orig_patlen = 0;
6189         bool code = 0;
6190         SV *msv = use_delim ? delim : *svp;
6191         if (!msv) msv = &PL_sv_undef;
6192 
6193         /* if we've got a delimiter, we go round the loop twice for each
6194          * svp slot (except the last), using the delimiter the second
6195          * time round */
6196         if (use_delim) {
6197             svp--;
6198             use_delim = FALSE;
6199         }
6200         else if (delim)
6201             use_delim = TRUE;
6202 
6203         if (SvTYPE(msv) == SVt_PVAV) {
6204             /* we've encountered an interpolated array within
6205              * the pattern, e.g. /...@a..../. Expand the list of elements,
6206              * then recursively append elements.
6207              * The code in this block is based on S_pushav() */
6208 
6209             AV *const av = (AV*)msv;
6210             const SSize_t maxarg = AvFILL(av) + 1;
6211             SV **array;
6212 
6213             if (oplist) {
6214                 assert(oplist->op_type == OP_PADAV
6215                     || oplist->op_type == OP_RV2AV);
6216                 oplist = OpSIBLING(oplist);
6217             }
6218 
6219             if (SvRMAGICAL(av)) {
6220                 SSize_t i;
6221 
6222                 Newx(array, maxarg, SV*);
6223                 SAVEFREEPV(array);
6224                 for (i=0; i < maxarg; i++) {
6225                     SV ** const svp = av_fetch(av, i, FALSE);
6226                     array[i] = svp ? *svp : &PL_sv_undef;
6227                 }
6228             }
6229             else
6230                 array = AvARRAY(av);
6231 
6232             pat = S_concat_pat(aTHX_ pRExC_state, pat,
6233                                 array, maxarg, NULL, recompile_p,
6234                                 /* $" */
6235                                 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
6236 
6237             continue;
6238         }
6239 
6240 
6241         /* we make the assumption here that each op in the list of
6242          * op_siblings maps to one SV pushed onto the stack,
6243          * except for code blocks, with have both an OP_NULL and
6244          * and OP_CONST.
6245          * This allows us to match up the list of SVs against the
6246          * list of OPs to find the next code block.
6247          *
6248          * Note that       PUSHMARK PADSV PADSV ..
6249          * is optimised to
6250          *                 PADRANGE PADSV  PADSV  ..
6251          * so the alignment still works. */
6252 
6253         if (oplist) {
6254             if (oplist->op_type == OP_NULL
6255                 && (oplist->op_flags & OPf_SPECIAL))
6256             {
6257                 assert(n < pRExC_state->num_code_blocks);
6258                 pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
6259                 pRExC_state->code_blocks[n].block = oplist;
6260                 pRExC_state->code_blocks[n].src_regex = NULL;
6261                 n++;
6262                 code = 1;
6263                 oplist = OpSIBLING(oplist); /* skip CONST */
6264                 assert(oplist);
6265             }
6266             oplist = OpSIBLING(oplist);;
6267         }
6268 
6269 	/* apply magic and QR overloading to arg */
6270 
6271         SvGETMAGIC(msv);
6272         if (SvROK(msv) && SvAMAGIC(msv)) {
6273             SV *sv = AMG_CALLunary(msv, regexp_amg);
6274             if (sv) {
6275                 if (SvROK(sv))
6276                     sv = SvRV(sv);
6277                 if (SvTYPE(sv) != SVt_REGEXP)
6278                     Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
6279                 msv = sv;
6280             }
6281         }
6282 
6283         /* try concatenation overload ... */
6284         if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
6285                 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
6286         {
6287             sv_setsv(pat, sv);
6288             /* overloading involved: all bets are off over literal
6289              * code. Pretend we haven't seen it */
6290             pRExC_state->num_code_blocks -= n;
6291             n = 0;
6292         }
6293         else  {
6294             /* ... or failing that, try "" overload */
6295             while (SvAMAGIC(msv)
6296                     && (sv = AMG_CALLunary(msv, string_amg))
6297                     && sv != msv
6298                     &&  !(   SvROK(msv)
6299                           && SvROK(sv)
6300                           && SvRV(msv) == SvRV(sv))
6301             ) {
6302                 msv = sv;
6303                 SvGETMAGIC(msv);
6304             }
6305             if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
6306                 msv = SvRV(msv);
6307 
6308             if (pat) {
6309                 /* this is a partially unrolled
6310                  *     sv_catsv_nomg(pat, msv);
6311                  * that allows us to adjust code block indices if
6312                  * needed */
6313                 STRLEN dlen;
6314                 char *dst = SvPV_force_nomg(pat, dlen);
6315                 orig_patlen = dlen;
6316                 if (SvUTF8(msv) && !SvUTF8(pat)) {
6317                     S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
6318                     sv_setpvn(pat, dst, dlen);
6319                     SvUTF8_on(pat);
6320                 }
6321                 sv_catsv_nomg(pat, msv);
6322                 rx = msv;
6323             }
6324             else
6325                 pat = msv;
6326 
6327             if (code)
6328                 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
6329         }
6330 
6331         /* extract any code blocks within any embedded qr//'s */
6332         if (rx && SvTYPE(rx) == SVt_REGEXP
6333             && RX_ENGINE((REGEXP*)rx)->op_comp)
6334         {
6335 
6336             RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
6337             if (ri->num_code_blocks) {
6338                 int i;
6339                 /* the presence of an embedded qr// with code means
6340                  * we should always recompile: the text of the
6341                  * qr// may not have changed, but it may be a
6342                  * different closure than last time */
6343                 *recompile_p = 1;
6344                 Renew(pRExC_state->code_blocks,
6345                     pRExC_state->num_code_blocks + ri->num_code_blocks,
6346                     struct reg_code_block);
6347                 pRExC_state->num_code_blocks += ri->num_code_blocks;
6348 
6349                 for (i=0; i < ri->num_code_blocks; i++) {
6350                     struct reg_code_block *src, *dst;
6351                     STRLEN offset =  orig_patlen
6352                         + ReANY((REGEXP *)rx)->pre_prefix;
6353                     assert(n < pRExC_state->num_code_blocks);
6354                     src = &ri->code_blocks[i];
6355                     dst = &pRExC_state->code_blocks[n];
6356                     dst->start	    = src->start + offset;
6357                     dst->end	    = src->end   + offset;
6358                     dst->block	    = src->block;
6359                     dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
6360                                             src->src_regex
6361                                                 ? src->src_regex
6362                                                 : (REGEXP*)rx);
6363                     n++;
6364                 }
6365             }
6366         }
6367     }
6368     /* avoid calling magic multiple times on a single element e.g. =~ $qr */
6369     if (alloced)
6370         SvSETMAGIC(pat);
6371 
6372     return pat;
6373 }
6374 
6375 
6376 
6377 /* see if there are any run-time code blocks in the pattern.
6378  * False positives are allowed */
6379 
6380 static bool
6381 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6382 		    char *pat, STRLEN plen)
6383 {
6384     int n = 0;
6385     STRLEN s;
6386 
6387     PERL_UNUSED_CONTEXT;
6388 
6389     for (s = 0; s < plen; s++) {
6390 	if (n < pRExC_state->num_code_blocks
6391 	    && s == pRExC_state->code_blocks[n].start)
6392 	{
6393 	    s = pRExC_state->code_blocks[n].end;
6394 	    n++;
6395 	    continue;
6396 	}
6397 	/* TODO ideally should handle [..], (#..), /#.../x to reduce false
6398 	 * positives here */
6399 	if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
6400 	    (pat[s+2] == '{'
6401                 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
6402 	)
6403 	    return 1;
6404     }
6405     return 0;
6406 }
6407 
6408 /* Handle run-time code blocks. We will already have compiled any direct
6409  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
6410  * copy of it, but with any literal code blocks blanked out and
6411  * appropriate chars escaped; then feed it into
6412  *
6413  *    eval "qr'modified_pattern'"
6414  *
6415  * For example,
6416  *
6417  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
6418  *
6419  * becomes
6420  *
6421  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
6422  *
6423  * After eval_sv()-ing that, grab any new code blocks from the returned qr
6424  * and merge them with any code blocks of the original regexp.
6425  *
6426  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
6427  * instead, just save the qr and return FALSE; this tells our caller that
6428  * the original pattern needs upgrading to utf8.
6429  */
6430 
6431 static bool
6432 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6433     char *pat, STRLEN plen)
6434 {
6435     SV *qr;
6436 
6437     GET_RE_DEBUG_FLAGS_DECL;
6438 
6439     if (pRExC_state->runtime_code_qr) {
6440 	/* this is the second time we've been called; this should
6441 	 * only happen if the main pattern got upgraded to utf8
6442 	 * during compilation; re-use the qr we compiled first time
6443 	 * round (which should be utf8 too)
6444 	 */
6445 	qr = pRExC_state->runtime_code_qr;
6446 	pRExC_state->runtime_code_qr = NULL;
6447 	assert(RExC_utf8 && SvUTF8(qr));
6448     }
6449     else {
6450 	int n = 0;
6451 	STRLEN s;
6452 	char *p, *newpat;
6453 	int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
6454 	SV *sv, *qr_ref;
6455 	dSP;
6456 
6457 	/* determine how many extra chars we need for ' and \ escaping */
6458 	for (s = 0; s < plen; s++) {
6459 	    if (pat[s] == '\'' || pat[s] == '\\')
6460 		newlen++;
6461 	}
6462 
6463 	Newx(newpat, newlen, char);
6464 	p = newpat;
6465 	*p++ = 'q'; *p++ = 'r'; *p++ = '\'';
6466 
6467 	for (s = 0; s < plen; s++) {
6468 	    if (n < pRExC_state->num_code_blocks
6469 		&& s == pRExC_state->code_blocks[n].start)
6470 	    {
6471 		/* blank out literal code block */
6472 		assert(pat[s] == '(');
6473 		while (s <= pRExC_state->code_blocks[n].end) {
6474 		    *p++ = '_';
6475 		    s++;
6476 		}
6477 		s--;
6478 		n++;
6479 		continue;
6480 	    }
6481 	    if (pat[s] == '\'' || pat[s] == '\\')
6482 		*p++ = '\\';
6483 	    *p++ = pat[s];
6484 	}
6485 	*p++ = '\'';
6486 	if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
6487 	    *p++ = 'x';
6488 	*p++ = '\0';
6489 	DEBUG_COMPILE_r({
6490             Perl_re_printf( aTHX_
6491 		"%sre-parsing pattern for runtime code:%s %s\n",
6492 		PL_colors[4],PL_colors[5],newpat);
6493 	});
6494 
6495 	sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
6496 	Safefree(newpat);
6497 
6498 	ENTER;
6499 	SAVETMPS;
6500 	save_re_context();
6501 	PUSHSTACKi(PERLSI_REQUIRE);
6502         /* G_RE_REPARSING causes the toker to collapse \\ into \ when
6503          * parsing qr''; normally only q'' does this. It also alters
6504          * hints handling */
6505 	eval_sv(sv, G_SCALAR|G_RE_REPARSING);
6506 	SvREFCNT_dec_NN(sv);
6507 	SPAGAIN;
6508 	qr_ref = POPs;
6509 	PUTBACK;
6510 	{
6511 	    SV * const errsv = ERRSV;
6512 	    if (SvTRUE_NN(errsv))
6513 	    {
6514 		Safefree(pRExC_state->code_blocks);
6515                 /* use croak_sv ? */
6516 		Perl_croak_nocontext("%"SVf, SVfARG(errsv));
6517 	    }
6518 	}
6519 	assert(SvROK(qr_ref));
6520 	qr = SvRV(qr_ref);
6521 	assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
6522 	/* the leaving below frees the tmp qr_ref.
6523 	 * Give qr a life of its own */
6524 	SvREFCNT_inc(qr);
6525 	POPSTACK;
6526 	FREETMPS;
6527 	LEAVE;
6528 
6529     }
6530 
6531     if (!RExC_utf8 && SvUTF8(qr)) {
6532 	/* first time through; the pattern got upgraded; save the
6533 	 * qr for the next time through */
6534 	assert(!pRExC_state->runtime_code_qr);
6535 	pRExC_state->runtime_code_qr = qr;
6536 	return 0;
6537     }
6538 
6539 
6540     /* extract any code blocks within the returned qr//  */
6541 
6542 
6543     /* merge the main (r1) and run-time (r2) code blocks into one */
6544     {
6545 	RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
6546 	struct reg_code_block *new_block, *dst;
6547 	RExC_state_t * const r1 = pRExC_state; /* convenient alias */
6548 	int i1 = 0, i2 = 0;
6549 
6550 	if (!r2->num_code_blocks) /* we guessed wrong */
6551 	{
6552 	    SvREFCNT_dec_NN(qr);
6553 	    return 1;
6554 	}
6555 
6556 	Newx(new_block,
6557 	    r1->num_code_blocks + r2->num_code_blocks,
6558 	    struct reg_code_block);
6559 	dst = new_block;
6560 
6561 	while (    i1 < r1->num_code_blocks
6562 		|| i2 < r2->num_code_blocks)
6563 	{
6564 	    struct reg_code_block *src;
6565 	    bool is_qr = 0;
6566 
6567 	    if (i1 == r1->num_code_blocks) {
6568 		src = &r2->code_blocks[i2++];
6569 		is_qr = 1;
6570 	    }
6571 	    else if (i2 == r2->num_code_blocks)
6572 		src = &r1->code_blocks[i1++];
6573 	    else if (  r1->code_blocks[i1].start
6574 	             < r2->code_blocks[i2].start)
6575 	    {
6576 		src = &r1->code_blocks[i1++];
6577 		assert(src->end < r2->code_blocks[i2].start);
6578 	    }
6579 	    else {
6580 		assert(  r1->code_blocks[i1].start
6581 		       > r2->code_blocks[i2].start);
6582 		src = &r2->code_blocks[i2++];
6583 		is_qr = 1;
6584 		assert(src->end < r1->code_blocks[i1].start);
6585 	    }
6586 
6587 	    assert(pat[src->start] == '(');
6588 	    assert(pat[src->end]   == ')');
6589 	    dst->start	    = src->start;
6590 	    dst->end	    = src->end;
6591 	    dst->block	    = src->block;
6592 	    dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
6593 				    : src->src_regex;
6594 	    dst++;
6595 	}
6596 	r1->num_code_blocks += r2->num_code_blocks;
6597 	Safefree(r1->code_blocks);
6598 	r1->code_blocks = new_block;
6599     }
6600 
6601     SvREFCNT_dec_NN(qr);
6602     return 1;
6603 }
6604 
6605 
6606 STATIC bool
6607 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest,
6608                       SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
6609 		      SSize_t lookbehind, SSize_t offset, SSize_t *minlen,
6610                       STRLEN longest_length, bool eol, bool meol)
6611 {
6612     /* This is the common code for setting up the floating and fixed length
6613      * string data extracted from Perl_re_op_compile() below.  Returns a boolean
6614      * as to whether succeeded or not */
6615 
6616     I32 t;
6617     SSize_t ml;
6618 
6619     if (! (longest_length
6620            || (eol /* Can't have SEOL and MULTI */
6621                && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
6622           )
6623             /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
6624         || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
6625     {
6626         return FALSE;
6627     }
6628 
6629     /* copy the information about the longest from the reg_scan_data
6630         over to the program. */
6631     if (SvUTF8(sv_longest)) {
6632         *rx_utf8 = sv_longest;
6633         *rx_substr = NULL;
6634     } else {
6635         *rx_substr = sv_longest;
6636         *rx_utf8 = NULL;
6637     }
6638     /* end_shift is how many chars that must be matched that
6639         follow this item. We calculate it ahead of time as once the
6640         lookbehind offset is added in we lose the ability to correctly
6641         calculate it.*/
6642     ml = minlen ? *(minlen) : (SSize_t)longest_length;
6643     *rx_end_shift = ml - offset
6644         - longest_length + (SvTAIL(sv_longest) != 0)
6645         + lookbehind;
6646 
6647     t = (eol/* Can't have SEOL and MULTI */
6648          && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
6649     fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
6650 
6651     return TRUE;
6652 }
6653 
6654 /*
6655  * Perl_re_op_compile - the perl internal RE engine's function to compile a
6656  * regular expression into internal code.
6657  * The pattern may be passed either as:
6658  *    a list of SVs (patternp plus pat_count)
6659  *    a list of OPs (expr)
6660  * If both are passed, the SV list is used, but the OP list indicates
6661  * which SVs are actually pre-compiled code blocks
6662  *
6663  * The SVs in the list have magic and qr overloading applied to them (and
6664  * the list may be modified in-place with replacement SVs in the latter
6665  * case).
6666  *
6667  * If the pattern hasn't changed from old_re, then old_re will be
6668  * returned.
6669  *
6670  * eng is the current engine. If that engine has an op_comp method, then
6671  * handle directly (i.e. we assume that op_comp was us); otherwise, just
6672  * do the initial concatenation of arguments and pass on to the external
6673  * engine.
6674  *
6675  * If is_bare_re is not null, set it to a boolean indicating whether the
6676  * arg list reduced (after overloading) to a single bare regex which has
6677  * been returned (i.e. /$qr/).
6678  *
6679  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
6680  *
6681  * pm_flags contains the PMf_* flags, typically based on those from the
6682  * pm_flags field of the related PMOP. Currently we're only interested in
6683  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
6684  *
6685  * We can't allocate space until we know how big the compiled form will be,
6686  * but we can't compile it (and thus know how big it is) until we've got a
6687  * place to put the code.  So we cheat:  we compile it twice, once with code
6688  * generation turned off and size counting turned on, and once "for real".
6689  * This also means that we don't allocate space until we are sure that the
6690  * thing really will compile successfully, and we never have to move the
6691  * code and thus invalidate pointers into it.  (Note that it has to be in
6692  * one piece because free() must be able to free it all.) [NB: not true in perl]
6693  *
6694  * Beware that the optimization-preparation code in here knows about some
6695  * of the structure of the compiled regexp.  [I'll say.]
6696  */
6697 
6698 REGEXP *
6699 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
6700 		    OP *expr, const regexp_engine* eng, REGEXP *old_re,
6701 		     bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
6702 {
6703     REGEXP *rx;
6704     struct regexp *r;
6705     regexp_internal *ri;
6706     STRLEN plen;
6707     char *exp;
6708     regnode *scan;
6709     I32 flags;
6710     SSize_t minlen = 0;
6711     U32 rx_flags;
6712     SV *pat;
6713     SV *code_blocksv = NULL;
6714     SV** new_patternp = patternp;
6715 
6716     /* these are all flags - maybe they should be turned
6717      * into a single int with different bit masks */
6718     I32 sawlookahead = 0;
6719     I32 sawplus = 0;
6720     I32 sawopen = 0;
6721     I32 sawminmod = 0;
6722 
6723     regex_charset initial_charset = get_regex_charset(orig_rx_flags);
6724     bool recompile = 0;
6725     bool runtime_code = 0;
6726     scan_data_t data;
6727     RExC_state_t RExC_state;
6728     RExC_state_t * const pRExC_state = &RExC_state;
6729 #ifdef TRIE_STUDY_OPT
6730     int restudied = 0;
6731     RExC_state_t copyRExC_state;
6732 #endif
6733     GET_RE_DEBUG_FLAGS_DECL;
6734 
6735     PERL_ARGS_ASSERT_RE_OP_COMPILE;
6736 
6737     DEBUG_r(if (!PL_colorset) reginitcolors());
6738 
6739     /* Initialize these here instead of as-needed, as is quick and avoids
6740      * having to test them each time otherwise */
6741     if (! PL_AboveLatin1) {
6742 #ifdef DEBUGGING
6743         char * dump_len_string;
6744 #endif
6745 
6746 	PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
6747 	PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
6748 	PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
6749         PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist);
6750         PL_HasMultiCharFold =
6751                        _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist);
6752 
6753         /* This is calculated here, because the Perl program that generates the
6754          * static global ones doesn't currently have access to
6755          * NUM_ANYOF_CODE_POINTS */
6756 	PL_InBitmap = _new_invlist(2);
6757 	PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0,
6758                                                     NUM_ANYOF_CODE_POINTS - 1);
6759 #ifdef DEBUGGING
6760         dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN");
6761         if (   ! dump_len_string
6762             || ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL))
6763         {
6764             PL_dump_re_max_len = 0;
6765         }
6766 #endif
6767     }
6768 
6769     pRExC_state->warn_text = NULL;
6770     pRExC_state->code_blocks = NULL;
6771     pRExC_state->num_code_blocks = 0;
6772 
6773     if (is_bare_re)
6774 	*is_bare_re = FALSE;
6775 
6776     if (expr && (expr->op_type == OP_LIST ||
6777 		(expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
6778 	/* allocate code_blocks if needed */
6779 	OP *o;
6780 	int ncode = 0;
6781 
6782 	for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o))
6783 	    if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
6784 		ncode++; /* count of DO blocks */
6785 	if (ncode) {
6786 	    pRExC_state->num_code_blocks = ncode;
6787 	    Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
6788 	}
6789     }
6790 
6791     if (!pat_count) {
6792         /* compile-time pattern with just OP_CONSTs and DO blocks */
6793 
6794         int n;
6795         OP *o;
6796 
6797         /* find how many CONSTs there are */
6798         assert(expr);
6799         n = 0;
6800         if (expr->op_type == OP_CONST)
6801             n = 1;
6802         else
6803             for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
6804                 if (o->op_type == OP_CONST)
6805                     n++;
6806             }
6807 
6808         /* fake up an SV array */
6809 
6810         assert(!new_patternp);
6811         Newx(new_patternp, n, SV*);
6812         SAVEFREEPV(new_patternp);
6813         pat_count = n;
6814 
6815         n = 0;
6816         if (expr->op_type == OP_CONST)
6817             new_patternp[n] = cSVOPx_sv(expr);
6818         else
6819             for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
6820                 if (o->op_type == OP_CONST)
6821                     new_patternp[n++] = cSVOPo_sv;
6822             }
6823 
6824     }
6825 
6826     DEBUG_PARSE_r(Perl_re_printf( aTHX_
6827         "Assembling pattern from %d elements%s\n", pat_count,
6828             orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6829 
6830     /* set expr to the first arg op */
6831 
6832     if (pRExC_state->num_code_blocks
6833          && expr->op_type != OP_CONST)
6834     {
6835             expr = cLISTOPx(expr)->op_first;
6836             assert(   expr->op_type == OP_PUSHMARK
6837                    || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
6838                    || expr->op_type == OP_PADRANGE);
6839             expr = OpSIBLING(expr);
6840     }
6841 
6842     pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
6843                         expr, &recompile, NULL);
6844 
6845     /* handle bare (possibly after overloading) regex: foo =~ $re */
6846     {
6847         SV *re = pat;
6848         if (SvROK(re))
6849             re = SvRV(re);
6850         if (SvTYPE(re) == SVt_REGEXP) {
6851             if (is_bare_re)
6852                 *is_bare_re = TRUE;
6853             SvREFCNT_inc(re);
6854             Safefree(pRExC_state->code_blocks);
6855             DEBUG_PARSE_r(Perl_re_printf( aTHX_
6856                 "Precompiled pattern%s\n",
6857                     orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6858 
6859             return (REGEXP*)re;
6860         }
6861     }
6862 
6863     exp = SvPV_nomg(pat, plen);
6864 
6865     if (!eng->op_comp) {
6866 	if ((SvUTF8(pat) && IN_BYTES)
6867 		|| SvGMAGICAL(pat) || SvAMAGIC(pat))
6868 	{
6869 	    /* make a temporary copy; either to convert to bytes,
6870 	     * or to avoid repeating get-magic / overloaded stringify */
6871 	    pat = newSVpvn_flags(exp, plen, SVs_TEMP |
6872 					(IN_BYTES ? 0 : SvUTF8(pat)));
6873 	}
6874 	Safefree(pRExC_state->code_blocks);
6875 	return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
6876     }
6877 
6878     /* ignore the utf8ness if the pattern is 0 length */
6879     RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
6880 
6881     RExC_uni_semantics = 0;
6882     RExC_seen_unfolded_sharp_s = 0;
6883     RExC_contains_locale = 0;
6884     RExC_contains_i = 0;
6885     RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
6886     RExC_study_started = 0;
6887     pRExC_state->runtime_code_qr = NULL;
6888     RExC_frame_head= NULL;
6889     RExC_frame_last= NULL;
6890     RExC_frame_count= 0;
6891 
6892     DEBUG_r({
6893         RExC_mysv1= sv_newmortal();
6894         RExC_mysv2= sv_newmortal();
6895     });
6896     DEBUG_COMPILE_r({
6897             SV *dsv= sv_newmortal();
6898             RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
6899             Perl_re_printf( aTHX_  "%sCompiling REx%s %s\n",
6900                           PL_colors[4],PL_colors[5],s);
6901         });
6902 
6903   redo_first_pass:
6904     /* we jump here if we have to recompile, e.g., from upgrading the pattern
6905      * to utf8 */
6906 
6907     if ((pm_flags & PMf_USE_RE_EVAL)
6908 		/* this second condition covers the non-regex literal case,
6909 		 * i.e.  $foo =~ '(?{})'. */
6910 		|| (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
6911     )
6912 	runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
6913 
6914     /* return old regex if pattern hasn't changed */
6915     /* XXX: note in the below we have to check the flags as well as the
6916      * pattern.
6917      *
6918      * Things get a touch tricky as we have to compare the utf8 flag
6919      * independently from the compile flags.  */
6920 
6921     if (   old_re
6922         && !recompile
6923         && !!RX_UTF8(old_re) == !!RExC_utf8
6924         && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
6925 	&& RX_PRECOMP(old_re)
6926 	&& RX_PRELEN(old_re) == plen
6927         && memEQ(RX_PRECOMP(old_re), exp, plen)
6928 	&& !runtime_code /* with runtime code, always recompile */ )
6929     {
6930         Safefree(pRExC_state->code_blocks);
6931         return old_re;
6932     }
6933 
6934     rx_flags = orig_rx_flags;
6935 
6936     if (rx_flags & PMf_FOLD) {
6937         RExC_contains_i = 1;
6938     }
6939     if (   initial_charset == REGEX_DEPENDS_CHARSET
6940         && (RExC_utf8 ||RExC_uni_semantics))
6941     {
6942 
6943 	/* Set to use unicode semantics if the pattern is in utf8 and has the
6944 	 * 'depends' charset specified, as it means unicode when utf8  */
6945 	set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6946     }
6947 
6948     RExC_precomp = exp;
6949     RExC_precomp_adj = 0;
6950     RExC_flags = rx_flags;
6951     RExC_pm_flags = pm_flags;
6952 
6953     if (runtime_code) {
6954         assert(TAINTING_get || !TAINT_get);
6955 	if (TAINT_get)
6956 	    Perl_croak(aTHX_ "Eval-group in insecure regular expression");
6957 
6958 	if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
6959 	    /* whoops, we have a non-utf8 pattern, whilst run-time code
6960 	     * got compiled as utf8. Try again with a utf8 pattern */
6961             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6962                                     pRExC_state->num_code_blocks);
6963             goto redo_first_pass;
6964 	}
6965     }
6966     assert(!pRExC_state->runtime_code_qr);
6967 
6968     RExC_sawback = 0;
6969 
6970     RExC_seen = 0;
6971     RExC_maxlen = 0;
6972     RExC_in_lookbehind = 0;
6973     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
6974     RExC_extralen = 0;
6975     RExC_override_recoding = 0;
6976 #ifdef EBCDIC
6977     RExC_recode_x_to_native = 0;
6978 #endif
6979     RExC_in_multi_char_class = 0;
6980 
6981     /* First pass: determine size, legality. */
6982     RExC_parse = exp;
6983     RExC_start = RExC_adjusted_start = exp;
6984     RExC_end = exp + plen;
6985     RExC_precomp_end = RExC_end;
6986     RExC_naughty = 0;
6987     RExC_npar = 1;
6988     RExC_nestroot = 0;
6989     RExC_size = 0L;
6990     RExC_emit = (regnode *) &RExC_emit_dummy;
6991     RExC_whilem_seen = 0;
6992     RExC_open_parens = NULL;
6993     RExC_close_parens = NULL;
6994     RExC_end_op = NULL;
6995     RExC_paren_names = NULL;
6996 #ifdef DEBUGGING
6997     RExC_paren_name_list = NULL;
6998 #endif
6999     RExC_recurse = NULL;
7000     RExC_study_chunk_recursed = NULL;
7001     RExC_study_chunk_recursed_bytes= 0;
7002     RExC_recurse_count = 0;
7003     pRExC_state->code_index = 0;
7004 
7005     /* This NUL is guaranteed because the pattern comes from an SV*, and the sv
7006      * code makes sure the final byte is an uncounted NUL.  But should this
7007      * ever not be the case, lots of things could read beyond the end of the
7008      * buffer: loops like
7009      *      while(isFOO(*RExC_parse)) RExC_parse++;
7010      *      strchr(RExC_parse, "foo");
7011      * etc.  So it is worth noting. */
7012     assert(*RExC_end == '\0');
7013 
7014     DEBUG_PARSE_r(
7015         Perl_re_printf( aTHX_  "Starting first pass (sizing)\n");
7016         RExC_lastnum=0;
7017         RExC_lastparse=NULL;
7018     );
7019     /* reg may croak on us, not giving us a chance to free
7020        pRExC_state->code_blocks.  We cannot SAVEFREEPV it now, as we may
7021        need it to survive as long as the regexp (qr/(?{})/).
7022        We must check that code_blocksv is not already set, because we may
7023        have jumped back to restart the sizing pass. */
7024     if (pRExC_state->code_blocks && !code_blocksv) {
7025 	code_blocksv = newSV_type(SVt_PV);
7026 	SAVEFREESV(code_blocksv);
7027 	SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
7028 	SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
7029     }
7030     if (reg(pRExC_state, 0, &flags,1) == NULL) {
7031         /* It's possible to write a regexp in ascii that represents Unicode
7032         codepoints outside of the byte range, such as via \x{100}. If we
7033         detect such a sequence we have to convert the entire pattern to utf8
7034         and then recompile, as our sizing calculation will have been based
7035         on 1 byte == 1 character, but we will need to use utf8 to encode
7036         at least some part of the pattern, and therefore must convert the whole
7037         thing.
7038         -- dmq */
7039         if (flags & RESTART_PASS1) {
7040             if (flags & NEED_UTF8) {
7041                 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
7042                                     pRExC_state->num_code_blocks);
7043             }
7044             else {
7045                 DEBUG_PARSE_r(Perl_re_printf( aTHX_
7046                 "Need to redo pass 1\n"));
7047             }
7048 
7049             goto redo_first_pass;
7050         }
7051         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
7052     }
7053     if (code_blocksv)
7054 	SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
7055 
7056     DEBUG_PARSE_r({
7057         Perl_re_printf( aTHX_
7058             "Required size %"IVdf" nodes\n"
7059             "Starting second pass (creation)\n",
7060             (IV)RExC_size);
7061         RExC_lastnum=0;
7062         RExC_lastparse=NULL;
7063     });
7064 
7065     /* The first pass could have found things that force Unicode semantics */
7066     if ((RExC_utf8 || RExC_uni_semantics)
7067 	 && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
7068     {
7069 	set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
7070     }
7071 
7072     /* Small enough for pointer-storage convention?
7073        If extralen==0, this means that we will not need long jumps. */
7074     if (RExC_size >= 0x10000L && RExC_extralen)
7075         RExC_size += RExC_extralen;
7076     else
7077 	RExC_extralen = 0;
7078     if (RExC_whilem_seen > 15)
7079 	RExC_whilem_seen = 15;
7080 
7081     /* Allocate space and zero-initialize. Note, the two step process
7082        of zeroing when in debug mode, thus anything assigned has to
7083        happen after that */
7084     rx = (REGEXP*) newSV_type(SVt_REGEXP);
7085     r = ReANY(rx);
7086     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
7087 	 char, regexp_internal);
7088     if ( r == NULL || ri == NULL )
7089 	FAIL("Regexp out of space");
7090 #ifdef DEBUGGING
7091     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
7092     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
7093          char);
7094 #else
7095     /* bulk initialize base fields with 0. */
7096     Zero(ri, sizeof(regexp_internal), char);
7097 #endif
7098 
7099     /* non-zero initialization begins here */
7100     RXi_SET( r, ri );
7101     r->engine= eng;
7102     r->extflags = rx_flags;
7103     RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
7104 
7105     if (pm_flags & PMf_IS_QR) {
7106 	ri->code_blocks = pRExC_state->code_blocks;
7107 	ri->num_code_blocks = pRExC_state->num_code_blocks;
7108     }
7109     else
7110     {
7111 	int n;
7112 	for (n = 0; n < pRExC_state->num_code_blocks; n++)
7113 	    if (pRExC_state->code_blocks[n].src_regex)
7114 		SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
7115 	if(pRExC_state->code_blocks)
7116 	    SAVEFREEPV(pRExC_state->code_blocks); /* often null */
7117     }
7118 
7119     {
7120         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
7121         bool has_charset = (get_regex_charset(r->extflags)
7122                                                     != REGEX_DEPENDS_CHARSET);
7123 
7124         /* The caret is output if there are any defaults: if not all the STD
7125          * flags are set, or if no character set specifier is needed */
7126         bool has_default =
7127                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
7128                     || ! has_charset);
7129         bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
7130                                                    == REG_RUN_ON_COMMENT_SEEN);
7131 	U8 reganch = (U8)((r->extflags & RXf_PMf_STD_PMMOD)
7132 			    >> RXf_PMf_STD_PMMOD_SHIFT);
7133 	const char *fptr = STD_PAT_MODS;        /*"msixn"*/
7134 	char *p;
7135 
7136         /* We output all the necessary flags; we never output a minus, as all
7137          * those are defaults, so are
7138          * covered by the caret */
7139 	const STRLEN wraplen = plen + has_p + has_runon
7140             + has_default       /* If needs a caret */
7141             + PL_bitcount[reganch] /* 1 char for each set standard flag */
7142 
7143 		/* If needs a character set specifier */
7144 	    + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
7145             + (sizeof("(?:)") - 1);
7146 
7147         /* make sure PL_bitcount bounds not exceeded */
7148         assert(sizeof(STD_PAT_MODS) <= 8);
7149 
7150         Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
7151 	r->xpv_len_u.xpvlenu_pv = p;
7152 	if (RExC_utf8)
7153 	    SvFLAGS(rx) |= SVf_UTF8;
7154         *p++='('; *p++='?';
7155 
7156         /* If a default, cover it using the caret */
7157         if (has_default) {
7158             *p++= DEFAULT_PAT_MOD;
7159         }
7160         if (has_charset) {
7161 	    STRLEN len;
7162 	    const char* const name = get_regex_charset_name(r->extflags, &len);
7163 	    Copy(name, p, len, char);
7164 	    p += len;
7165         }
7166         if (has_p)
7167             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
7168         {
7169             char ch;
7170             while((ch = *fptr++)) {
7171                 if(reganch & 1)
7172                     *p++ = ch;
7173                 reganch >>= 1;
7174             }
7175         }
7176 
7177         *p++ = ':';
7178         Copy(RExC_precomp, p, plen, char);
7179 	assert ((RX_WRAPPED(rx) - p) < 16);
7180 	r->pre_prefix = p - RX_WRAPPED(rx);
7181         p += plen;
7182         if (has_runon)
7183             *p++ = '\n';
7184         *p++ = ')';
7185         *p = 0;
7186 	SvCUR_set(rx, p - RX_WRAPPED(rx));
7187     }
7188 
7189     r->intflags = 0;
7190     r->nparens = RExC_npar - 1;	/* set early to validate backrefs */
7191 
7192     /* Useful during FAIL. */
7193 #ifdef RE_TRACK_PATTERN_OFFSETS
7194     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
7195     DEBUG_OFFSETS_r(Perl_re_printf( aTHX_
7196                           "%s %"UVuf" bytes for offset annotations.\n",
7197                           ri->u.offsets ? "Got" : "Couldn't get",
7198                           (UV)((2*RExC_size+1) * sizeof(U32))));
7199 #endif
7200     SetProgLen(ri,RExC_size);
7201     RExC_rx_sv = rx;
7202     RExC_rx = r;
7203     RExC_rxi = ri;
7204 
7205     /* Second pass: emit code. */
7206     RExC_flags = rx_flags;	/* don't let top level (?i) bleed */
7207     RExC_pm_flags = pm_flags;
7208     RExC_parse = exp;
7209     RExC_end = exp + plen;
7210     RExC_naughty = 0;
7211     RExC_emit_start = ri->program;
7212     RExC_emit = ri->program;
7213     RExC_emit_bound = ri->program + RExC_size + 1;
7214     pRExC_state->code_index = 0;
7215 
7216     *((char*) RExC_emit++) = (char) REG_MAGIC;
7217     /* setup various meta data about recursion, this all requires
7218      * RExC_npar to be correctly set, and a bit later on we clear it */
7219     if (RExC_seen & REG_RECURSE_SEEN) {
7220         DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
7221             "%*s%*s Setting up open/close parens\n",
7222                   22, "|    |", (int)(0 * 2 + 1), ""));
7223 
7224         /* setup RExC_open_parens, which holds the address of each
7225          * OPEN tag, and to make things simpler for the 0 index
7226          * the start of the program - this is used later for offsets */
7227         Newxz(RExC_open_parens, RExC_npar,regnode *);
7228         SAVEFREEPV(RExC_open_parens);
7229         RExC_open_parens[0] = RExC_emit;
7230 
7231         /* setup RExC_close_parens, which holds the address of each
7232          * CLOSE tag, and to make things simpler for the 0 index
7233          * the end of the program - this is used later for offsets */
7234         Newxz(RExC_close_parens, RExC_npar,regnode *);
7235         SAVEFREEPV(RExC_close_parens);
7236         /* we dont know where end op starts yet, so we dont
7237          * need to set RExC_close_parens[0] like we do RExC_open_parens[0] above */
7238 
7239         /* Note, RExC_npar is 1 + the number of parens in a pattern.
7240          * So its 1 if there are no parens. */
7241         RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
7242                                          ((RExC_npar & 0x07) != 0);
7243         Newx(RExC_study_chunk_recursed,
7244              RExC_study_chunk_recursed_bytes * RExC_npar, U8);
7245         SAVEFREEPV(RExC_study_chunk_recursed);
7246     }
7247     RExC_npar = 1;
7248     if (reg(pRExC_state, 0, &flags,1) == NULL) {
7249 	ReREFCNT_dec(rx);
7250         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
7251     }
7252     DEBUG_OPTIMISE_r(
7253         Perl_re_printf( aTHX_  "Starting post parse optimization\n");
7254     );
7255 
7256     /* XXXX To minimize changes to RE engine we always allocate
7257        3-units-long substrs field. */
7258     Newx(r->substrs, 1, struct reg_substr_data);
7259     if (RExC_recurse_count) {
7260         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
7261         SAVEFREEPV(RExC_recurse);
7262     }
7263 
7264   reStudy:
7265     r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
7266     DEBUG_r(
7267         RExC_study_chunk_recursed_count= 0;
7268     );
7269     Zero(r->substrs, 1, struct reg_substr_data);
7270     if (RExC_study_chunk_recursed) {
7271         Zero(RExC_study_chunk_recursed,
7272              RExC_study_chunk_recursed_bytes * RExC_npar, U8);
7273     }
7274 
7275 
7276 #ifdef TRIE_STUDY_OPT
7277     if (!restudied) {
7278         StructCopy(&zero_scan_data, &data, scan_data_t);
7279         copyRExC_state = RExC_state;
7280     } else {
7281         U32 seen=RExC_seen;
7282         DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "Restudying\n"));
7283 
7284         RExC_state = copyRExC_state;
7285         if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
7286             RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
7287         else
7288             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
7289 	StructCopy(&zero_scan_data, &data, scan_data_t);
7290     }
7291 #else
7292     StructCopy(&zero_scan_data, &data, scan_data_t);
7293 #endif
7294 
7295     /* Dig out information for optimizations. */
7296     r->extflags = RExC_flags; /* was pm_op */
7297     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
7298 
7299     if (UTF)
7300 	SvUTF8_on(rx);	/* Unicode in it? */
7301     ri->regstclass = NULL;
7302     if (RExC_naughty >= TOO_NAUGHTY)	/* Probably an expensive pattern. */
7303 	r->intflags |= PREGf_NAUGHTY;
7304     scan = ri->program + 1;		/* First BRANCH. */
7305 
7306     /* testing for BRANCH here tells us whether there is "must appear"
7307        data in the pattern. If there is then we can use it for optimisations */
7308     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /*  Only one top-level choice.
7309                                                   */
7310 	SSize_t fake;
7311 	STRLEN longest_float_length, longest_fixed_length;
7312 	regnode_ssc ch_class; /* pointed to by data */
7313 	int stclass_flag;
7314 	SSize_t last_close = 0; /* pointed to by data */
7315         regnode *first= scan;
7316         regnode *first_next= regnext(first);
7317 	/*
7318 	 * Skip introductions and multiplicators >= 1
7319 	 * so that we can extract the 'meat' of the pattern that must
7320 	 * match in the large if() sequence following.
7321 	 * NOTE that EXACT is NOT covered here, as it is normally
7322 	 * picked up by the optimiser separately.
7323 	 *
7324 	 * This is unfortunate as the optimiser isnt handling lookahead
7325 	 * properly currently.
7326 	 *
7327 	 */
7328 	while ((OP(first) == OPEN && (sawopen = 1)) ||
7329 	       /* An OR of *one* alternative - should not happen now. */
7330 	    (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
7331 	    /* for now we can't handle lookbehind IFMATCH*/
7332 	    (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
7333 	    (OP(first) == PLUS) ||
7334 	    (OP(first) == MINMOD) ||
7335 	       /* An {n,m} with n>0 */
7336 	    (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
7337 	    (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
7338 	{
7339 		/*
7340 		 * the only op that could be a regnode is PLUS, all the rest
7341 		 * will be regnode_1 or regnode_2.
7342 		 *
7343                  * (yves doesn't think this is true)
7344 		 */
7345 		if (OP(first) == PLUS)
7346 		    sawplus = 1;
7347                 else {
7348                     if (OP(first) == MINMOD)
7349                         sawminmod = 1;
7350 		    first += regarglen[OP(first)];
7351                 }
7352 		first = NEXTOPER(first);
7353 		first_next= regnext(first);
7354 	}
7355 
7356 	/* Starting-point info. */
7357       again:
7358         DEBUG_PEEP("first:",first,0);
7359         /* Ignore EXACT as we deal with it later. */
7360 	if (PL_regkind[OP(first)] == EXACT) {
7361 	    if (OP(first) == EXACT || OP(first) == EXACTL)
7362 		NOOP;	/* Empty, get anchored substr later. */
7363 	    else
7364 		ri->regstclass = first;
7365 	}
7366 #ifdef TRIE_STCLASS
7367 	else if (PL_regkind[OP(first)] == TRIE &&
7368 	        ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
7369 	{
7370             /* this can happen only on restudy */
7371             ri->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
7372 	}
7373 #endif
7374 	else if (REGNODE_SIMPLE(OP(first)))
7375 	    ri->regstclass = first;
7376 	else if (PL_regkind[OP(first)] == BOUND ||
7377 		 PL_regkind[OP(first)] == NBOUND)
7378 	    ri->regstclass = first;
7379 	else if (PL_regkind[OP(first)] == BOL) {
7380             r->intflags |= (OP(first) == MBOL
7381                            ? PREGf_ANCH_MBOL
7382                            : PREGf_ANCH_SBOL);
7383 	    first = NEXTOPER(first);
7384 	    goto again;
7385 	}
7386 	else if (OP(first) == GPOS) {
7387             r->intflags |= PREGf_ANCH_GPOS;
7388 	    first = NEXTOPER(first);
7389 	    goto again;
7390 	}
7391 	else if ((!sawopen || !RExC_sawback) &&
7392             !sawlookahead &&
7393 	    (OP(first) == STAR &&
7394 	    PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
7395             !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks)
7396 	{
7397 	    /* turn .* into ^.* with an implied $*=1 */
7398 	    const int type =
7399 		(OP(NEXTOPER(first)) == REG_ANY)
7400                     ? PREGf_ANCH_MBOL
7401                     : PREGf_ANCH_SBOL;
7402             r->intflags |= (type | PREGf_IMPLICIT);
7403 	    first = NEXTOPER(first);
7404 	    goto again;
7405 	}
7406         if (sawplus && !sawminmod && !sawlookahead
7407             && (!sawopen || !RExC_sawback)
7408 	    && !pRExC_state->num_code_blocks) /* May examine pos and $& */
7409 	    /* x+ must match at the 1st pos of run of x's */
7410 	    r->intflags |= PREGf_SKIP;
7411 
7412 	/* Scan is after the zeroth branch, first is atomic matcher. */
7413 #ifdef TRIE_STUDY_OPT
7414 	DEBUG_PARSE_r(
7415 	    if (!restudied)
7416                 Perl_re_printf( aTHX_  "first at %"IVdf"\n",
7417 			      (IV)(first - scan + 1))
7418         );
7419 #else
7420 	DEBUG_PARSE_r(
7421             Perl_re_printf( aTHX_  "first at %"IVdf"\n",
7422 	        (IV)(first - scan + 1))
7423         );
7424 #endif
7425 
7426 
7427 	/*
7428 	* If there's something expensive in the r.e., find the
7429 	* longest literal string that must appear and make it the
7430 	* regmust.  Resolve ties in favor of later strings, since
7431 	* the regstart check works with the beginning of the r.e.
7432 	* and avoiding duplication strengthens checking.  Not a
7433 	* strong reason, but sufficient in the absence of others.
7434 	* [Now we resolve ties in favor of the earlier string if
7435 	* it happens that c_offset_min has been invalidated, since the
7436 	* earlier string may buy us something the later one won't.]
7437 	*/
7438 
7439 	data.longest_fixed = newSVpvs("");
7440 	data.longest_float = newSVpvs("");
7441 	data.last_found = newSVpvs("");
7442 	data.longest = &(data.longest_fixed);
7443 	ENTER_with_name("study_chunk");
7444 	SAVEFREESV(data.longest_fixed);
7445 	SAVEFREESV(data.longest_float);
7446 	SAVEFREESV(data.last_found);
7447 	first = scan;
7448 	if (!ri->regstclass) {
7449 	    ssc_init(pRExC_state, &ch_class);
7450 	    data.start_class = &ch_class;
7451 	    stclass_flag = SCF_DO_STCLASS_AND;
7452 	} else				/* XXXX Check for BOUND? */
7453 	    stclass_flag = 0;
7454 	data.last_closep = &last_close;
7455 
7456         DEBUG_RExC_seen();
7457 	minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
7458                              scan + RExC_size, /* Up to end */
7459             &data, -1, 0, NULL,
7460             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
7461                           | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
7462             0);
7463 
7464 
7465         CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
7466 
7467 
7468 	if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
7469 	     && data.last_start_min == 0 && data.last_end > 0
7470 	     && !RExC_seen_zerolen
7471              && !(RExC_seen & REG_VERBARG_SEEN)
7472              && !(RExC_seen & REG_GPOS_SEEN)
7473         ){
7474 	    r->extflags |= RXf_CHECK_ALL;
7475         }
7476 	scan_commit(pRExC_state, &data,&minlen,0);
7477 
7478 	longest_float_length = CHR_SVLEN(data.longest_float);
7479 
7480         if (! ((SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
7481                    && data.offset_fixed == data.offset_float_min
7482                    && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
7483             && S_setup_longest (aTHX_ pRExC_state,
7484                                     data.longest_float,
7485                                     &(r->float_utf8),
7486                                     &(r->float_substr),
7487                                     &(r->float_end_shift),
7488                                     data.lookbehind_float,
7489                                     data.offset_float_min,
7490                                     data.minlen_float,
7491                                     longest_float_length,
7492                                     cBOOL(data.flags & SF_FL_BEFORE_EOL),
7493                                     cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
7494         {
7495 	    r->float_min_offset = data.offset_float_min - data.lookbehind_float;
7496 	    r->float_max_offset = data.offset_float_max;
7497 	    if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
7498 	        r->float_max_offset -= data.lookbehind_float;
7499 	    SvREFCNT_inc_simple_void_NN(data.longest_float);
7500 	}
7501 	else {
7502 	    r->float_substr = r->float_utf8 = NULL;
7503 	    longest_float_length = 0;
7504 	}
7505 
7506 	longest_fixed_length = CHR_SVLEN(data.longest_fixed);
7507 
7508         if (S_setup_longest (aTHX_ pRExC_state,
7509                                 data.longest_fixed,
7510                                 &(r->anchored_utf8),
7511                                 &(r->anchored_substr),
7512                                 &(r->anchored_end_shift),
7513                                 data.lookbehind_fixed,
7514                                 data.offset_fixed,
7515                                 data.minlen_fixed,
7516                                 longest_fixed_length,
7517                                 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
7518                                 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
7519         {
7520 	    r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
7521 	    SvREFCNT_inc_simple_void_NN(data.longest_fixed);
7522 	}
7523 	else {
7524 	    r->anchored_substr = r->anchored_utf8 = NULL;
7525 	    longest_fixed_length = 0;
7526 	}
7527 	LEAVE_with_name("study_chunk");
7528 
7529 	if (ri->regstclass
7530 	    && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
7531 	    ri->regstclass = NULL;
7532 
7533 	if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
7534 	    && stclass_flag
7535             && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7536 	    && is_ssc_worth_it(pRExC_state, data.start_class))
7537 	{
7538 	    const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7539 
7540             ssc_finalize(pRExC_state, data.start_class);
7541 
7542 	    Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7543 	    StructCopy(data.start_class,
7544 		       (regnode_ssc*)RExC_rxi->data->data[n],
7545 		       regnode_ssc);
7546 	    ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7547 	    r->intflags &= ~PREGf_SKIP;	/* Used in find_byclass(). */
7548 	    DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
7549                       regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state);
7550                       Perl_re_printf( aTHX_
7551 				    "synthetic stclass \"%s\".\n",
7552 				    SvPVX_const(sv));});
7553             data.start_class = NULL;
7554 	}
7555 
7556         /* A temporary algorithm prefers floated substr to fixed one to dig
7557          * more info. */
7558 	if (longest_fixed_length > longest_float_length) {
7559 	    r->substrs->check_ix = 0;
7560 	    r->check_end_shift = r->anchored_end_shift;
7561 	    r->check_substr = r->anchored_substr;
7562 	    r->check_utf8 = r->anchored_utf8;
7563 	    r->check_offset_min = r->check_offset_max = r->anchored_offset;
7564             if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS))
7565                 r->intflags |= PREGf_NOSCAN;
7566 	}
7567 	else {
7568 	    r->substrs->check_ix = 1;
7569 	    r->check_end_shift = r->float_end_shift;
7570 	    r->check_substr = r->float_substr;
7571 	    r->check_utf8 = r->float_utf8;
7572 	    r->check_offset_min = r->float_min_offset;
7573 	    r->check_offset_max = r->float_max_offset;
7574 	}
7575 	if ((r->check_substr || r->check_utf8) ) {
7576 	    r->extflags |= RXf_USE_INTUIT;
7577 	    if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
7578 		r->extflags |= RXf_INTUIT_TAIL;
7579 	}
7580         r->substrs->data[0].max_offset = r->substrs->data[0].min_offset;
7581 
7582 	/* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
7583 	if ( (STRLEN)minlen < longest_float_length )
7584             minlen= longest_float_length;
7585         if ( (STRLEN)minlen < longest_fixed_length )
7586             minlen= longest_fixed_length;
7587         */
7588     }
7589     else {
7590 	/* Several toplevels. Best we can is to set minlen. */
7591 	SSize_t fake;
7592 	regnode_ssc ch_class;
7593 	SSize_t last_close = 0;
7594 
7595         DEBUG_PARSE_r(Perl_re_printf( aTHX_  "\nMulti Top Level\n"));
7596 
7597 	scan = ri->program + 1;
7598 	ssc_init(pRExC_state, &ch_class);
7599 	data.start_class = &ch_class;
7600 	data.last_closep = &last_close;
7601 
7602         DEBUG_RExC_seen();
7603 	minlen = study_chunk(pRExC_state,
7604             &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
7605             SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
7606                                                       ? SCF_TRIE_DOING_RESTUDY
7607                                                       : 0),
7608             0);
7609 
7610         CHECK_RESTUDY_GOTO_butfirst(NOOP);
7611 
7612 	r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
7613 		= r->float_substr = r->float_utf8 = NULL;
7614 
7615         if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7616 	    && is_ssc_worth_it(pRExC_state, data.start_class))
7617         {
7618 	    const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7619 
7620             ssc_finalize(pRExC_state, data.start_class);
7621 
7622 	    Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7623 	    StructCopy(data.start_class,
7624 		       (regnode_ssc*)RExC_rxi->data->data[n],
7625 		       regnode_ssc);
7626 	    ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7627 	    r->intflags &= ~PREGf_SKIP;	/* Used in find_byclass(). */
7628 	    DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
7629                       regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state);
7630                       Perl_re_printf( aTHX_
7631 				    "synthetic stclass \"%s\".\n",
7632 				    SvPVX_const(sv));});
7633             data.start_class = NULL;
7634 	}
7635     }
7636 
7637     if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
7638         r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
7639         r->maxlen = REG_INFTY;
7640     }
7641     else {
7642         r->maxlen = RExC_maxlen;
7643     }
7644 
7645     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
7646        the "real" pattern. */
7647     DEBUG_OPTIMISE_r({
7648         Perl_re_printf( aTHX_ "minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%"IVdf"\n",
7649                       (IV)minlen, (IV)r->minlen, (IV)RExC_maxlen);
7650     });
7651     r->minlenret = minlen;
7652     if (r->minlen < minlen)
7653         r->minlen = minlen;
7654 
7655     if (RExC_seen & REG_RECURSE_SEEN ) {
7656         r->intflags |= PREGf_RECURSE_SEEN;
7657         Newxz(r->recurse_locinput, r->nparens + 1, char *);
7658     }
7659     if (RExC_seen & REG_GPOS_SEEN)
7660         r->intflags |= PREGf_GPOS_SEEN;
7661     if (RExC_seen & REG_LOOKBEHIND_SEEN)
7662         r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
7663                                                 lookbehind */
7664     if (pRExC_state->num_code_blocks)
7665 	r->extflags |= RXf_EVAL_SEEN;
7666     if (RExC_seen & REG_VERBARG_SEEN)
7667     {
7668 	r->intflags |= PREGf_VERBARG_SEEN;
7669         r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
7670     }
7671     if (RExC_seen & REG_CUTGROUP_SEEN)
7672 	r->intflags |= PREGf_CUTGROUP_SEEN;
7673     if (pm_flags & PMf_USE_RE_EVAL)
7674 	r->intflags |= PREGf_USE_RE_EVAL;
7675     if (RExC_paren_names)
7676         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
7677     else
7678         RXp_PAREN_NAMES(r) = NULL;
7679 
7680     /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
7681      * so it can be used in pp.c */
7682     if (r->intflags & PREGf_ANCH)
7683         r->extflags |= RXf_IS_ANCHORED;
7684 
7685 
7686     {
7687         /* this is used to identify "special" patterns that might result
7688          * in Perl NOT calling the regex engine and instead doing the match "itself",
7689          * particularly special cases in split//. By having the regex compiler
7690          * do this pattern matching at a regop level (instead of by inspecting the pattern)
7691          * we avoid weird issues with equivalent patterns resulting in different behavior,
7692          * AND we allow non Perl engines to get the same optimizations by the setting the
7693          * flags appropriately - Yves */
7694         regnode *first = ri->program + 1;
7695         U8 fop = OP(first);
7696         regnode *next = regnext(first);
7697         U8 nop = OP(next);
7698 
7699         if (PL_regkind[fop] == NOTHING && nop == END)
7700             r->extflags |= RXf_NULL;
7701         else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END)
7702             /* when fop is SBOL first->flags will be true only when it was
7703              * produced by parsing /\A/, and not when parsing /^/. This is
7704              * very important for the split code as there we want to
7705              * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
7706              * See rt #122761 for more details. -- Yves */
7707             r->extflags |= RXf_START_ONLY;
7708         else if (fop == PLUS
7709                  && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
7710                  && nop == END)
7711             r->extflags |= RXf_WHITE;
7712         else if ( r->extflags & RXf_SPLIT
7713                   && (fop == EXACT || fop == EXACTL)
7714                   && STR_LEN(first) == 1
7715                   && *(STRING(first)) == ' '
7716                   && nop == END )
7717             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
7718 
7719     }
7720 
7721     if (RExC_contains_locale) {
7722         RXp_EXTFLAGS(r) |= RXf_TAINTED;
7723     }
7724 
7725 #ifdef DEBUGGING
7726     if (RExC_paren_names) {
7727         ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
7728         ri->data->data[ri->name_list_idx]
7729                                    = (void*)SvREFCNT_inc(RExC_paren_name_list);
7730     } else
7731 #endif
7732     ri->name_list_idx = 0;
7733 
7734     while ( RExC_recurse_count > 0 ) {
7735         const regnode *scan = RExC_recurse[ --RExC_recurse_count ];
7736         ARG2L_SET( scan, RExC_open_parens[ARG(scan)] - scan );
7737     }
7738 
7739     Newxz(r->offs, RExC_npar, regexp_paren_pair);
7740     /* assume we don't need to swap parens around before we match */
7741     DEBUG_TEST_r({
7742         Perl_re_printf( aTHX_ "study_chunk_recursed_count: %lu\n",
7743             (unsigned long)RExC_study_chunk_recursed_count);
7744     });
7745     DEBUG_DUMP_r({
7746         DEBUG_RExC_seen();
7747         Perl_re_printf( aTHX_ "Final program:\n");
7748         regdump(r);
7749     });
7750 #ifdef RE_TRACK_PATTERN_OFFSETS
7751     DEBUG_OFFSETS_r(if (ri->u.offsets) {
7752         const STRLEN len = ri->u.offsets[0];
7753         STRLEN i;
7754         GET_RE_DEBUG_FLAGS_DECL;
7755         Perl_re_printf( aTHX_
7756                       "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
7757         for (i = 1; i <= len; i++) {
7758             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
7759                 Perl_re_printf( aTHX_  "%"UVuf":%"UVuf"[%"UVuf"] ",
7760                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
7761             }
7762         Perl_re_printf( aTHX_  "\n");
7763     });
7764 #endif
7765 
7766 #ifdef USE_ITHREADS
7767     /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
7768      * by setting the regexp SV to readonly-only instead. If the
7769      * pattern's been recompiled, the USEDness should remain. */
7770     if (old_re && SvREADONLY(old_re))
7771         SvREADONLY_on(rx);
7772 #endif
7773     return rx;
7774 }
7775 
7776 
7777 SV*
7778 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
7779                     const U32 flags)
7780 {
7781     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
7782 
7783     PERL_UNUSED_ARG(value);
7784 
7785     if (flags & RXapif_FETCH) {
7786         return reg_named_buff_fetch(rx, key, flags);
7787     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
7788         Perl_croak_no_modify();
7789         return NULL;
7790     } else if (flags & RXapif_EXISTS) {
7791         return reg_named_buff_exists(rx, key, flags)
7792             ? &PL_sv_yes
7793             : &PL_sv_no;
7794     } else if (flags & RXapif_REGNAMES) {
7795         return reg_named_buff_all(rx, flags);
7796     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
7797         return reg_named_buff_scalar(rx, flags);
7798     } else {
7799         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
7800         return NULL;
7801     }
7802 }
7803 
7804 SV*
7805 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
7806                          const U32 flags)
7807 {
7808     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
7809     PERL_UNUSED_ARG(lastkey);
7810 
7811     if (flags & RXapif_FIRSTKEY)
7812         return reg_named_buff_firstkey(rx, flags);
7813     else if (flags & RXapif_NEXTKEY)
7814         return reg_named_buff_nextkey(rx, flags);
7815     else {
7816         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
7817                                             (int)flags);
7818         return NULL;
7819     }
7820 }
7821 
7822 SV*
7823 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
7824 			  const U32 flags)
7825 {
7826     AV *retarray = NULL;
7827     SV *ret;
7828     struct regexp *const rx = ReANY(r);
7829 
7830     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
7831 
7832     if (flags & RXapif_ALL)
7833         retarray=newAV();
7834 
7835     if (rx && RXp_PAREN_NAMES(rx)) {
7836         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
7837         if (he_str) {
7838             IV i;
7839             SV* sv_dat=HeVAL(he_str);
7840             I32 *nums=(I32*)SvPVX(sv_dat);
7841             for ( i=0; i<SvIVX(sv_dat); i++ ) {
7842                 if ((I32)(rx->nparens) >= nums[i]
7843                     && rx->offs[nums[i]].start != -1
7844                     && rx->offs[nums[i]].end != -1)
7845                 {
7846                     ret = newSVpvs("");
7847                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
7848                     if (!retarray)
7849                         return ret;
7850                 } else {
7851                     if (retarray)
7852                         ret = newSVsv(&PL_sv_undef);
7853                 }
7854                 if (retarray)
7855                     av_push(retarray, ret);
7856             }
7857             if (retarray)
7858                 return newRV_noinc(MUTABLE_SV(retarray));
7859         }
7860     }
7861     return NULL;
7862 }
7863 
7864 bool
7865 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
7866                            const U32 flags)
7867 {
7868     struct regexp *const rx = ReANY(r);
7869 
7870     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
7871 
7872     if (rx && RXp_PAREN_NAMES(rx)) {
7873         if (flags & RXapif_ALL) {
7874             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
7875         } else {
7876 	    SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
7877             if (sv) {
7878 		SvREFCNT_dec_NN(sv);
7879                 return TRUE;
7880             } else {
7881                 return FALSE;
7882             }
7883         }
7884     } else {
7885         return FALSE;
7886     }
7887 }
7888 
7889 SV*
7890 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
7891 {
7892     struct regexp *const rx = ReANY(r);
7893 
7894     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
7895 
7896     if ( rx && RXp_PAREN_NAMES(rx) ) {
7897 	(void)hv_iterinit(RXp_PAREN_NAMES(rx));
7898 
7899 	return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
7900     } else {
7901 	return FALSE;
7902     }
7903 }
7904 
7905 SV*
7906 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
7907 {
7908     struct regexp *const rx = ReANY(r);
7909     GET_RE_DEBUG_FLAGS_DECL;
7910 
7911     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
7912 
7913     if (rx && RXp_PAREN_NAMES(rx)) {
7914         HV *hv = RXp_PAREN_NAMES(rx);
7915         HE *temphe;
7916         while ( (temphe = hv_iternext_flags(hv,0)) ) {
7917             IV i;
7918             IV parno = 0;
7919             SV* sv_dat = HeVAL(temphe);
7920             I32 *nums = (I32*)SvPVX(sv_dat);
7921             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7922                 if ((I32)(rx->lastparen) >= nums[i] &&
7923                     rx->offs[nums[i]].start != -1 &&
7924                     rx->offs[nums[i]].end != -1)
7925                 {
7926                     parno = nums[i];
7927                     break;
7928                 }
7929             }
7930             if (parno || flags & RXapif_ALL) {
7931 		return newSVhek(HeKEY_hek(temphe));
7932             }
7933         }
7934     }
7935     return NULL;
7936 }
7937 
7938 SV*
7939 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
7940 {
7941     SV *ret;
7942     AV *av;
7943     SSize_t length;
7944     struct regexp *const rx = ReANY(r);
7945 
7946     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
7947 
7948     if (rx && RXp_PAREN_NAMES(rx)) {
7949         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
7950             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
7951         } else if (flags & RXapif_ONE) {
7952             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
7953             av = MUTABLE_AV(SvRV(ret));
7954             length = av_tindex(av);
7955 	    SvREFCNT_dec_NN(ret);
7956             return newSViv(length + 1);
7957         } else {
7958             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
7959                                                 (int)flags);
7960             return NULL;
7961         }
7962     }
7963     return &PL_sv_undef;
7964 }
7965 
7966 SV*
7967 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
7968 {
7969     struct regexp *const rx = ReANY(r);
7970     AV *av = newAV();
7971 
7972     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
7973 
7974     if (rx && RXp_PAREN_NAMES(rx)) {
7975         HV *hv= RXp_PAREN_NAMES(rx);
7976         HE *temphe;
7977         (void)hv_iterinit(hv);
7978         while ( (temphe = hv_iternext_flags(hv,0)) ) {
7979             IV i;
7980             IV parno = 0;
7981             SV* sv_dat = HeVAL(temphe);
7982             I32 *nums = (I32*)SvPVX(sv_dat);
7983             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7984                 if ((I32)(rx->lastparen) >= nums[i] &&
7985                     rx->offs[nums[i]].start != -1 &&
7986                     rx->offs[nums[i]].end != -1)
7987                 {
7988                     parno = nums[i];
7989                     break;
7990                 }
7991             }
7992             if (parno || flags & RXapif_ALL) {
7993                 av_push(av, newSVhek(HeKEY_hek(temphe)));
7994             }
7995         }
7996     }
7997 
7998     return newRV_noinc(MUTABLE_SV(av));
7999 }
8000 
8001 void
8002 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
8003 			     SV * const sv)
8004 {
8005     struct regexp *const rx = ReANY(r);
8006     char *s = NULL;
8007     SSize_t i = 0;
8008     SSize_t s1, t1;
8009     I32 n = paren;
8010 
8011     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
8012 
8013     if (      n == RX_BUFF_IDX_CARET_PREMATCH
8014            || n == RX_BUFF_IDX_CARET_FULLMATCH
8015            || n == RX_BUFF_IDX_CARET_POSTMATCH
8016        )
8017     {
8018         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
8019         if (!keepcopy) {
8020             /* on something like
8021              *    $r = qr/.../;
8022              *    /$qr/p;
8023              * the KEEPCOPY is set on the PMOP rather than the regex */
8024             if (PL_curpm && r == PM_GETRE(PL_curpm))
8025                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
8026         }
8027         if (!keepcopy)
8028             goto ret_undef;
8029     }
8030 
8031     if (!rx->subbeg)
8032         goto ret_undef;
8033 
8034     if (n == RX_BUFF_IDX_CARET_FULLMATCH)
8035         /* no need to distinguish between them any more */
8036         n = RX_BUFF_IDX_FULLMATCH;
8037 
8038     if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
8039         && rx->offs[0].start != -1)
8040     {
8041         /* $`, ${^PREMATCH} */
8042 	i = rx->offs[0].start;
8043 	s = rx->subbeg;
8044     }
8045     else
8046     if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
8047         && rx->offs[0].end != -1)
8048     {
8049         /* $', ${^POSTMATCH} */
8050 	s = rx->subbeg - rx->suboffset + rx->offs[0].end;
8051 	i = rx->sublen + rx->suboffset - rx->offs[0].end;
8052     }
8053     else
8054     if ( 0 <= n && n <= (I32)rx->nparens &&
8055         (s1 = rx->offs[n].start) != -1 &&
8056         (t1 = rx->offs[n].end) != -1)
8057     {
8058         /* $&, ${^MATCH},  $1 ... */
8059         i = t1 - s1;
8060         s = rx->subbeg + s1 - rx->suboffset;
8061     } else {
8062         goto ret_undef;
8063     }
8064 
8065     assert(s >= rx->subbeg);
8066     assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
8067     if (i >= 0) {
8068 #ifdef NO_TAINT_SUPPORT
8069         sv_setpvn(sv, s, i);
8070 #else
8071         const int oldtainted = TAINT_get;
8072         TAINT_NOT;
8073         sv_setpvn(sv, s, i);
8074         TAINT_set(oldtainted);
8075 #endif
8076         if (RXp_MATCH_UTF8(rx))
8077             SvUTF8_on(sv);
8078         else
8079             SvUTF8_off(sv);
8080         if (TAINTING_get) {
8081             if (RXp_MATCH_TAINTED(rx)) {
8082                 if (SvTYPE(sv) >= SVt_PVMG) {
8083                     MAGIC* const mg = SvMAGIC(sv);
8084                     MAGIC* mgt;
8085                     TAINT;
8086                     SvMAGIC_set(sv, mg->mg_moremagic);
8087                     SvTAINT(sv);
8088                     if ((mgt = SvMAGIC(sv))) {
8089                         mg->mg_moremagic = mgt;
8090                         SvMAGIC_set(sv, mg);
8091                     }
8092                 } else {
8093                     TAINT;
8094                     SvTAINT(sv);
8095                 }
8096             } else
8097                 SvTAINTED_off(sv);
8098         }
8099     } else {
8100       ret_undef:
8101         sv_setsv(sv,&PL_sv_undef);
8102         return;
8103     }
8104 }
8105 
8106 void
8107 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
8108 							 SV const * const value)
8109 {
8110     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
8111 
8112     PERL_UNUSED_ARG(rx);
8113     PERL_UNUSED_ARG(paren);
8114     PERL_UNUSED_ARG(value);
8115 
8116     if (!PL_localizing)
8117         Perl_croak_no_modify();
8118 }
8119 
8120 I32
8121 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
8122                               const I32 paren)
8123 {
8124     struct regexp *const rx = ReANY(r);
8125     I32 i;
8126     I32 s1, t1;
8127 
8128     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
8129 
8130     if (   paren == RX_BUFF_IDX_CARET_PREMATCH
8131         || paren == RX_BUFF_IDX_CARET_FULLMATCH
8132         || paren == RX_BUFF_IDX_CARET_POSTMATCH
8133     )
8134     {
8135         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
8136         if (!keepcopy) {
8137             /* on something like
8138              *    $r = qr/.../;
8139              *    /$qr/p;
8140              * the KEEPCOPY is set on the PMOP rather than the regex */
8141             if (PL_curpm && r == PM_GETRE(PL_curpm))
8142                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
8143         }
8144         if (!keepcopy)
8145             goto warn_undef;
8146     }
8147 
8148     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
8149     switch (paren) {
8150       case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
8151       case RX_BUFF_IDX_PREMATCH:       /* $` */
8152         if (rx->offs[0].start != -1) {
8153 			i = rx->offs[0].start;
8154 			if (i > 0) {
8155 				s1 = 0;
8156 				t1 = i;
8157 				goto getlen;
8158 			}
8159 	    }
8160         return 0;
8161 
8162       case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
8163       case RX_BUFF_IDX_POSTMATCH:       /* $' */
8164 	    if (rx->offs[0].end != -1) {
8165 			i = rx->sublen - rx->offs[0].end;
8166 			if (i > 0) {
8167 				s1 = rx->offs[0].end;
8168 				t1 = rx->sublen;
8169 				goto getlen;
8170 			}
8171 	    }
8172         return 0;
8173 
8174       default: /* $& / ${^MATCH}, $1, $2, ... */
8175 	    if (paren <= (I32)rx->nparens &&
8176             (s1 = rx->offs[paren].start) != -1 &&
8177             (t1 = rx->offs[paren].end) != -1)
8178 	    {
8179             i = t1 - s1;
8180             goto getlen;
8181         } else {
8182           warn_undef:
8183             if (ckWARN(WARN_UNINITIALIZED))
8184                 report_uninit((const SV *)sv);
8185             return 0;
8186         }
8187     }
8188   getlen:
8189     if (i > 0 && RXp_MATCH_UTF8(rx)) {
8190         const char * const s = rx->subbeg - rx->suboffset + s1;
8191         const U8 *ep;
8192         STRLEN el;
8193 
8194         i = t1 - s1;
8195         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
8196 			i = el;
8197     }
8198     return i;
8199 }
8200 
8201 SV*
8202 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
8203 {
8204     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
8205 	PERL_UNUSED_ARG(rx);
8206 	if (0)
8207 	    return NULL;
8208 	else
8209 	    return newSVpvs("Regexp");
8210 }
8211 
8212 /* Scans the name of a named buffer from the pattern.
8213  * If flags is REG_RSN_RETURN_NULL returns null.
8214  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
8215  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
8216  * to the parsed name as looked up in the RExC_paren_names hash.
8217  * If there is an error throws a vFAIL().. type exception.
8218  */
8219 
8220 #define REG_RSN_RETURN_NULL    0
8221 #define REG_RSN_RETURN_NAME    1
8222 #define REG_RSN_RETURN_DATA    2
8223 
8224 STATIC SV*
8225 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
8226 {
8227     char *name_start = RExC_parse;
8228 
8229     PERL_ARGS_ASSERT_REG_SCAN_NAME;
8230 
8231     assert (RExC_parse <= RExC_end);
8232     if (RExC_parse == RExC_end) NOOP;
8233     else if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
8234          /* Note that the code here assumes well-formed UTF-8.  Skip IDFIRST by
8235           * using do...while */
8236 	if (UTF)
8237 	    do {
8238 		RExC_parse += UTF8SKIP(RExC_parse);
8239 	    } while (isWORDCHAR_utf8((U8*)RExC_parse));
8240 	else
8241 	    do {
8242 		RExC_parse++;
8243 	    } while (isWORDCHAR(*RExC_parse));
8244     } else {
8245         RExC_parse++; /* so the <- from the vFAIL is after the offending
8246                          character */
8247         vFAIL("Group name must start with a non-digit word character");
8248     }
8249     if ( flags ) {
8250         SV* sv_name
8251 	    = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
8252 			     SVs_TEMP | (UTF ? SVf_UTF8 : 0));
8253         if ( flags == REG_RSN_RETURN_NAME)
8254             return sv_name;
8255         else if (flags==REG_RSN_RETURN_DATA) {
8256             HE *he_str = NULL;
8257             SV *sv_dat = NULL;
8258             if ( ! sv_name )      /* should not happen*/
8259                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
8260             if (RExC_paren_names)
8261                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
8262             if ( he_str )
8263                 sv_dat = HeVAL(he_str);
8264             if ( ! sv_dat )
8265                 vFAIL("Reference to nonexistent named group");
8266             return sv_dat;
8267         }
8268         else {
8269             Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
8270 		       (unsigned long) flags);
8271         }
8272         NOT_REACHED; /* NOTREACHED */
8273     }
8274     return NULL;
8275 }
8276 
8277 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
8278     int num;                                                    \
8279     if (RExC_lastparse!=RExC_parse) {                           \
8280         Perl_re_printf( aTHX_  "%s",                                        \
8281             Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse,        \
8282                 RExC_end - RExC_parse, 16,                      \
8283                 "", "",                                         \
8284                 PERL_PV_ESCAPE_UNI_DETECT |                     \
8285                 PERL_PV_PRETTY_ELLIPSES   |                     \
8286                 PERL_PV_PRETTY_LTGT       |                     \
8287                 PERL_PV_ESCAPE_RE         |                     \
8288                 PERL_PV_PRETTY_EXACTSIZE                        \
8289             )                                                   \
8290         );                                                      \
8291     } else                                                      \
8292         Perl_re_printf( aTHX_ "%16s","");                                   \
8293                                                                 \
8294     if (SIZE_ONLY)                                              \
8295        num = RExC_size + 1;                                     \
8296     else                                                        \
8297        num=REG_NODE_NUM(RExC_emit);                             \
8298     if (RExC_lastnum!=num)                                      \
8299        Perl_re_printf( aTHX_ "|%4d",num);                                   \
8300     else                                                        \
8301        Perl_re_printf( aTHX_ "|%4s","");                                    \
8302     Perl_re_printf( aTHX_ "|%*s%-4s",                                       \
8303         (int)((depth*2)), "",                                   \
8304         (funcname)                                              \
8305     );                                                          \
8306     RExC_lastnum=num;                                           \
8307     RExC_lastparse=RExC_parse;                                  \
8308 })
8309 
8310 
8311 
8312 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
8313     DEBUG_PARSE_MSG((funcname));                            \
8314     Perl_re_printf( aTHX_ "%4s","\n");                                  \
8315 })
8316 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({\
8317     DEBUG_PARSE_MSG((funcname));                            \
8318     Perl_re_printf( aTHX_ fmt "\n",args);                               \
8319 })
8320 
8321 /* This section of code defines the inversion list object and its methods.  The
8322  * interfaces are highly subject to change, so as much as possible is static to
8323  * this file.  An inversion list is here implemented as a malloc'd C UV array
8324  * as an SVt_INVLIST scalar.
8325  *
8326  * An inversion list for Unicode is an array of code points, sorted by ordinal
8327  * number.  The zeroth element is the first code point in the list.  The 1th
8328  * element is the first element beyond that not in the list.  In other words,
8329  * the first range is
8330  *  invlist[0]..(invlist[1]-1)
8331  * The other ranges follow.  Thus every element whose index is divisible by two
8332  * marks the beginning of a range that is in the list, and every element not
8333  * divisible by two marks the beginning of a range not in the list.  A single
8334  * element inversion list that contains the single code point N generally
8335  * consists of two elements
8336  *  invlist[0] == N
8337  *  invlist[1] == N+1
8338  * (The exception is when N is the highest representable value on the
8339  * machine, in which case the list containing just it would be a single
8340  * element, itself.  By extension, if the last range in the list extends to
8341  * infinity, then the first element of that range will be in the inversion list
8342  * at a position that is divisible by two, and is the final element in the
8343  * list.)
8344  * Taking the complement (inverting) an inversion list is quite simple, if the
8345  * first element is 0, remove it; otherwise add a 0 element at the beginning.
8346  * This implementation reserves an element at the beginning of each inversion
8347  * list to always contain 0; there is an additional flag in the header which
8348  * indicates if the list begins at the 0, or is offset to begin at the next
8349  * element.
8350  *
8351  * More about inversion lists can be found in "Unicode Demystified"
8352  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
8353  * More will be coming when functionality is added later.
8354  *
8355  * The inversion list data structure is currently implemented as an SV pointing
8356  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
8357  * array of UV whose memory management is automatically handled by the existing
8358  * facilities for SV's.
8359  *
8360  * Some of the methods should always be private to the implementation, and some
8361  * should eventually be made public */
8362 
8363 /* The header definitions are in F<invlist_inline.h> */
8364 
8365 PERL_STATIC_INLINE UV*
8366 S__invlist_array_init(SV* const invlist, const bool will_have_0)
8367 {
8368     /* Returns a pointer to the first element in the inversion list's array.
8369      * This is called upon initialization of an inversion list.  Where the
8370      * array begins depends on whether the list has the code point U+0000 in it
8371      * or not.  The other parameter tells it whether the code that follows this
8372      * call is about to put a 0 in the inversion list or not.  The first
8373      * element is either the element reserved for 0, if TRUE, or the element
8374      * after it, if FALSE */
8375 
8376     bool* offset = get_invlist_offset_addr(invlist);
8377     UV* zero_addr = (UV *) SvPVX(invlist);
8378 
8379     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
8380 
8381     /* Must be empty */
8382     assert(! _invlist_len(invlist));
8383 
8384     *zero_addr = 0;
8385 
8386     /* 1^1 = 0; 1^0 = 1 */
8387     *offset = 1 ^ will_have_0;
8388     return zero_addr + *offset;
8389 }
8390 
8391 PERL_STATIC_INLINE void
8392 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
8393 {
8394     /* Sets the current number of elements stored in the inversion list.
8395      * Updates SvCUR correspondingly */
8396     PERL_UNUSED_CONTEXT;
8397     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
8398 
8399     assert(SvTYPE(invlist) == SVt_INVLIST);
8400 
8401     SvCUR_set(invlist,
8402               (len == 0)
8403                ? 0
8404                : TO_INTERNAL_SIZE(len + offset));
8405     assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
8406 }
8407 
8408 #ifndef PERL_IN_XSUB_RE
8409 
8410 STATIC void
8411 S_invlist_replace_list_destroys_src(pTHX_ SV * dest, SV * src)
8412 {
8413     /* Replaces the inversion list in 'src' with the one in 'dest'.  It steals
8414      * the list from 'src', so 'src' is made to have a NULL list.  This is
8415      * similar to what SvSetMagicSV() would do, if it were implemented on
8416      * inversion lists, though this routine avoids a copy */
8417 
8418     const UV src_len          = _invlist_len(src);
8419     const bool src_offset     = *get_invlist_offset_addr(src);
8420     const STRLEN src_byte_len = SvLEN(src);
8421     char * array              = SvPVX(src);
8422 
8423     const int oldtainted = TAINT_get;
8424 
8425     PERL_ARGS_ASSERT_INVLIST_REPLACE_LIST_DESTROYS_SRC;
8426 
8427     assert(SvTYPE(src) == SVt_INVLIST);
8428     assert(SvTYPE(dest) == SVt_INVLIST);
8429     assert(! invlist_is_iterating(src));
8430     assert(SvCUR(src) == 0 || SvCUR(src) < SvLEN(src));
8431 
8432     /* Make sure it ends in the right place with a NUL, as our inversion list
8433      * manipulations aren't careful to keep this true, but sv_usepvn_flags()
8434      * asserts it */
8435     array[src_byte_len - 1] = '\0';
8436 
8437     TAINT_NOT;      /* Otherwise it breaks */
8438     sv_usepvn_flags(dest,
8439                     (char *) array,
8440                     src_byte_len - 1,
8441 
8442                     /* This flag is documented to cause a copy to be avoided */
8443                     SV_HAS_TRAILING_NUL);
8444     TAINT_set(oldtainted);
8445     SvPV_set(src, 0);
8446     SvLEN_set(src, 0);
8447     SvCUR_set(src, 0);
8448 
8449     /* Finish up copying over the other fields in an inversion list */
8450     *get_invlist_offset_addr(dest) = src_offset;
8451     invlist_set_len(dest, src_len, src_offset);
8452     *get_invlist_previous_index_addr(dest) = 0;
8453     invlist_iterfinish(dest);
8454 }
8455 
8456 PERL_STATIC_INLINE IV*
8457 S_get_invlist_previous_index_addr(SV* invlist)
8458 {
8459     /* Return the address of the IV that is reserved to hold the cached index
8460      * */
8461     PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
8462 
8463     assert(SvTYPE(invlist) == SVt_INVLIST);
8464 
8465     return &(((XINVLIST*) SvANY(invlist))->prev_index);
8466 }
8467 
8468 PERL_STATIC_INLINE IV
8469 S_invlist_previous_index(SV* const invlist)
8470 {
8471     /* Returns cached index of previous search */
8472 
8473     PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
8474 
8475     return *get_invlist_previous_index_addr(invlist);
8476 }
8477 
8478 PERL_STATIC_INLINE void
8479 S_invlist_set_previous_index(SV* const invlist, const IV index)
8480 {
8481     /* Caches <index> for later retrieval */
8482 
8483     PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
8484 
8485     assert(index == 0 || index < (int) _invlist_len(invlist));
8486 
8487     *get_invlist_previous_index_addr(invlist) = index;
8488 }
8489 
8490 PERL_STATIC_INLINE void
8491 S_invlist_trim(SV* invlist)
8492 {
8493     /* Free the not currently-being-used space in an inversion list */
8494 
8495     /* But don't free up the space needed for the 0 UV that is always at the
8496      * beginning of the list, nor the trailing NUL */
8497     const UV min_size = TO_INTERNAL_SIZE(1) + 1;
8498 
8499     PERL_ARGS_ASSERT_INVLIST_TRIM;
8500 
8501     assert(SvTYPE(invlist) == SVt_INVLIST);
8502 
8503     SvPV_renew(invlist, MAX(min_size, SvCUR(invlist) + 1));
8504 }
8505 
8506 PERL_STATIC_INLINE void
8507 S_invlist_clear(pTHX_ SV* invlist)    /* Empty the inversion list */
8508 {
8509     PERL_ARGS_ASSERT_INVLIST_CLEAR;
8510 
8511     assert(SvTYPE(invlist) == SVt_INVLIST);
8512 
8513     invlist_set_len(invlist, 0, 0);
8514     invlist_trim(invlist);
8515 }
8516 
8517 #endif /* ifndef PERL_IN_XSUB_RE */
8518 
8519 PERL_STATIC_INLINE bool
8520 S_invlist_is_iterating(SV* const invlist)
8521 {
8522     PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8523 
8524     return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
8525 }
8526 
8527 PERL_STATIC_INLINE UV
8528 S_invlist_max(SV* const invlist)
8529 {
8530     /* Returns the maximum number of elements storable in the inversion list's
8531      * array, without having to realloc() */
8532 
8533     PERL_ARGS_ASSERT_INVLIST_MAX;
8534 
8535     assert(SvTYPE(invlist) == SVt_INVLIST);
8536 
8537     /* Assumes worst case, in which the 0 element is not counted in the
8538      * inversion list, so subtracts 1 for that */
8539     return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
8540            ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
8541            : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
8542 }
8543 
8544 #ifndef PERL_IN_XSUB_RE
8545 SV*
8546 Perl__new_invlist(pTHX_ IV initial_size)
8547 {
8548 
8549     /* Return a pointer to a newly constructed inversion list, with enough
8550      * space to store 'initial_size' elements.  If that number is negative, a
8551      * system default is used instead */
8552 
8553     SV* new_list;
8554 
8555     if (initial_size < 0) {
8556 	initial_size = 10;
8557     }
8558 
8559     /* Allocate the initial space */
8560     new_list = newSV_type(SVt_INVLIST);
8561 
8562     /* First 1 is in case the zero element isn't in the list; second 1 is for
8563      * trailing NUL */
8564     SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
8565     invlist_set_len(new_list, 0, 0);
8566 
8567     /* Force iterinit() to be used to get iteration to work */
8568     *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
8569 
8570     *get_invlist_previous_index_addr(new_list) = 0;
8571 
8572     return new_list;
8573 }
8574 
8575 SV*
8576 Perl__new_invlist_C_array(pTHX_ const UV* const list)
8577 {
8578     /* Return a pointer to a newly constructed inversion list, initialized to
8579      * point to <list>, which has to be in the exact correct inversion list
8580      * form, including internal fields.  Thus this is a dangerous routine that
8581      * should not be used in the wrong hands.  The passed in 'list' contains
8582      * several header fields at the beginning that are not part of the
8583      * inversion list body proper */
8584 
8585     const STRLEN length = (STRLEN) list[0];
8586     const UV version_id =          list[1];
8587     const bool offset   =    cBOOL(list[2]);
8588 #define HEADER_LENGTH 3
8589     /* If any of the above changes in any way, you must change HEADER_LENGTH
8590      * (if appropriate) and regenerate INVLIST_VERSION_ID by running
8591      *      perl -E 'say int(rand 2**31-1)'
8592      */
8593 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
8594                                         data structure type, so that one being
8595                                         passed in can be validated to be an
8596                                         inversion list of the correct vintage.
8597                                        */
8598 
8599     SV* invlist = newSV_type(SVt_INVLIST);
8600 
8601     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
8602 
8603     if (version_id != INVLIST_VERSION_ID) {
8604         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
8605     }
8606 
8607     /* The generated array passed in includes header elements that aren't part
8608      * of the list proper, so start it just after them */
8609     SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
8610 
8611     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
8612 			       shouldn't touch it */
8613 
8614     *(get_invlist_offset_addr(invlist)) = offset;
8615 
8616     /* The 'length' passed to us is the physical number of elements in the
8617      * inversion list.  But if there is an offset the logical number is one
8618      * less than that */
8619     invlist_set_len(invlist, length  - offset, offset);
8620 
8621     invlist_set_previous_index(invlist, 0);
8622 
8623     /* Initialize the iteration pointer. */
8624     invlist_iterfinish(invlist);
8625 
8626     SvREADONLY_on(invlist);
8627 
8628     return invlist;
8629 }
8630 #endif /* ifndef PERL_IN_XSUB_RE */
8631 
8632 STATIC void
8633 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
8634 {
8635     /* Grow the maximum size of an inversion list */
8636 
8637     PERL_ARGS_ASSERT_INVLIST_EXTEND;
8638 
8639     assert(SvTYPE(invlist) == SVt_INVLIST);
8640 
8641     /* Add one to account for the zero element at the beginning which may not
8642      * be counted by the calling parameters */
8643     SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
8644 }
8645 
8646 STATIC void
8647 S__append_range_to_invlist(pTHX_ SV* const invlist,
8648                                  const UV start, const UV end)
8649 {
8650    /* Subject to change or removal.  Append the range from 'start' to 'end' at
8651     * the end of the inversion list.  The range must be above any existing
8652     * ones. */
8653 
8654     UV* array;
8655     UV max = invlist_max(invlist);
8656     UV len = _invlist_len(invlist);
8657     bool offset;
8658 
8659     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
8660 
8661     if (len == 0) { /* Empty lists must be initialized */
8662         offset = start != 0;
8663         array = _invlist_array_init(invlist, ! offset);
8664     }
8665     else {
8666 	/* Here, the existing list is non-empty. The current max entry in the
8667 	 * list is generally the first value not in the set, except when the
8668 	 * set extends to the end of permissible values, in which case it is
8669 	 * the first entry in that final set, and so this call is an attempt to
8670 	 * append out-of-order */
8671 
8672 	UV final_element = len - 1;
8673 	array = invlist_array(invlist);
8674 	if (array[final_element] > start
8675 	    || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
8676 	{
8677 	    Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
8678 		     array[final_element], start,
8679 		     ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
8680 	}
8681 
8682 	/* Here, it is a legal append.  If the new range begins with the first
8683 	 * value not in the set, it is extending the set, so the new first
8684 	 * value not in the set is one greater than the newly extended range.
8685 	 * */
8686         offset = *get_invlist_offset_addr(invlist);
8687 	if (array[final_element] == start) {
8688 	    if (end != UV_MAX) {
8689 		array[final_element] = end + 1;
8690 	    }
8691 	    else {
8692 		/* But if the end is the maximum representable on the machine,
8693 		 * just let the range that this would extend to have no end */
8694 		invlist_set_len(invlist, len - 1, offset);
8695 	    }
8696 	    return;
8697 	}
8698     }
8699 
8700     /* Here the new range doesn't extend any existing set.  Add it */
8701 
8702     len += 2;	/* Includes an element each for the start and end of range */
8703 
8704     /* If wll overflow the existing space, extend, which may cause the array to
8705      * be moved */
8706     if (max < len) {
8707 	invlist_extend(invlist, len);
8708 
8709         /* Have to set len here to avoid assert failure in invlist_array() */
8710         invlist_set_len(invlist, len, offset);
8711 
8712 	array = invlist_array(invlist);
8713     }
8714     else {
8715 	invlist_set_len(invlist, len, offset);
8716     }
8717 
8718     /* The next item on the list starts the range, the one after that is
8719      * one past the new range.  */
8720     array[len - 2] = start;
8721     if (end != UV_MAX) {
8722 	array[len - 1] = end + 1;
8723     }
8724     else {
8725 	/* But if the end is the maximum representable on the machine, just let
8726 	 * the range have no end */
8727 	invlist_set_len(invlist, len - 1, offset);
8728     }
8729 }
8730 
8731 #ifndef PERL_IN_XSUB_RE
8732 
8733 IV
8734 Perl__invlist_search(SV* const invlist, const UV cp)
8735 {
8736     /* Searches the inversion list for the entry that contains the input code
8737      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
8738      * return value is the index into the list's array of the range that
8739      * contains <cp>, that is, 'i' such that
8740      *	array[i] <= cp < array[i+1]
8741      */
8742 
8743     IV low = 0;
8744     IV mid;
8745     IV high = _invlist_len(invlist);
8746     const IV highest_element = high - 1;
8747     const UV* array;
8748 
8749     PERL_ARGS_ASSERT__INVLIST_SEARCH;
8750 
8751     /* If list is empty, return failure. */
8752     if (high == 0) {
8753 	return -1;
8754     }
8755 
8756     /* (We can't get the array unless we know the list is non-empty) */
8757     array = invlist_array(invlist);
8758 
8759     mid = invlist_previous_index(invlist);
8760     assert(mid >=0);
8761     if (mid > highest_element) {
8762         mid = highest_element;
8763     }
8764 
8765     /* <mid> contains the cache of the result of the previous call to this
8766      * function (0 the first time).  See if this call is for the same result,
8767      * or if it is for mid-1.  This is under the theory that calls to this
8768      * function will often be for related code points that are near each other.
8769      * And benchmarks show that caching gives better results.  We also test
8770      * here if the code point is within the bounds of the list.  These tests
8771      * replace others that would have had to be made anyway to make sure that
8772      * the array bounds were not exceeded, and these give us extra information
8773      * at the same time */
8774     if (cp >= array[mid]) {
8775         if (cp >= array[highest_element]) {
8776             return highest_element;
8777         }
8778 
8779         /* Here, array[mid] <= cp < array[highest_element].  This means that
8780          * the final element is not the answer, so can exclude it; it also
8781          * means that <mid> is not the final element, so can refer to 'mid + 1'
8782          * safely */
8783         if (cp < array[mid + 1]) {
8784             return mid;
8785         }
8786         high--;
8787         low = mid + 1;
8788     }
8789     else { /* cp < aray[mid] */
8790         if (cp < array[0]) { /* Fail if outside the array */
8791             return -1;
8792         }
8793         high = mid;
8794         if (cp >= array[mid - 1]) {
8795             goto found_entry;
8796         }
8797     }
8798 
8799     /* Binary search.  What we are looking for is <i> such that
8800      *	array[i] <= cp < array[i+1]
8801      * The loop below converges on the i+1.  Note that there may not be an
8802      * (i+1)th element in the array, and things work nonetheless */
8803     while (low < high) {
8804 	mid = (low + high) / 2;
8805         assert(mid <= highest_element);
8806 	if (array[mid] <= cp) { /* cp >= array[mid] */
8807 	    low = mid + 1;
8808 
8809 	    /* We could do this extra test to exit the loop early.
8810 	    if (cp < array[low]) {
8811 		return mid;
8812 	    }
8813 	    */
8814 	}
8815 	else { /* cp < array[mid] */
8816 	    high = mid;
8817 	}
8818     }
8819 
8820   found_entry:
8821     high--;
8822     invlist_set_previous_index(invlist, high);
8823     return high;
8824 }
8825 
8826 void
8827 Perl__invlist_populate_swatch(SV* const invlist,
8828                               const UV start, const UV end, U8* swatch)
8829 {
8830     /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
8831      * but is used when the swash has an inversion list.  This makes this much
8832      * faster, as it uses a binary search instead of a linear one.  This is
8833      * intimately tied to that function, and perhaps should be in utf8.c,
8834      * except it is intimately tied to inversion lists as well.  It assumes
8835      * that <swatch> is all 0's on input */
8836 
8837     UV current = start;
8838     const IV len = _invlist_len(invlist);
8839     IV i;
8840     const UV * array;
8841 
8842     PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
8843 
8844     if (len == 0) { /* Empty inversion list */
8845         return;
8846     }
8847 
8848     array = invlist_array(invlist);
8849 
8850     /* Find which element it is */
8851     i = _invlist_search(invlist, start);
8852 
8853     /* We populate from <start> to <end> */
8854     while (current < end) {
8855         UV upper;
8856 
8857 	/* The inversion list gives the results for every possible code point
8858 	 * after the first one in the list.  Only those ranges whose index is
8859 	 * even are ones that the inversion list matches.  For the odd ones,
8860 	 * and if the initial code point is not in the list, we have to skip
8861 	 * forward to the next element */
8862         if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
8863             i++;
8864             if (i >= len) { /* Finished if beyond the end of the array */
8865                 return;
8866             }
8867             current = array[i];
8868 	    if (current >= end) {   /* Finished if beyond the end of what we
8869 				       are populating */
8870                 if (LIKELY(end < UV_MAX)) {
8871                     return;
8872                 }
8873 
8874                 /* We get here when the upper bound is the maximum
8875                  * representable on the machine, and we are looking for just
8876                  * that code point.  Have to special case it */
8877                 i = len;
8878                 goto join_end_of_list;
8879             }
8880         }
8881         assert(current >= start);
8882 
8883 	/* The current range ends one below the next one, except don't go past
8884 	 * <end> */
8885         i++;
8886         upper = (i < len && array[i] < end) ? array[i] : end;
8887 
8888 	/* Here we are in a range that matches.  Populate a bit in the 3-bit U8
8889 	 * for each code point in it */
8890         for (; current < upper; current++) {
8891             const STRLEN offset = (STRLEN)(current - start);
8892             swatch[offset >> 3] |= 1 << (offset & 7);
8893         }
8894 
8895       join_end_of_list:
8896 
8897 	/* Quit if at the end of the list */
8898         if (i >= len) {
8899 
8900 	    /* But first, have to deal with the highest possible code point on
8901 	     * the platform.  The previous code assumes that <end> is one
8902 	     * beyond where we want to populate, but that is impossible at the
8903 	     * platform's infinity, so have to handle it specially */
8904             if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
8905 	    {
8906                 const STRLEN offset = (STRLEN)(end - start);
8907                 swatch[offset >> 3] |= 1 << (offset & 7);
8908             }
8909             return;
8910         }
8911 
8912 	/* Advance to the next range, which will be for code points not in the
8913 	 * inversion list */
8914         current = array[i];
8915     }
8916 
8917     return;
8918 }
8919 
8920 void
8921 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8922                                          const bool complement_b, SV** output)
8923 {
8924     /* Take the union of two inversion lists and point <output> to it.  *output
8925      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8926      * the reference count to that list will be decremented if not already a
8927      * temporary (mortal); otherwise just its contents will be modified to be
8928      * the union.  The first list, <a>, may be NULL, in which case a copy of
8929      * the second list is returned.  If <complement_b> is TRUE, the union is
8930      * taken of the complement (inversion) of <b> instead of b itself.
8931      *
8932      * The basis for this comes from "Unicode Demystified" Chapter 13 by
8933      * Richard Gillam, published by Addison-Wesley, and explained at some
8934      * length there.  The preface says to incorporate its examples into your
8935      * code at your own risk.
8936      *
8937      * The algorithm is like a merge sort.
8938      *
8939      * XXX A potential performance improvement is to keep track as we go along
8940      * if only one of the inputs contributes to the result, meaning the other
8941      * is a subset of that one.  In that case, we can skip the final copy and
8942      * return the larger of the input lists, but then outside code might need
8943      * to keep track of whether to free the input list or not */
8944 
8945     const UV* array_a;    /* a's array */
8946     const UV* array_b;
8947     UV len_a;	    /* length of a's array */
8948     UV len_b;
8949 
8950     SV* u;			/* the resulting union */
8951     UV* array_u;
8952     UV len_u = 0;
8953 
8954     UV i_a = 0;		    /* current index into a's array */
8955     UV i_b = 0;
8956     UV i_u = 0;
8957 
8958     /* running count, as explained in the algorithm source book; items are
8959      * stopped accumulating and are output when the count changes to/from 0.
8960      * The count is incremented when we start a range that's in the set, and
8961      * decremented when we start a range that's not in the set.  So its range
8962      * is 0 to 2.  Only when the count is zero is something not in the set.
8963      */
8964     UV count = 0;
8965 
8966     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
8967     assert(a != b);
8968 
8969     len_b = _invlist_len(b);
8970     if (len_b == 0) {
8971 
8972         /* Here, 'b' is empty.  If the output is the complement of 'b', the
8973          * union is all possible code points, and we need not even look at 'a'.
8974          * It's easiest to create a new inversion list that matches everything.
8975          * */
8976         if (complement_b) {
8977             SV* everything = _new_invlist(1);
8978             _append_range_to_invlist(everything, 0, UV_MAX);
8979 
8980             /* If the output didn't exist, just point it at the new list */
8981             if (*output == NULL) {
8982                 *output = everything;
8983                 return;
8984             }
8985 
8986             /* Otherwise, replace its contents with the new list */
8987             invlist_replace_list_destroys_src(*output, everything);
8988             SvREFCNT_dec_NN(everything);
8989             return;
8990         }
8991 
8992         /* Here, we don't want the complement of 'b', and since it is empty,
8993          * the union will come entirely from 'a'.  If 'a' is NULL or empty, the
8994          * output will be empty */
8995 
8996         if (a == NULL) {
8997             *output = _new_invlist(0);
8998             return;
8999         }
9000 
9001         if (_invlist_len(a) == 0) {
9002             invlist_clear(*output);
9003             return;
9004         }
9005 
9006         /* Here, 'a' is not empty, and entirely determines the union.  If the
9007          * output is not to overwrite 'b', we can just return 'a'. */
9008         if (*output != b) {
9009 
9010             /* If the output is to overwrite 'a', we have a no-op, as it's
9011              * already in 'a' */
9012             if (*output == a) {
9013                 return;
9014             }
9015 
9016             /* But otherwise we have to copy 'a' to the output */
9017             *output = invlist_clone(a);
9018             return;
9019         }
9020 
9021         /* Here, 'b' is to be overwritten by the output, which will be 'a' */
9022         u = invlist_clone(a);
9023         invlist_replace_list_destroys_src(*output, u);
9024         SvREFCNT_dec_NN(u);
9025 
9026 	return;
9027     }
9028 
9029     if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
9030 
9031         /* Here, 'a' is empty (and b is not).  That means the union will come
9032          * entirely from 'b'.  If the output is not to overwrite 'a', we can
9033          * just return what's in 'b'.  */
9034         if (*output != a) {
9035 
9036             /* If the output is to overwrite 'b', it's already in 'b', but
9037              * otherwise we have to copy 'b' to the output */
9038             if (*output != b) {
9039                 *output = invlist_clone(b);
9040             }
9041 
9042             /* And if the output is to be the inversion of 'b', do that */
9043             if (complement_b) {
9044                 _invlist_invert(*output);
9045             }
9046 
9047             return;
9048         }
9049 
9050         /* Here, 'a', which is empty or even NULL, is to be overwritten by the
9051          * output, which will either be 'b' or the complement of 'b' */
9052 
9053         if (a == NULL) {
9054             *output = invlist_clone(b);
9055         }
9056         else {
9057             u = invlist_clone(b);
9058             invlist_replace_list_destroys_src(*output, u);
9059             SvREFCNT_dec_NN(u);
9060 	}
9061 
9062         if (complement_b) {
9063             _invlist_invert(*output);
9064         }
9065 
9066 	return;
9067     }
9068 
9069     /* Here both lists exist and are non-empty */
9070     array_a = invlist_array(a);
9071     array_b = invlist_array(b);
9072 
9073     /* If are to take the union of 'a' with the complement of b, set it
9074      * up so are looking at b's complement. */
9075     if (complement_b) {
9076 
9077 	/* To complement, we invert: if the first element is 0, remove it.  To
9078 	 * do this, we just pretend the array starts one later */
9079         if (array_b[0] == 0) {
9080             array_b++;
9081             len_b--;
9082         }
9083         else {
9084 
9085             /* But if the first element is not zero, we pretend the list starts
9086              * at the 0 that is always stored immediately before the array. */
9087             array_b--;
9088             len_b++;
9089         }
9090     }
9091 
9092     /* Size the union for the worst case: that the sets are completely
9093      * disjoint */
9094     u = _new_invlist(len_a + len_b);
9095 
9096     /* Will contain U+0000 if either component does */
9097     array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
9098 				      || (len_b > 0 && array_b[0] == 0));
9099 
9100     /* Go through each list item by item, stopping when exhausted one of
9101      * them */
9102     while (i_a < len_a && i_b < len_b) {
9103 	UV cp;	    /* The element to potentially add to the union's array */
9104 	bool cp_in_set;   /* is it in the the input list's set or not */
9105 
9106 	/* We need to take one or the other of the two inputs for the union.
9107 	 * Since we are merging two sorted lists, we take the smaller of the
9108 	 * next items.  In case of a tie, we take the one that is in its set
9109 	 * first.  If we took one not in the set first, it would decrement the
9110 	 * count, possibly to 0 which would cause it to be output as ending the
9111 	 * range, and the next time through we would take the same number, and
9112 	 * output it again as beginning the next range.  By doing it the
9113 	 * opposite way, there is no possibility that the count will be
9114 	 * momentarily decremented to 0, and thus the two adjoining ranges will
9115 	 * be seamlessly merged.  (In a tie and both are in the set or both not
9116 	 * in the set, it doesn't matter which we take first.) */
9117 	if (array_a[i_a] < array_b[i_b]
9118 	    || (array_a[i_a] == array_b[i_b]
9119 		&& ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9120 	{
9121 	    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9122 	    cp= array_a[i_a++];
9123 	}
9124 	else {
9125 	    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
9126 	    cp = array_b[i_b++];
9127 	}
9128 
9129 	/* Here, have chosen which of the two inputs to look at.  Only output
9130 	 * if the running count changes to/from 0, which marks the
9131 	 * beginning/end of a range that's in the set */
9132 	if (cp_in_set) {
9133 	    if (count == 0) {
9134 		array_u[i_u++] = cp;
9135 	    }
9136 	    count++;
9137 	}
9138 	else {
9139 	    count--;
9140 	    if (count == 0) {
9141 		array_u[i_u++] = cp;
9142 	    }
9143 	}
9144     }
9145 
9146     /* Here, we are finished going through at least one of the lists, which
9147      * means there is something remaining in at most one.  We check if the list
9148      * that hasn't been exhausted is positioned such that we are in the middle
9149      * of a range in its set or not.  (i_a and i_b point to the element beyond
9150      * the one we care about.) If in the set, we decrement 'count'; if 0, there
9151      * is potentially more to output.
9152      * There are four cases:
9153      *	1) Both weren't in their sets, count is 0, and remains 0.  What's left
9154      *	   in the union is entirely from the non-exhausted set.
9155      *	2) Both were in their sets, count is 2.  Nothing further should
9156      *	   be output, as everything that remains will be in the exhausted
9157      *	   list's set, hence in the union; decrementing to 1 but not 0 insures
9158      *	   that
9159      *	3) the exhausted was in its set, non-exhausted isn't, count is 1.
9160      *	   Nothing further should be output because the union includes
9161      *	   everything from the exhausted set.  Not decrementing ensures that.
9162      *	4) the exhausted wasn't in its set, non-exhausted is, count is 1;
9163      *	   decrementing to 0 insures that we look at the remainder of the
9164      *	   non-exhausted set */
9165     if (   (i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
9166 	|| (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
9167     {
9168 	count--;
9169     }
9170 
9171     /* The final length is what we've output so far, plus what else is about to
9172      * be output.  (If 'count' is non-zero, then the input list we exhausted
9173      * has everything remaining up to the machine's limit in its set, and hence
9174      * in the union, so there will be no further output. */
9175     len_u = i_u;
9176     if (count == 0) {
9177 	/* At most one of the subexpressions will be non-zero */
9178 	len_u += (len_a - i_a) + (len_b - i_b);
9179     }
9180 
9181     /* Set the result to the final length, which can change the pointer to
9182      * array_u, so re-find it.  (Note that it is unlikely that this will
9183      * change, as we are shrinking the space, not enlarging it) */
9184     if (len_u != _invlist_len(u)) {
9185 	invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
9186 	invlist_trim(u);
9187 	array_u = invlist_array(u);
9188     }
9189 
9190     /* When 'count' is 0, the list that was exhausted (if one was shorter than
9191      * the other) ended with everything above it not in its set.  That means
9192      * that the remaining part of the union is precisely the same as the
9193      * non-exhausted list, so can just copy it unchanged.  (If both lists were
9194      * exhausted at the same time, then the operations below will be both 0.)
9195      */
9196     if (count == 0) {
9197 	IV copy_count; /* At most one will have a non-zero copy count */
9198 	if ((copy_count = len_a - i_a) > 0) {
9199 	    Copy(array_a + i_a, array_u + i_u, copy_count, UV);
9200 	}
9201 	else if ((copy_count = len_b - i_b) > 0) {
9202 	    Copy(array_b + i_b, array_u + i_u, copy_count, UV);
9203 	}
9204     }
9205 
9206     /* If the output is not to overwrite either of the inputs, just return the
9207      * calculated union */
9208     if (a != *output && b != *output) {
9209         *output = u;
9210     }
9211     else {
9212         /*  Here, the output is to be the same as one of the input scalars,
9213          *  hence replacing it.  The simple thing to do is to free the input
9214          *  scalar, making it instead be the output one.  But experience has
9215          *  shown [perl #127392] that if the input is a mortal, we can get a
9216          *  huge build-up of these during regex compilation before they get
9217          *  freed.  So for that case, replace just the input's interior with
9218          *  the output's, and then free the output */
9219 
9220         assert(! invlist_is_iterating(*output));
9221 
9222         if (! SvTEMP(*output)) {
9223             SvREFCNT_dec_NN(*output);
9224             *output = u;
9225         }
9226         else {
9227             invlist_replace_list_destroys_src(*output, u);
9228             SvREFCNT_dec_NN(u);
9229         }
9230     }
9231 
9232     return;
9233 }
9234 
9235 void
9236 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
9237                                                const bool complement_b, SV** i)
9238 {
9239     /* Take the intersection of two inversion lists and point <i> to it.  *i
9240      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
9241      * the reference count to that list will be decremented if not already a
9242      * temporary (mortal); otherwise just its contents will be modified to be
9243      * the intersection.  The first list, <a>, may be NULL, in which case an
9244      * empty list is returned.  If <complement_b> is TRUE, the result will be
9245      * the intersection of <a> and the complement (or inversion) of <b> instead
9246      * of <b> directly.
9247      *
9248      * The basis for this comes from "Unicode Demystified" Chapter 13 by
9249      * Richard Gillam, published by Addison-Wesley, and explained at some
9250      * length there.  The preface says to incorporate its examples into your
9251      * code at your own risk.  In fact, it had bugs
9252      *
9253      * The algorithm is like a merge sort, and is essentially the same as the
9254      * union above
9255      */
9256 
9257     const UV* array_a;		/* a's array */
9258     const UV* array_b;
9259     UV len_a;	/* length of a's array */
9260     UV len_b;
9261 
9262     SV* r;		     /* the resulting intersection */
9263     UV* array_r;
9264     UV len_r = 0;
9265 
9266     UV i_a = 0;		    /* current index into a's array */
9267     UV i_b = 0;
9268     UV i_r = 0;
9269 
9270     /* running count, as explained in the algorithm source book; items are
9271      * stopped accumulating and are output when the count changes to/from 2.
9272      * The count is incremented when we start a range that's in the set, and
9273      * decremented when we start a range that's not in the set.  So its range
9274      * is 0 to 2.  Only when the count is 2 is something in the intersection.
9275      */
9276     UV count = 0;
9277 
9278     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
9279     assert(a != b);
9280 
9281     /* Special case if either one is empty */
9282     len_a = (a == NULL) ? 0 : _invlist_len(a);
9283     if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
9284         if (len_a != 0 && complement_b) {
9285 
9286             /* Here, 'a' is not empty, therefore from the enclosing 'if', 'b'
9287              * must be empty.  Here, also we are using 'b's complement, which
9288              * hence must be every possible code point.  Thus the intersection
9289              * is simply 'a'. */
9290 
9291             if (*i == a) {  /* No-op */
9292                 return;
9293             }
9294 
9295             /* If not overwriting either input, just make a copy of 'a' */
9296             if (*i != b) {
9297                 *i = invlist_clone(a);
9298                 return;
9299             }
9300 
9301             /* Here we are overwriting 'b' with 'a's contents */
9302             r = invlist_clone(a);
9303             invlist_replace_list_destroys_src(*i, r);
9304             SvREFCNT_dec_NN(r);
9305             return;
9306         }
9307 
9308         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
9309          * intersection must be empty */
9310         if (*i == NULL) {
9311             *i = _new_invlist(0);
9312             return;
9313         }
9314 
9315         invlist_clear(*i);
9316 	return;
9317     }
9318 
9319     /* Here both lists exist and are non-empty */
9320     array_a = invlist_array(a);
9321     array_b = invlist_array(b);
9322 
9323     /* If are to take the intersection of 'a' with the complement of b, set it
9324      * up so are looking at b's complement. */
9325     if (complement_b) {
9326 
9327 	/* To complement, we invert: if the first element is 0, remove it.  To
9328 	 * do this, we just pretend the array starts one later */
9329         if (array_b[0] == 0) {
9330             array_b++;
9331             len_b--;
9332         }
9333         else {
9334 
9335             /* But if the first element is not zero, we pretend the list starts
9336              * at the 0 that is always stored immediately before the array. */
9337             array_b--;
9338             len_b++;
9339         }
9340     }
9341 
9342     /* Size the intersection for the worst case: that the intersection ends up
9343      * fragmenting everything to be completely disjoint */
9344     r= _new_invlist(len_a + len_b);
9345 
9346     /* Will contain U+0000 iff both components do */
9347     array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
9348 				     && len_b > 0 && array_b[0] == 0);
9349 
9350     /* Go through each list item by item, stopping when exhausted one of
9351      * them */
9352     while (i_a < len_a && i_b < len_b) {
9353 	UV cp;	    /* The element to potentially add to the intersection's
9354 		       array */
9355 	bool cp_in_set;	/* Is it in the input list's set or not */
9356 
9357 	/* We need to take one or the other of the two inputs for the
9358 	 * intersection.  Since we are merging two sorted lists, we take the
9359 	 * smaller of the next items.  In case of a tie, we take the one that
9360 	 * is not in its set first (a difference from the union algorithm).  If
9361 	 * we took one in the set first, it would increment the count, possibly
9362 	 * to 2 which would cause it to be output as starting a range in the
9363 	 * intersection, and the next time through we would take that same
9364 	 * number, and output it again as ending the set.  By doing it the
9365 	 * opposite of this, there is no possibility that the count will be
9366 	 * momentarily incremented to 2.  (In a tie and both are in the set or
9367 	 * both not in the set, it doesn't matter which we take first.) */
9368 	if (array_a[i_a] < array_b[i_b]
9369 	    || (array_a[i_a] == array_b[i_b]
9370 		&& ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9371 	{
9372 	    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9373 	    cp= array_a[i_a++];
9374 	}
9375 	else {
9376 	    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
9377 	    cp= array_b[i_b++];
9378 	}
9379 
9380 	/* Here, have chosen which of the two inputs to look at.  Only output
9381 	 * if the running count changes to/from 2, which marks the
9382 	 * beginning/end of a range that's in the intersection */
9383 	if (cp_in_set) {
9384 	    count++;
9385 	    if (count == 2) {
9386 		array_r[i_r++] = cp;
9387 	    }
9388 	}
9389 	else {
9390 	    if (count == 2) {
9391 		array_r[i_r++] = cp;
9392 	    }
9393 	    count--;
9394 	}
9395     }
9396 
9397     /* Here, we are finished going through at least one of the lists, which
9398      * means there is something remaining in at most one.  We check if the list
9399      * that has been exhausted is positioned such that we are in the middle
9400      * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
9401      * the ones we care about.)  There are four cases:
9402      *	1) Both weren't in their sets, count is 0, and remains 0.  There's
9403      *	   nothing left in the intersection.
9404      *	2) Both were in their sets, count is 2 and perhaps is incremented to
9405      *	   above 2.  What should be output is exactly that which is in the
9406      *	   non-exhausted set, as everything it has is also in the intersection
9407      *	   set, and everything it doesn't have can't be in the intersection
9408      *	3) The exhausted was in its set, non-exhausted isn't, count is 1, and
9409      *	   gets incremented to 2.  Like the previous case, the intersection is
9410      *	   everything that remains in the non-exhausted set.
9411      *	4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
9412      *	   remains 1.  And the intersection has nothing more. */
9413     if (   (i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
9414 	|| (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
9415     {
9416 	count++;
9417     }
9418 
9419     /* The final length is what we've output so far plus what else is in the
9420      * intersection.  At most one of the subexpressions below will be non-zero
9421      * */
9422     len_r = i_r;
9423     if (count >= 2) {
9424 	len_r += (len_a - i_a) + (len_b - i_b);
9425     }
9426 
9427     /* Set the result to the final length, which can change the pointer to
9428      * array_r, so re-find it.  (Note that it is unlikely that this will
9429      * change, as we are shrinking the space, not enlarging it) */
9430     if (len_r != _invlist_len(r)) {
9431 	invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
9432 	invlist_trim(r);
9433 	array_r = invlist_array(r);
9434     }
9435 
9436     /* Finish outputting any remaining */
9437     if (count >= 2) { /* At most one will have a non-zero copy count */
9438 	IV copy_count;
9439 	if ((copy_count = len_a - i_a) > 0) {
9440 	    Copy(array_a + i_a, array_r + i_r, copy_count, UV);
9441 	}
9442 	else if ((copy_count = len_b - i_b) > 0) {
9443 	    Copy(array_b + i_b, array_r + i_r, copy_count, UV);
9444 	}
9445     }
9446 
9447     /* If the output is not to overwrite either of the inputs, just return the
9448      * calculated intersection */
9449     if (a != *i && b != *i) {
9450         *i = r;
9451     }
9452     else {
9453         /*  Here, the output is to be the same as one of the input scalars,
9454          *  hence replacing it.  The simple thing to do is to free the input
9455          *  scalar, making it instead be the output one.  But experience has
9456          *  shown [perl #127392] that if the input is a mortal, we can get a
9457          *  huge build-up of these during regex compilation before they get
9458          *  freed.  So for that case, replace just the input's interior with
9459          *  the output's, and then free the output.  A short-cut in this case
9460          *  is if the output is empty, we can just set the input to be empty */
9461 
9462         assert(! invlist_is_iterating(*i));
9463 
9464         if (! SvTEMP(*i)) {
9465             SvREFCNT_dec_NN(*i);
9466             *i = r;
9467         }
9468         else {
9469             if (len_r) {
9470                 invlist_replace_list_destroys_src(*i, r);
9471             }
9472             else {
9473                 invlist_clear(*i);
9474             }
9475             SvREFCNT_dec_NN(r);
9476         }
9477     }
9478 
9479     return;
9480 }
9481 
9482 SV*
9483 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
9484 {
9485     /* Add the range from 'start' to 'end' inclusive to the inversion list's
9486      * set.  A pointer to the inversion list is returned.  This may actually be
9487      * a new list, in which case the passed in one has been destroyed.  The
9488      * passed-in inversion list can be NULL, in which case a new one is created
9489      * with just the one range in it */
9490 
9491     SV* range_invlist;
9492     UV len;
9493 
9494     if (invlist == NULL) {
9495 	invlist = _new_invlist(2);
9496 	len = 0;
9497     }
9498     else {
9499 	len = _invlist_len(invlist);
9500     }
9501 
9502     /* If comes after the final entry actually in the list, can just append it
9503      * to the end, */
9504     if (len == 0
9505 	|| (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
9506             && start >= invlist_array(invlist)[len - 1]))
9507     {
9508 	_append_range_to_invlist(invlist, start, end);
9509 	return invlist;
9510     }
9511 
9512     /* Here, can't just append things, create and return a new inversion list
9513      * which is the union of this range and the existing inversion list.  (If
9514      * the new range is well-behaved wrt to the old one, we could just insert
9515      * it, doing a Move() down on the tail of the old one (potentially growing
9516      * it first).  But to determine that means we would have the extra
9517      * (possibly throw-away) work of first finding where the new one goes and
9518      * whether it disrupts (splits) an existing range, so it doesn't appear to
9519      * me (khw) that it's worth it) */
9520     range_invlist = _new_invlist(2);
9521     _append_range_to_invlist(range_invlist, start, end);
9522 
9523     _invlist_union(invlist, range_invlist, &invlist);
9524 
9525     /* The temporary can be freed */
9526     SvREFCNT_dec_NN(range_invlist);
9527 
9528     return invlist;
9529 }
9530 
9531 SV*
9532 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
9533                                  UV** other_elements_ptr)
9534 {
9535     /* Create and return an inversion list whose contents are to be populated
9536      * by the caller.  The caller gives the number of elements (in 'size') and
9537      * the very first element ('element0').  This function will set
9538      * '*other_elements_ptr' to an array of UVs, where the remaining elements
9539      * are to be placed.
9540      *
9541      * Obviously there is some trust involved that the caller will properly
9542      * fill in the other elements of the array.
9543      *
9544      * (The first element needs to be passed in, as the underlying code does
9545      * things differently depending on whether it is zero or non-zero) */
9546 
9547     SV* invlist = _new_invlist(size);
9548     bool offset;
9549 
9550     PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
9551 
9552     _append_range_to_invlist(invlist, element0, element0);
9553     offset = *get_invlist_offset_addr(invlist);
9554 
9555     invlist_set_len(invlist, size, offset);
9556     *other_elements_ptr = invlist_array(invlist) + 1;
9557     return invlist;
9558 }
9559 
9560 #endif
9561 
9562 PERL_STATIC_INLINE SV*
9563 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
9564     return _add_range_to_invlist(invlist, cp, cp);
9565 }
9566 
9567 #ifndef PERL_IN_XSUB_RE
9568 void
9569 Perl__invlist_invert(pTHX_ SV* const invlist)
9570 {
9571     /* Complement the input inversion list.  This adds a 0 if the list didn't
9572      * have a zero; removes it otherwise.  As described above, the data
9573      * structure is set up so that this is very efficient */
9574 
9575     PERL_ARGS_ASSERT__INVLIST_INVERT;
9576 
9577     assert(! invlist_is_iterating(invlist));
9578 
9579     /* The inverse of matching nothing is matching everything */
9580     if (_invlist_len(invlist) == 0) {
9581 	_append_range_to_invlist(invlist, 0, UV_MAX);
9582 	return;
9583     }
9584 
9585     *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
9586 }
9587 
9588 #endif
9589 
9590 PERL_STATIC_INLINE SV*
9591 S_invlist_clone(pTHX_ SV* const invlist)
9592 {
9593 
9594     /* Return a new inversion list that is a copy of the input one, which is
9595      * unchanged.  The new list will not be mortal even if the old one was. */
9596 
9597     /* Need to allocate extra space to accommodate Perl's addition of a
9598      * trailing NUL to SvPV's, since it thinks they are always strings */
9599     SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
9600     STRLEN physical_length = SvCUR(invlist);
9601     bool offset = *(get_invlist_offset_addr(invlist));
9602 
9603     PERL_ARGS_ASSERT_INVLIST_CLONE;
9604 
9605     *(get_invlist_offset_addr(new_invlist)) = offset;
9606     invlist_set_len(new_invlist, _invlist_len(invlist), offset);
9607     Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
9608 
9609     return new_invlist;
9610 }
9611 
9612 PERL_STATIC_INLINE STRLEN*
9613 S_get_invlist_iter_addr(SV* invlist)
9614 {
9615     /* Return the address of the UV that contains the current iteration
9616      * position */
9617 
9618     PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
9619 
9620     assert(SvTYPE(invlist) == SVt_INVLIST);
9621 
9622     return &(((XINVLIST*) SvANY(invlist))->iterator);
9623 }
9624 
9625 PERL_STATIC_INLINE void
9626 S_invlist_iterinit(SV* invlist)	/* Initialize iterator for invlist */
9627 {
9628     PERL_ARGS_ASSERT_INVLIST_ITERINIT;
9629 
9630     *get_invlist_iter_addr(invlist) = 0;
9631 }
9632 
9633 PERL_STATIC_INLINE void
9634 S_invlist_iterfinish(SV* invlist)
9635 {
9636     /* Terminate iterator for invlist.  This is to catch development errors.
9637      * Any iteration that is interrupted before completed should call this
9638      * function.  Functions that add code points anywhere else but to the end
9639      * of an inversion list assert that they are not in the middle of an
9640      * iteration.  If they were, the addition would make the iteration
9641      * problematical: if the iteration hadn't reached the place where things
9642      * were being added, it would be ok */
9643 
9644     PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
9645 
9646     *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
9647 }
9648 
9649 STATIC bool
9650 S_invlist_iternext(SV* invlist, UV* start, UV* end)
9651 {
9652     /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
9653      * This call sets in <*start> and <*end>, the next range in <invlist>.
9654      * Returns <TRUE> if successful and the next call will return the next
9655      * range; <FALSE> if was already at the end of the list.  If the latter,
9656      * <*start> and <*end> are unchanged, and the next call to this function
9657      * will start over at the beginning of the list */
9658 
9659     STRLEN* pos = get_invlist_iter_addr(invlist);
9660     UV len = _invlist_len(invlist);
9661     UV *array;
9662 
9663     PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
9664 
9665     if (*pos >= len) {
9666 	*pos = (STRLEN) UV_MAX;	/* Force iterinit() to be required next time */
9667 	return FALSE;
9668     }
9669 
9670     array = invlist_array(invlist);
9671 
9672     *start = array[(*pos)++];
9673 
9674     if (*pos >= len) {
9675 	*end = UV_MAX;
9676     }
9677     else {
9678 	*end = array[(*pos)++] - 1;
9679     }
9680 
9681     return TRUE;
9682 }
9683 
9684 PERL_STATIC_INLINE UV
9685 S_invlist_highest(SV* const invlist)
9686 {
9687     /* Returns the highest code point that matches an inversion list.  This API
9688      * has an ambiguity, as it returns 0 under either the highest is actually
9689      * 0, or if the list is empty.  If this distinction matters to you, check
9690      * for emptiness before calling this function */
9691 
9692     UV len = _invlist_len(invlist);
9693     UV *array;
9694 
9695     PERL_ARGS_ASSERT_INVLIST_HIGHEST;
9696 
9697     if (len == 0) {
9698 	return 0;
9699     }
9700 
9701     array = invlist_array(invlist);
9702 
9703     /* The last element in the array in the inversion list always starts a
9704      * range that goes to infinity.  That range may be for code points that are
9705      * matched in the inversion list, or it may be for ones that aren't
9706      * matched.  In the latter case, the highest code point in the set is one
9707      * less than the beginning of this range; otherwise it is the final element
9708      * of this range: infinity */
9709     return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
9710            ? UV_MAX
9711            : array[len - 1] - 1;
9712 }
9713 
9714 STATIC SV *
9715 S_invlist_contents(pTHX_ SV* const invlist, const bool traditional_style)
9716 {
9717     /* Get the contents of an inversion list into a string SV so that they can
9718      * be printed out.  If 'traditional_style' is TRUE, it uses the format
9719      * traditionally done for debug tracing; otherwise it uses a format
9720      * suitable for just copying to the output, with blanks between ranges and
9721      * a dash between range components */
9722 
9723     UV start, end;
9724     SV* output;
9725     const char intra_range_delimiter = (traditional_style ? '\t' : '-');
9726     const char inter_range_delimiter = (traditional_style ? '\n' : ' ');
9727 
9728     if (traditional_style) {
9729         output = newSVpvs("\n");
9730     }
9731     else {
9732         output = newSVpvs("");
9733     }
9734 
9735     PERL_ARGS_ASSERT_INVLIST_CONTENTS;
9736 
9737     assert(! invlist_is_iterating(invlist));
9738 
9739     invlist_iterinit(invlist);
9740     while (invlist_iternext(invlist, &start, &end)) {
9741 	if (end == UV_MAX) {
9742 	    Perl_sv_catpvf(aTHX_ output, "%04"UVXf"%cINFINITY%c",
9743                                           start, intra_range_delimiter,
9744                                                  inter_range_delimiter);
9745 	}
9746 	else if (end != start) {
9747 	    Perl_sv_catpvf(aTHX_ output, "%04"UVXf"%c%04"UVXf"%c",
9748 		                          start,
9749                                                    intra_range_delimiter,
9750                                                   end, inter_range_delimiter);
9751 	}
9752 	else {
9753 	    Perl_sv_catpvf(aTHX_ output, "%04"UVXf"%c",
9754                                           start, inter_range_delimiter);
9755 	}
9756     }
9757 
9758     if (SvCUR(output) && ! traditional_style) {/* Get rid of trailing blank */
9759         SvCUR_set(output, SvCUR(output) - 1);
9760     }
9761 
9762     return output;
9763 }
9764 
9765 #ifndef PERL_IN_XSUB_RE
9766 void
9767 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
9768                          const char * const indent, SV* const invlist)
9769 {
9770     /* Designed to be called only by do_sv_dump().  Dumps out the ranges of the
9771      * inversion list 'invlist' to 'file' at 'level'  Each line is prefixed by
9772      * the string 'indent'.  The output looks like this:
9773          [0] 0x000A .. 0x000D
9774          [2] 0x0085
9775          [4] 0x2028 .. 0x2029
9776          [6] 0x3104 .. INFINITY
9777      * This means that the first range of code points matched by the list are
9778      * 0xA through 0xD; the second range contains only the single code point
9779      * 0x85, etc.  An inversion list is an array of UVs.  Two array elements
9780      * are used to define each range (except if the final range extends to
9781      * infinity, only a single element is needed).  The array index of the
9782      * first element for the corresponding range is given in brackets. */
9783 
9784     UV start, end;
9785     STRLEN count = 0;
9786 
9787     PERL_ARGS_ASSERT__INVLIST_DUMP;
9788 
9789     if (invlist_is_iterating(invlist)) {
9790         Perl_dump_indent(aTHX_ level, file,
9791              "%sCan't dump inversion list because is in middle of iterating\n",
9792              indent);
9793         return;
9794     }
9795 
9796     invlist_iterinit(invlist);
9797     while (invlist_iternext(invlist, &start, &end)) {
9798 	if (end == UV_MAX) {
9799 	    Perl_dump_indent(aTHX_ level, file,
9800                                        "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
9801                                    indent, (UV)count, start);
9802 	}
9803 	else if (end != start) {
9804 	    Perl_dump_indent(aTHX_ level, file,
9805                                     "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
9806 		                indent, (UV)count, start,         end);
9807 	}
9808 	else {
9809 	    Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
9810                                             indent, (UV)count, start);
9811 	}
9812         count += 2;
9813     }
9814 }
9815 
9816 void
9817 Perl__load_PL_utf8_foldclosures (pTHX)
9818 {
9819     assert(! PL_utf8_foldclosures);
9820 
9821     /* If the folds haven't been read in, call a fold function
9822      * to force that */
9823     if (! PL_utf8_tofold) {
9824         U8 dummy[UTF8_MAXBYTES_CASE+1];
9825 
9826         /* This string is just a short named one above \xff */
9827         to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
9828         assert(PL_utf8_tofold); /* Verify that worked */
9829     }
9830     PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
9831 }
9832 #endif
9833 
9834 #if defined(PERL_ARGS_ASSERT__INVLISTEQ) && !defined(PERL_IN_XSUB_RE)
9835 bool
9836 Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
9837 {
9838     /* Return a boolean as to if the two passed in inversion lists are
9839      * identical.  The final argument, if TRUE, says to take the complement of
9840      * the second inversion list before doing the comparison */
9841 
9842     const UV* array_a = invlist_array(a);
9843     const UV* array_b = invlist_array(b);
9844     UV len_a = _invlist_len(a);
9845     UV len_b = _invlist_len(b);
9846 
9847     UV i = 0;		    /* current index into the arrays */
9848     bool retval = TRUE;     /* Assume are identical until proven otherwise */
9849 
9850     PERL_ARGS_ASSERT__INVLISTEQ;
9851 
9852     /* If are to compare 'a' with the complement of b, set it
9853      * up so are looking at b's complement. */
9854     if (complement_b) {
9855 
9856         /* The complement of nothing is everything, so <a> would have to have
9857          * just one element, starting at zero (ending at infinity) */
9858         if (len_b == 0) {
9859             return (len_a == 1 && array_a[0] == 0);
9860         }
9861         else if (array_b[0] == 0) {
9862 
9863             /* Otherwise, to complement, we invert.  Here, the first element is
9864              * 0, just remove it.  To do this, we just pretend the array starts
9865              * one later */
9866 
9867             array_b++;
9868             len_b--;
9869         }
9870         else {
9871 
9872             /* But if the first element is not zero, we pretend the list starts
9873              * at the 0 that is always stored immediately before the array. */
9874             array_b--;
9875             len_b++;
9876         }
9877     }
9878 
9879     /* Make sure that the lengths are the same, as well as the final element
9880      * before looping through the remainder.  (Thus we test the length, final,
9881      * and first elements right off the bat) */
9882     if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
9883         retval = FALSE;
9884     }
9885     else for (i = 0; i < len_a - 1; i++) {
9886         if (array_a[i] != array_b[i]) {
9887             retval = FALSE;
9888             break;
9889         }
9890     }
9891 
9892     return retval;
9893 }
9894 #endif
9895 
9896 /*
9897  * As best we can, determine the characters that can match the start of
9898  * the given EXACTF-ish node.
9899  *
9900  * Returns the invlist as a new SV*; it is the caller's responsibility to
9901  * call SvREFCNT_dec() when done with it.
9902  */
9903 STATIC SV*
9904 S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
9905 {
9906     const U8 * s = (U8*)STRING(node);
9907     SSize_t bytelen = STR_LEN(node);
9908     UV uc;
9909     /* Start out big enough for 2 separate code points */
9910     SV* invlist = _new_invlist(4);
9911 
9912     PERL_ARGS_ASSERT__MAKE_EXACTF_INVLIST;
9913 
9914     if (! UTF) {
9915         uc = *s;
9916 
9917         /* We punt and assume can match anything if the node begins
9918          * with a multi-character fold.  Things are complicated.  For
9919          * example, /ffi/i could match any of:
9920          *  "\N{LATIN SMALL LIGATURE FFI}"
9921          *  "\N{LATIN SMALL LIGATURE FF}I"
9922          *  "F\N{LATIN SMALL LIGATURE FI}"
9923          *  plus several other things; and making sure we have all the
9924          *  possibilities is hard. */
9925         if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + bytelen)) {
9926             invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
9927         }
9928         else {
9929             /* Any Latin1 range character can potentially match any
9930              * other depending on the locale */
9931             if (OP(node) == EXACTFL) {
9932                 _invlist_union(invlist, PL_Latin1, &invlist);
9933             }
9934             else {
9935                 /* But otherwise, it matches at least itself.  We can
9936                  * quickly tell if it has a distinct fold, and if so,
9937                  * it matches that as well */
9938                 invlist = add_cp_to_invlist(invlist, uc);
9939                 if (IS_IN_SOME_FOLD_L1(uc))
9940                     invlist = add_cp_to_invlist(invlist, PL_fold_latin1[uc]);
9941             }
9942 
9943             /* Some characters match above-Latin1 ones under /i.  This
9944              * is true of EXACTFL ones when the locale is UTF-8 */
9945             if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
9946                 && (! isASCII(uc) || (OP(node) != EXACTFA
9947                                     && OP(node) != EXACTFA_NO_TRIE)))
9948             {
9949                 add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist);
9950             }
9951         }
9952     }
9953     else {  /* Pattern is UTF-8 */
9954         U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
9955         STRLEN foldlen = UTF8SKIP(s);
9956         const U8* e = s + bytelen;
9957         SV** listp;
9958 
9959         uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
9960 
9961         /* The only code points that aren't folded in a UTF EXACTFish
9962          * node are are the problematic ones in EXACTFL nodes */
9963         if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) {
9964             /* We need to check for the possibility that this EXACTFL
9965              * node begins with a multi-char fold.  Therefore we fold
9966              * the first few characters of it so that we can make that
9967              * check */
9968             U8 *d = folded;
9969             int i;
9970 
9971             for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
9972                 if (isASCII(*s)) {
9973                     *(d++) = (U8) toFOLD(*s);
9974                     s++;
9975                 }
9976                 else {
9977                     STRLEN len;
9978                     to_utf8_fold(s, d, &len);
9979                     d += len;
9980                     s += UTF8SKIP(s);
9981                 }
9982             }
9983 
9984             /* And set up so the code below that looks in this folded
9985              * buffer instead of the node's string */
9986             e = d;
9987             foldlen = UTF8SKIP(folded);
9988             s = folded;
9989         }
9990 
9991         /* When we reach here 's' points to the fold of the first
9992          * character(s) of the node; and 'e' points to far enough along
9993          * the folded string to be just past any possible multi-char
9994          * fold. 'foldlen' is the length in bytes of the first
9995          * character in 's'
9996          *
9997          * Unlike the non-UTF-8 case, the macro for determining if a
9998          * string is a multi-char fold requires all the characters to
9999          * already be folded.  This is because of all the complications
10000          * if not.  Note that they are folded anyway, except in EXACTFL
10001          * nodes.  Like the non-UTF case above, we punt if the node
10002          * begins with a multi-char fold  */
10003 
10004         if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
10005             invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
10006         }
10007         else {  /* Single char fold */
10008 
10009             /* It matches all the things that fold to it, which are
10010              * found in PL_utf8_foldclosures (including itself) */
10011             invlist = add_cp_to_invlist(invlist, uc);
10012             if (! PL_utf8_foldclosures)
10013                 _load_PL_utf8_foldclosures();
10014             if ((listp = hv_fetch(PL_utf8_foldclosures,
10015                                 (char *) s, foldlen, FALSE)))
10016             {
10017                 AV* list = (AV*) *listp;
10018                 IV k;
10019                 for (k = 0; k <= av_tindex_nomg(list); k++) {
10020                     SV** c_p = av_fetch(list, k, FALSE);
10021                     UV c;
10022                     assert(c_p);
10023 
10024                     c = SvUV(*c_p);
10025 
10026                     /* /aa doesn't allow folds between ASCII and non- */
10027                     if ((OP(node) == EXACTFA || OP(node) == EXACTFA_NO_TRIE)
10028                         && isASCII(c) != isASCII(uc))
10029                     {
10030                         continue;
10031                     }
10032 
10033                     invlist = add_cp_to_invlist(invlist, c);
10034                 }
10035             }
10036         }
10037     }
10038 
10039     return invlist;
10040 }
10041 
10042 #undef HEADER_LENGTH
10043 #undef TO_INTERNAL_SIZE
10044 #undef FROM_INTERNAL_SIZE
10045 #undef INVLIST_VERSION_ID
10046 
10047 /* End of inversion list object */
10048 
10049 STATIC void
10050 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
10051 {
10052     /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
10053      * constructs, and updates RExC_flags with them.  On input, RExC_parse
10054      * should point to the first flag; it is updated on output to point to the
10055      * final ')' or ':'.  There needs to be at least one flag, or this will
10056      * abort */
10057 
10058     /* for (?g), (?gc), and (?o) warnings; warning
10059        about (?c) will warn about (?g) -- japhy    */
10060 
10061 #define WASTED_O  0x01
10062 #define WASTED_G  0x02
10063 #define WASTED_C  0x04
10064 #define WASTED_GC (WASTED_G|WASTED_C)
10065     I32 wastedflags = 0x00;
10066     U32 posflags = 0, negflags = 0;
10067     U32 *flagsp = &posflags;
10068     char has_charset_modifier = '\0';
10069     regex_charset cs;
10070     bool has_use_defaults = FALSE;
10071     const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
10072     int x_mod_count = 0;
10073 
10074     PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
10075 
10076     /* '^' as an initial flag sets certain defaults */
10077     if (UCHARAT(RExC_parse) == '^') {
10078         RExC_parse++;
10079         has_use_defaults = TRUE;
10080         STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
10081         set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
10082                                         ? REGEX_UNICODE_CHARSET
10083                                         : REGEX_DEPENDS_CHARSET);
10084     }
10085 
10086     cs = get_regex_charset(RExC_flags);
10087     if (cs == REGEX_DEPENDS_CHARSET
10088         && (RExC_utf8 || RExC_uni_semantics))
10089     {
10090         cs = REGEX_UNICODE_CHARSET;
10091     }
10092 
10093     while (RExC_parse < RExC_end) {
10094         /* && strchr("iogcmsx", *RExC_parse) */
10095         /* (?g), (?gc) and (?o) are useless here
10096            and must be globally applied -- japhy */
10097         switch (*RExC_parse) {
10098 
10099             /* Code for the imsxn flags */
10100             CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count);
10101 
10102             case LOCALE_PAT_MOD:
10103                 if (has_charset_modifier) {
10104                     goto excess_modifier;
10105                 }
10106                 else if (flagsp == &negflags) {
10107                     goto neg_modifier;
10108                 }
10109                 cs = REGEX_LOCALE_CHARSET;
10110                 has_charset_modifier = LOCALE_PAT_MOD;
10111                 break;
10112             case UNICODE_PAT_MOD:
10113                 if (has_charset_modifier) {
10114                     goto excess_modifier;
10115                 }
10116                 else if (flagsp == &negflags) {
10117                     goto neg_modifier;
10118                 }
10119                 cs = REGEX_UNICODE_CHARSET;
10120                 has_charset_modifier = UNICODE_PAT_MOD;
10121                 break;
10122             case ASCII_RESTRICT_PAT_MOD:
10123                 if (flagsp == &negflags) {
10124                     goto neg_modifier;
10125                 }
10126                 if (has_charset_modifier) {
10127                     if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
10128                         goto excess_modifier;
10129                     }
10130                     /* Doubled modifier implies more restricted */
10131                     cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
10132                 }
10133                 else {
10134                     cs = REGEX_ASCII_RESTRICTED_CHARSET;
10135                 }
10136                 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
10137                 break;
10138             case DEPENDS_PAT_MOD:
10139                 if (has_use_defaults) {
10140                     goto fail_modifiers;
10141                 }
10142                 else if (flagsp == &negflags) {
10143                     goto neg_modifier;
10144                 }
10145                 else if (has_charset_modifier) {
10146                     goto excess_modifier;
10147                 }
10148 
10149                 /* The dual charset means unicode semantics if the
10150                  * pattern (or target, not known until runtime) are
10151                  * utf8, or something in the pattern indicates unicode
10152                  * semantics */
10153                 cs = (RExC_utf8 || RExC_uni_semantics)
10154                      ? REGEX_UNICODE_CHARSET
10155                      : REGEX_DEPENDS_CHARSET;
10156                 has_charset_modifier = DEPENDS_PAT_MOD;
10157                 break;
10158               excess_modifier:
10159                 RExC_parse++;
10160                 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
10161                     vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
10162                 }
10163                 else if (has_charset_modifier == *(RExC_parse - 1)) {
10164                     vFAIL2("Regexp modifier \"%c\" may not appear twice",
10165                                         *(RExC_parse - 1));
10166                 }
10167                 else {
10168                     vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
10169                 }
10170                 NOT_REACHED; /*NOTREACHED*/
10171               neg_modifier:
10172                 RExC_parse++;
10173                 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
10174                                     *(RExC_parse - 1));
10175                 NOT_REACHED; /*NOTREACHED*/
10176             case ONCE_PAT_MOD: /* 'o' */
10177             case GLOBAL_PAT_MOD: /* 'g' */
10178                 if (PASS2 && ckWARN(WARN_REGEXP)) {
10179                     const I32 wflagbit = *RExC_parse == 'o'
10180                                          ? WASTED_O
10181                                          : WASTED_G;
10182                     if (! (wastedflags & wflagbit) ) {
10183                         wastedflags |= wflagbit;
10184 			/* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
10185                         vWARN5(
10186                             RExC_parse + 1,
10187                             "Useless (%s%c) - %suse /%c modifier",
10188                             flagsp == &negflags ? "?-" : "?",
10189                             *RExC_parse,
10190                             flagsp == &negflags ? "don't " : "",
10191                             *RExC_parse
10192                         );
10193                     }
10194                 }
10195                 break;
10196 
10197             case CONTINUE_PAT_MOD: /* 'c' */
10198                 if (PASS2 && ckWARN(WARN_REGEXP)) {
10199                     if (! (wastedflags & WASTED_C) ) {
10200                         wastedflags |= WASTED_GC;
10201 			/* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
10202                         vWARN3(
10203                             RExC_parse + 1,
10204                             "Useless (%sc) - %suse /gc modifier",
10205                             flagsp == &negflags ? "?-" : "?",
10206                             flagsp == &negflags ? "don't " : ""
10207                         );
10208                     }
10209                 }
10210                 break;
10211             case KEEPCOPY_PAT_MOD: /* 'p' */
10212                 if (flagsp == &negflags) {
10213                     if (PASS2)
10214                         ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
10215                 } else {
10216                     *flagsp |= RXf_PMf_KEEPCOPY;
10217                 }
10218                 break;
10219             case '-':
10220                 /* A flag is a default iff it is following a minus, so
10221                  * if there is a minus, it means will be trying to
10222                  * re-specify a default which is an error */
10223                 if (has_use_defaults || flagsp == &negflags) {
10224                     goto fail_modifiers;
10225                 }
10226                 flagsp = &negflags;
10227                 wastedflags = 0;  /* reset so (?g-c) warns twice */
10228                 break;
10229             case ':':
10230             case ')':
10231                 RExC_flags |= posflags;
10232                 RExC_flags &= ~negflags;
10233                 set_regex_charset(&RExC_flags, cs);
10234                 if (RExC_flags & RXf_PMf_FOLD) {
10235                     RExC_contains_i = 1;
10236                 }
10237                 if (PASS2) {
10238                     STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count);
10239                 }
10240                 return;
10241                 /*NOTREACHED*/
10242             default:
10243               fail_modifiers:
10244                 RExC_parse += SKIP_IF_CHAR(RExC_parse);
10245 		/* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
10246                 vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
10247                       UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
10248                 NOT_REACHED; /*NOTREACHED*/
10249         }
10250 
10251         RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10252     }
10253 
10254     vFAIL("Sequence (?... not terminated");
10255 }
10256 
10257 /*
10258  - reg - regular expression, i.e. main body or parenthesized thing
10259  *
10260  * Caller must absorb opening parenthesis.
10261  *
10262  * Combining parenthesis handling with the base level of regular expression
10263  * is a trifle forced, but the need to tie the tails of the branches to what
10264  * follows makes it hard to avoid.
10265  */
10266 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
10267 #ifdef DEBUGGING
10268 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
10269 #else
10270 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
10271 #endif
10272 
10273 PERL_STATIC_INLINE regnode *
10274 S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state,
10275                              I32 *flagp,
10276                              char * parse_start,
10277                              char ch
10278                       )
10279 {
10280     regnode *ret;
10281     char* name_start = RExC_parse;
10282     U32 num = 0;
10283     SV *sv_dat = reg_scan_name(pRExC_state, SIZE_ONLY
10284                                             ? REG_RSN_RETURN_NULL
10285                                             : REG_RSN_RETURN_DATA);
10286     GET_RE_DEBUG_FLAGS_DECL;
10287 
10288     PERL_ARGS_ASSERT_HANDLE_NAMED_BACKREF;
10289 
10290     if (RExC_parse == name_start || *RExC_parse != ch) {
10291         /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
10292         vFAIL2("Sequence %.3s... not terminated",parse_start);
10293     }
10294 
10295     if (!SIZE_ONLY) {
10296         num = add_data( pRExC_state, STR_WITH_LEN("S"));
10297         RExC_rxi->data->data[num]=(void*)sv_dat;
10298         SvREFCNT_inc_simple_void(sv_dat);
10299     }
10300     RExC_sawback = 1;
10301     ret = reganode(pRExC_state,
10302                    ((! FOLD)
10303                      ? NREF
10304                      : (ASCII_FOLD_RESTRICTED)
10305                        ? NREFFA
10306                        : (AT_LEAST_UNI_SEMANTICS)
10307                          ? NREFFU
10308                          : (LOC)
10309                            ? NREFFL
10310                            : NREFF),
10311                     num);
10312     *flagp |= HASWIDTH;
10313 
10314     Set_Node_Offset(ret, parse_start+1);
10315     Set_Node_Cur_Length(ret, parse_start);
10316 
10317     nextchar(pRExC_state);
10318     return ret;
10319 }
10320 
10321 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
10322    flags. Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan
10323    needs to be restarted, or'd with NEED_UTF8 if the pattern needs to be
10324    upgraded to UTF-8.  Otherwise would only return NULL if regbranch() returns
10325    NULL, which cannot happen.  */
10326 STATIC regnode *
10327 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
10328     /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
10329      * 2 is like 1, but indicates that nextchar() has been called to advance
10330      * RExC_parse beyond the '('.  Things like '(?' are indivisible tokens, and
10331      * this flag alerts us to the need to check for that */
10332 {
10333     regnode *ret;		/* Will be the head of the group. */
10334     regnode *br;
10335     regnode *lastbr;
10336     regnode *ender = NULL;
10337     I32 parno = 0;
10338     I32 flags;
10339     U32 oregflags = RExC_flags;
10340     bool have_branch = 0;
10341     bool is_open = 0;
10342     I32 freeze_paren = 0;
10343     I32 after_freeze = 0;
10344     I32 num; /* numeric backreferences */
10345 
10346     char * parse_start = RExC_parse; /* MJD */
10347     char * const oregcomp_parse = RExC_parse;
10348 
10349     GET_RE_DEBUG_FLAGS_DECL;
10350 
10351     PERL_ARGS_ASSERT_REG;
10352     DEBUG_PARSE("reg ");
10353 
10354     *flagp = 0;				/* Tentatively. */
10355 
10356     /* Having this true makes it feasible to have a lot fewer tests for the
10357      * parse pointer being in scope.  For example, we can write
10358      *      while(isFOO(*RExC_parse)) RExC_parse++;
10359      * instead of
10360      *      while(RExC_parse < RExC_end && isFOO(*RExC_parse)) RExC_parse++;
10361      */
10362     assert(*RExC_end == '\0');
10363 
10364     /* Make an OPEN node, if parenthesized. */
10365     if (paren) {
10366 
10367         /* Under /x, space and comments can be gobbled up between the '(' and
10368          * here (if paren ==2).  The forms '(*VERB' and '(?...' disallow such
10369          * intervening space, as the sequence is a token, and a token should be
10370          * indivisible */
10371         bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
10372 
10373         if (RExC_parse >= RExC_end) {
10374 	    vFAIL("Unmatched (");
10375         }
10376 
10377         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
10378 	    char *start_verb = RExC_parse + 1;
10379 	    STRLEN verb_len;
10380 	    char *start_arg = NULL;
10381 	    unsigned char op = 0;
10382             int arg_required = 0;
10383             int internal_argval = -1; /* if >-1 we are not allowed an argument*/
10384 
10385             if (has_intervening_patws) {
10386                 RExC_parse++;   /* past the '*' */
10387                 vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
10388             }
10389 	    while (RExC_parse < RExC_end && *RExC_parse != ')' ) {
10390 	        if ( *RExC_parse == ':' ) {
10391 	            start_arg = RExC_parse + 1;
10392 	            break;
10393 	        }
10394 	        RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10395 	    }
10396 	    verb_len = RExC_parse - start_verb;
10397 	    if ( start_arg ) {
10398                 if (RExC_parse >= RExC_end) {
10399                     goto unterminated_verb_pattern;
10400                 }
10401 	        RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10402 	        while ( RExC_parse < RExC_end && *RExC_parse != ')' )
10403                     RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10404 	        if ( RExC_parse >= RExC_end || *RExC_parse != ')' )
10405                   unterminated_verb_pattern:
10406 	            vFAIL("Unterminated verb pattern argument");
10407 	        if ( RExC_parse == start_arg )
10408 	            start_arg = NULL;
10409 	    } else {
10410 	        if ( RExC_parse >= RExC_end || *RExC_parse != ')' )
10411 	            vFAIL("Unterminated verb pattern");
10412 	    }
10413 
10414             /* Here, we know that RExC_parse < RExC_end */
10415 
10416 	    switch ( *start_verb ) {
10417             case 'A':  /* (*ACCEPT) */
10418                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
10419 		    op = ACCEPT;
10420 		    internal_argval = RExC_nestroot;
10421 		}
10422 		break;
10423             case 'C':  /* (*COMMIT) */
10424                 if ( memEQs(start_verb,verb_len,"COMMIT") )
10425                     op = COMMIT;
10426                 break;
10427             case 'F':  /* (*FAIL) */
10428                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
10429 		    op = OPFAIL;
10430 		}
10431 		break;
10432             case ':':  /* (*:NAME) */
10433 	    case 'M':  /* (*MARK:NAME) */
10434 	        if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
10435                     op = MARKPOINT;
10436                     arg_required = 1;
10437                 }
10438                 break;
10439             case 'P':  /* (*PRUNE) */
10440                 if ( memEQs(start_verb,verb_len,"PRUNE") )
10441                     op = PRUNE;
10442                 break;
10443             case 'S':   /* (*SKIP) */
10444                 if ( memEQs(start_verb,verb_len,"SKIP") )
10445                     op = SKIP;
10446                 break;
10447             case 'T':  /* (*THEN) */
10448                 /* [19:06] <TimToady> :: is then */
10449                 if ( memEQs(start_verb,verb_len,"THEN") ) {
10450                     op = CUTGROUP;
10451                     RExC_seen |= REG_CUTGROUP_SEEN;
10452                 }
10453                 break;
10454 	    }
10455 	    if ( ! op ) {
10456 	        RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10457                 vFAIL2utf8f(
10458                     "Unknown verb pattern '%"UTF8f"'",
10459                     UTF8fARG(UTF, verb_len, start_verb));
10460 	    }
10461             if ( arg_required && !start_arg ) {
10462                 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
10463                     verb_len, start_verb);
10464             }
10465             if (internal_argval == -1) {
10466                 ret = reganode(pRExC_state, op, 0);
10467             } else {
10468                 ret = reg2Lanode(pRExC_state, op, 0, internal_argval);
10469             }
10470             RExC_seen |= REG_VERBARG_SEEN;
10471             if ( ! SIZE_ONLY ) {
10472                 if (start_arg) {
10473                     SV *sv = newSVpvn( start_arg,
10474                                        RExC_parse - start_arg);
10475                     ARG(ret) = add_data( pRExC_state,
10476                                          STR_WITH_LEN("S"));
10477                     RExC_rxi->data->data[ARG(ret)]=(void*)sv;
10478                     ret->flags = 1;
10479                 } else {
10480                     ret->flags = 0;
10481                 }
10482                 if ( internal_argval != -1 )
10483                     ARG2L_SET(ret, internal_argval);
10484             }
10485 	    nextchar(pRExC_state);
10486 	    return ret;
10487         }
10488         else if (*RExC_parse == '?') { /* (?...) */
10489 	    bool is_logical = 0;
10490 	    const char * const seqstart = RExC_parse;
10491             const char * endptr;
10492             if (has_intervening_patws) {
10493                 RExC_parse++;
10494                 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
10495             }
10496 
10497 	    RExC_parse++;           /* past the '?' */
10498             paren = *RExC_parse;    /* might be a trailing NUL, if not
10499                                        well-formed */
10500             RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10501             if (RExC_parse > RExC_end) {
10502                 paren = '\0';
10503             }
10504 	    ret = NULL;			/* For look-ahead/behind. */
10505 	    switch (paren) {
10506 
10507 	    case 'P':	/* (?P...) variants for those used to PCRE/Python */
10508 	        paren = *RExC_parse;
10509 		if ( paren == '<') {    /* (?P<...>) named capture */
10510                     RExC_parse++;
10511                     if (RExC_parse >= RExC_end) {
10512                         vFAIL("Sequence (?P<... not terminated");
10513                     }
10514 		    goto named_capture;
10515                 }
10516                 else if (paren == '>') {   /* (?P>name) named recursion */
10517                     RExC_parse++;
10518                     if (RExC_parse >= RExC_end) {
10519                         vFAIL("Sequence (?P>... not terminated");
10520                     }
10521                     goto named_recursion;
10522                 }
10523                 else if (paren == '=') {   /* (?P=...)  named backref */
10524                     RExC_parse++;
10525                     return handle_named_backref(pRExC_state, flagp,
10526                                                 parse_start, ')');
10527                 }
10528                 RExC_parse += SKIP_IF_CHAR(RExC_parse);
10529                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
10530 		vFAIL3("Sequence (%.*s...) not recognized",
10531                                 RExC_parse-seqstart, seqstart);
10532 		NOT_REACHED; /*NOTREACHED*/
10533             case '<':           /* (?<...) */
10534 		if (*RExC_parse == '!')
10535 		    paren = ',';
10536 		else if (*RExC_parse != '=')
10537               named_capture:
10538 		{               /* (?<...>) */
10539 		    char *name_start;
10540 		    SV *svname;
10541 		    paren= '>';
10542                 /* FALLTHROUGH */
10543             case '\'':          /* (?'...') */
10544                     name_start = RExC_parse;
10545                     svname = reg_scan_name(pRExC_state,
10546                         SIZE_ONLY    /* reverse test from the others */
10547                         ? REG_RSN_RETURN_NAME
10548                         : REG_RSN_RETURN_NULL);
10549 		    if (   RExC_parse == name_start
10550                         || RExC_parse >= RExC_end
10551                         || *RExC_parse != paren)
10552                     {
10553 		        vFAIL2("Sequence (?%c... not terminated",
10554 		            paren=='>' ? '<' : paren);
10555                     }
10556 		    if (SIZE_ONLY) {
10557 			HE *he_str;
10558 			SV *sv_dat = NULL;
10559                         if (!svname) /* shouldn't happen */
10560                             Perl_croak(aTHX_
10561                                 "panic: reg_scan_name returned NULL");
10562                         if (!RExC_paren_names) {
10563                             RExC_paren_names= newHV();
10564                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
10565 #ifdef DEBUGGING
10566                             RExC_paren_name_list= newAV();
10567                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
10568 #endif
10569                         }
10570                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
10571                         if ( he_str )
10572                             sv_dat = HeVAL(he_str);
10573                         if ( ! sv_dat ) {
10574                             /* croak baby croak */
10575                             Perl_croak(aTHX_
10576                                 "panic: paren_name hash element allocation failed");
10577                         } else if ( SvPOK(sv_dat) ) {
10578                             /* (?|...) can mean we have dupes so scan to check
10579                                its already been stored. Maybe a flag indicating
10580                                we are inside such a construct would be useful,
10581                                but the arrays are likely to be quite small, so
10582                                for now we punt -- dmq */
10583                             IV count = SvIV(sv_dat);
10584                             I32 *pv = (I32*)SvPVX(sv_dat);
10585                             IV i;
10586                             for ( i = 0 ; i < count ; i++ ) {
10587                                 if ( pv[i] == RExC_npar ) {
10588                                     count = 0;
10589                                     break;
10590                                 }
10591                             }
10592                             if ( count ) {
10593                                 pv = (I32*)SvGROW(sv_dat,
10594                                                 SvCUR(sv_dat) + sizeof(I32)+1);
10595                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
10596                                 pv[count] = RExC_npar;
10597                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
10598                             }
10599                         } else {
10600                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
10601                             sv_setpvn(sv_dat, (char *)&(RExC_npar),
10602                                                                 sizeof(I32));
10603                             SvIOK_on(sv_dat);
10604                             SvIV_set(sv_dat, 1);
10605                         }
10606 #ifdef DEBUGGING
10607                         /* Yes this does cause a memory leak in debugging Perls
10608                          * */
10609                         if (!av_store(RExC_paren_name_list,
10610                                       RExC_npar, SvREFCNT_inc(svname)))
10611                             SvREFCNT_dec_NN(svname);
10612 #endif
10613 
10614                         /*sv_dump(sv_dat);*/
10615                     }
10616                     nextchar(pRExC_state);
10617 		    paren = 1;
10618 		    goto capturing_parens;
10619 		}
10620                 RExC_seen |= REG_LOOKBEHIND_SEEN;
10621 		RExC_in_lookbehind++;
10622 		RExC_parse++;
10623                 if (RExC_parse >= RExC_end) {
10624                     vFAIL("Sequence (?... not terminated");
10625                 }
10626 
10627                 /* FALLTHROUGH */
10628 	    case '=':           /* (?=...) */
10629 		RExC_seen_zerolen++;
10630                 break;
10631 	    case '!':           /* (?!...) */
10632 		RExC_seen_zerolen++;
10633 		/* check if we're really just a "FAIL" assertion */
10634                 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
10635                                         FALSE /* Don't force to /x */ );
10636 	        if (*RExC_parse == ')') {
10637                     ret=reganode(pRExC_state, OPFAIL, 0);
10638 	            nextchar(pRExC_state);
10639 	            return ret;
10640 	        }
10641 	        break;
10642 	    case '|':           /* (?|...) */
10643 	        /* branch reset, behave like a (?:...) except that
10644 	           buffers in alternations share the same numbers */
10645 	        paren = ':';
10646 	        after_freeze = freeze_paren = RExC_npar;
10647 	        break;
10648 	    case ':':           /* (?:...) */
10649 	    case '>':           /* (?>...) */
10650 		break;
10651 	    case '$':           /* (?$...) */
10652 	    case '@':           /* (?@...) */
10653 		vFAIL2("Sequence (?%c...) not implemented", (int)paren);
10654 		break;
10655 	    case '0' :           /* (?0) */
10656 	    case 'R' :           /* (?R) */
10657                 if (RExC_parse == RExC_end || *RExC_parse != ')')
10658 		    FAIL("Sequence (?R) not terminated");
10659                 num = 0;
10660                 RExC_seen |= REG_RECURSE_SEEN;
10661 		*flagp |= POSTPONED;
10662                 goto gen_recurse_regop;
10663 		/*notreached*/
10664             /* named and numeric backreferences */
10665             case '&':            /* (?&NAME) */
10666                 parse_start = RExC_parse - 1;
10667               named_recursion:
10668                 {
10669     		    SV *sv_dat = reg_scan_name(pRExC_state,
10670     		        SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10671     		     num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
10672                 }
10673                 if (RExC_parse >= RExC_end || *RExC_parse != ')')
10674                     vFAIL("Sequence (?&... not terminated");
10675                 goto gen_recurse_regop;
10676                 /* NOTREACHED */
10677             case '+':
10678                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
10679                     RExC_parse++;
10680                     vFAIL("Illegal pattern");
10681                 }
10682                 goto parse_recursion;
10683                 /* NOTREACHED*/
10684             case '-': /* (?-1) */
10685                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
10686                     RExC_parse--; /* rewind to let it be handled later */
10687                     goto parse_flags;
10688                 }
10689                 /* FALLTHROUGH */
10690             case '1': case '2': case '3': case '4': /* (?1) */
10691 	    case '5': case '6': case '7': case '8': case '9':
10692 	        RExC_parse = (char *) seqstart + 1;  /* Point to the digit */
10693               parse_recursion:
10694                 {
10695                     bool is_neg = FALSE;
10696                     UV unum;
10697                     parse_start = RExC_parse - 1; /* MJD */
10698                     if (*RExC_parse == '-') {
10699                         RExC_parse++;
10700                         is_neg = TRUE;
10701                     }
10702                     if (grok_atoUV(RExC_parse, &unum, &endptr)
10703                         && unum <= I32_MAX
10704                     ) {
10705                         num = (I32)unum;
10706                         RExC_parse = (char*)endptr;
10707                     } else
10708                         num = I32_MAX;
10709                     if (is_neg) {
10710                         /* Some limit for num? */
10711                         num = -num;
10712                     }
10713                 }
10714 	        if (*RExC_parse!=')')
10715 	            vFAIL("Expecting close bracket");
10716 
10717               gen_recurse_regop:
10718                 if ( paren == '-' ) {
10719                     /*
10720                     Diagram of capture buffer numbering.
10721                     Top line is the normal capture buffer numbers
10722                     Bottom line is the negative indexing as from
10723                     the X (the (?-2))
10724 
10725                     +   1 2    3 4 5 X          6 7
10726                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
10727                     -   5 4    3 2 1 X          x x
10728 
10729                     */
10730                     num = RExC_npar + num;
10731                     if (num < 1)  {
10732                         RExC_parse++;
10733                         vFAIL("Reference to nonexistent group");
10734                     }
10735                 } else if ( paren == '+' ) {
10736                     num = RExC_npar + num - 1;
10737                 }
10738                 /* We keep track how many GOSUB items we have produced.
10739                    To start off the ARG2L() of the GOSUB holds its "id",
10740                    which is used later in conjunction with RExC_recurse
10741                    to calculate the offset we need to jump for the GOSUB,
10742                    which it will store in the final representation.
10743                    We have to defer the actual calculation until much later
10744                    as the regop may move.
10745                  */
10746 
10747                 ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count);
10748                 if (!SIZE_ONLY) {
10749 		    if (num > (I32)RExC_rx->nparens) {
10750 			RExC_parse++;
10751 			vFAIL("Reference to nonexistent group");
10752 	            }
10753 	            RExC_recurse_count++;
10754                     DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
10755                         "%*s%*s Recurse #%"UVuf" to %"IVdf"\n",
10756                               22, "|    |", (int)(depth * 2 + 1), "",
10757                               (UV)ARG(ret), (IV)ARG2L(ret)));
10758                 }
10759                 RExC_seen |= REG_RECURSE_SEEN;
10760 
10761                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
10762 		Set_Node_Offset(ret, parse_start); /* MJD */
10763 
10764                 *flagp |= POSTPONED;
10765                 assert(*RExC_parse == ')');
10766                 nextchar(pRExC_state);
10767                 return ret;
10768 
10769             /* NOTREACHED */
10770 
10771 	    case '?':           /* (??...) */
10772 		is_logical = 1;
10773 		if (*RExC_parse != '{') {
10774                     RExC_parse += SKIP_IF_CHAR(RExC_parse);
10775                     /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
10776                     vFAIL2utf8f(
10777                         "Sequence (%"UTF8f"...) not recognized",
10778                         UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
10779 		    NOT_REACHED; /*NOTREACHED*/
10780 		}
10781 		*flagp |= POSTPONED;
10782 		paren = '{';
10783                 RExC_parse++;
10784 		/* FALLTHROUGH */
10785 	    case '{':           /* (?{...}) */
10786 	    {
10787 		U32 n = 0;
10788 		struct reg_code_block *cb;
10789 
10790 		RExC_seen_zerolen++;
10791 
10792 		if (   !pRExC_state->num_code_blocks
10793 		    || pRExC_state->code_index >= pRExC_state->num_code_blocks
10794 		    || pRExC_state->code_blocks[pRExC_state->code_index].start
10795 			!= (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
10796 			    - RExC_start)
10797 		) {
10798 		    if (RExC_pm_flags & PMf_USE_RE_EVAL)
10799 			FAIL("panic: Sequence (?{...}): no code block found\n");
10800 		    FAIL("Eval-group not allowed at runtime, use re 'eval'");
10801 		}
10802 		/* this is a pre-compiled code block (?{...}) */
10803 		cb = &pRExC_state->code_blocks[pRExC_state->code_index];
10804 		RExC_parse = RExC_start + cb->end;
10805 		if (!SIZE_ONLY) {
10806 		    OP *o = cb->block;
10807 		    if (cb->src_regex) {
10808 			n = add_data(pRExC_state, STR_WITH_LEN("rl"));
10809 			RExC_rxi->data->data[n] =
10810 			    (void*)SvREFCNT_inc((SV*)cb->src_regex);
10811 			RExC_rxi->data->data[n+1] = (void*)o;
10812 		    }
10813 		    else {
10814 			n = add_data(pRExC_state,
10815 			       (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
10816 			RExC_rxi->data->data[n] = (void*)o;
10817 		    }
10818 		}
10819 		pRExC_state->code_index++;
10820 		nextchar(pRExC_state);
10821 
10822 		if (is_logical) {
10823                     regnode *eval;
10824 		    ret = reg_node(pRExC_state, LOGICAL);
10825 
10826                     eval = reg2Lanode(pRExC_state, EVAL,
10827                                        n,
10828 
10829                                        /* for later propagation into (??{})
10830                                         * return value */
10831                                        RExC_flags & RXf_PMf_COMPILETIME
10832                                       );
10833 		    if (!SIZE_ONLY) {
10834 			ret->flags = 2;
10835                     }
10836                     REGTAIL(pRExC_state, ret, eval);
10837                     /* deal with the length of this later - MJD */
10838 		    return ret;
10839 		}
10840 		ret = reg2Lanode(pRExC_state, EVAL, n, 0);
10841 		Set_Node_Length(ret, RExC_parse - parse_start + 1);
10842 		Set_Node_Offset(ret, parse_start);
10843 		return ret;
10844 	    }
10845 	    case '(':           /* (?(?{...})...) and (?(?=...)...) */
10846 	    {
10847 	        int is_define= 0;
10848                 const int DEFINE_len = sizeof("DEFINE") - 1;
10849 		if (RExC_parse[0] == '?') {        /* (?(?...)) */
10850                     if (   RExC_parse < RExC_end - 1
10851                         && (   RExC_parse[1] == '='
10852                             || RExC_parse[1] == '!'
10853                             || RExC_parse[1] == '<'
10854                             || RExC_parse[1] == '{')
10855                     ) { /* Lookahead or eval. */
10856 			I32 flag;
10857                         regnode *tail;
10858 
10859 			ret = reg_node(pRExC_state, LOGICAL);
10860 			if (!SIZE_ONLY)
10861 			    ret->flags = 1;
10862 
10863                         tail = reg(pRExC_state, 1, &flag, depth+1);
10864                         if (flag & (RESTART_PASS1|NEED_UTF8)) {
10865                             *flagp = flag & (RESTART_PASS1|NEED_UTF8);
10866                             return NULL;
10867                         }
10868                         REGTAIL(pRExC_state, ret, tail);
10869 			goto insert_if;
10870 		    }
10871 		    /* Fall through to ‘Unknown switch condition’ at the
10872 		       end of the if/else chain. */
10873 		}
10874 		else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
10875 		         || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
10876 	        {
10877 	            char ch = RExC_parse[0] == '<' ? '>' : '\'';
10878 	            char *name_start= RExC_parse++;
10879 	            U32 num = 0;
10880 	            SV *sv_dat=reg_scan_name(pRExC_state,
10881 	                SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10882 	            if (   RExC_parse == name_start
10883                         || RExC_parse >= RExC_end
10884                         || *RExC_parse != ch)
10885                     {
10886                         vFAIL2("Sequence (?(%c... not terminated",
10887                             (ch == '>' ? '<' : ch));
10888                     }
10889                     RExC_parse++;
10890 	            if (!SIZE_ONLY) {
10891                         num = add_data( pRExC_state, STR_WITH_LEN("S"));
10892                         RExC_rxi->data->data[num]=(void*)sv_dat;
10893                         SvREFCNT_inc_simple_void(sv_dat);
10894                     }
10895                     ret = reganode(pRExC_state,NGROUPP,num);
10896                     goto insert_if_check_paren;
10897 		}
10898 		else if (RExC_end - RExC_parse >= DEFINE_len
10899                         && strnEQ(RExC_parse, "DEFINE", DEFINE_len))
10900                 {
10901 		    ret = reganode(pRExC_state,DEFINEP,0);
10902 		    RExC_parse += DEFINE_len;
10903 		    is_define = 1;
10904 		    goto insert_if_check_paren;
10905 		}
10906 		else if (RExC_parse[0] == 'R') {
10907 		    RExC_parse++;
10908                     /* parno == 0 => /(?(R)YES|NO)/  "in any form of recursion OR eval"
10909                      * parno == 1 => /(?(R0)YES|NO)/ "in GOSUB (?0) / (?R)"
10910                      * parno == 2 => /(?(R1)YES|NO)/ "in GOSUB (?1) (parno-1)"
10911                      */
10912 		    parno = 0;
10913                     if (RExC_parse[0] == '0') {
10914                         parno = 1;
10915                         RExC_parse++;
10916                     }
10917                     else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10918                         UV uv;
10919                         if (grok_atoUV(RExC_parse, &uv, &endptr)
10920                             && uv <= I32_MAX
10921                         ) {
10922                             parno = (I32)uv + 1;
10923                             RExC_parse = (char*)endptr;
10924                         }
10925                         /* else "Switch condition not recognized" below */
10926 		    } else if (RExC_parse[0] == '&') {
10927 		        SV *sv_dat;
10928 		        RExC_parse++;
10929 		        sv_dat = reg_scan_name(pRExC_state,
10930                             SIZE_ONLY
10931                             ? REG_RSN_RETURN_NULL
10932                             : REG_RSN_RETURN_DATA);
10933 
10934                         /* we should only have a false sv_dat when
10935                          * SIZE_ONLY is true, and we always have false
10936                          * sv_dat when SIZE_ONLY is true.
10937                          * reg_scan_name() will VFAIL() if the name is
10938                          * unknown when SIZE_ONLY is false, and otherwise
10939                          * will return something, and when SIZE_ONLY is
10940                          * true, reg_scan_name() just parses the string,
10941                          * and doesnt return anything. (in theory) */
10942                         assert(SIZE_ONLY ? !sv_dat : !!sv_dat);
10943 
10944                         if (sv_dat)
10945                             parno = 1 + *((I32 *)SvPVX(sv_dat));
10946 		    }
10947 		    ret = reganode(pRExC_state,INSUBP,parno);
10948 		    goto insert_if_check_paren;
10949 		}
10950 		else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10951                     /* (?(1)...) */
10952 		    char c;
10953                     UV uv;
10954                     if (grok_atoUV(RExC_parse, &uv, &endptr)
10955                         && uv <= I32_MAX
10956                     ) {
10957                         parno = (I32)uv;
10958                         RExC_parse = (char*)endptr;
10959                     }
10960                     else {
10961                         vFAIL("panic: grok_atoUV returned FALSE");
10962                     }
10963                     ret = reganode(pRExC_state, GROUPP, parno);
10964 
10965                  insert_if_check_paren:
10966 		    if (UCHARAT(RExC_parse) != ')') {
10967                         RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10968 			vFAIL("Switch condition not recognized");
10969 		    }
10970 		    nextchar(pRExC_state);
10971 		  insert_if:
10972                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
10973                     br = regbranch(pRExC_state, &flags, 1,depth+1);
10974 		    if (br == NULL) {
10975                         if (flags & (RESTART_PASS1|NEED_UTF8)) {
10976                             *flagp = flags & (RESTART_PASS1|NEED_UTF8);
10977                             return NULL;
10978                         }
10979                         FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10980                               (UV) flags);
10981                     } else
10982                         REGTAIL(pRExC_state, br, reganode(pRExC_state,
10983                                                           LONGJMP, 0));
10984 		    c = UCHARAT(RExC_parse);
10985                     nextchar(pRExC_state);
10986 		    if (flags&HASWIDTH)
10987 			*flagp |= HASWIDTH;
10988 		    if (c == '|') {
10989 		        if (is_define)
10990 		            vFAIL("(?(DEFINE)....) does not allow branches");
10991 
10992                         /* Fake one for optimizer.  */
10993                         lastbr = reganode(pRExC_state, IFTHEN, 0);
10994 
10995                         if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
10996                             if (flags & (RESTART_PASS1|NEED_UTF8)) {
10997                                 *flagp = flags & (RESTART_PASS1|NEED_UTF8);
10998                                 return NULL;
10999                             }
11000                             FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
11001                                   (UV) flags);
11002                         }
11003                         REGTAIL(pRExC_state, ret, lastbr);
11004 		 	if (flags&HASWIDTH)
11005 			    *flagp |= HASWIDTH;
11006                         c = UCHARAT(RExC_parse);
11007                         nextchar(pRExC_state);
11008 		    }
11009 		    else
11010 			lastbr = NULL;
11011                     if (c != ')') {
11012                         if (RExC_parse >= RExC_end)
11013                             vFAIL("Switch (?(condition)... not terminated");
11014                         else
11015                             vFAIL("Switch (?(condition)... contains too many branches");
11016                     }
11017 		    ender = reg_node(pRExC_state, TAIL);
11018                     REGTAIL(pRExC_state, br, ender);
11019 		    if (lastbr) {
11020                         REGTAIL(pRExC_state, lastbr, ender);
11021                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
11022 		    }
11023 		    else
11024                         REGTAIL(pRExC_state, ret, ender);
11025                     RExC_size++; /* XXX WHY do we need this?!!
11026                                     For large programs it seems to be required
11027                                     but I can't figure out why. -- dmq*/
11028 		    return ret;
11029 		}
11030                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11031                 vFAIL("Unknown switch condition (?(...))");
11032 	    }
11033 	    case '[':           /* (?[ ... ]) */
11034                 return handle_regex_sets(pRExC_state, NULL, flagp, depth,
11035                                          oregcomp_parse);
11036             case 0: /* A NUL */
11037 		RExC_parse--; /* for vFAIL to print correctly */
11038                 vFAIL("Sequence (? incomplete");
11039                 break;
11040 	    default: /* e.g., (?i) */
11041 	        RExC_parse = (char *) seqstart + 1;
11042               parse_flags:
11043 		parse_lparen_question_flags(pRExC_state);
11044                 if (UCHARAT(RExC_parse) != ':') {
11045                     if (RExC_parse < RExC_end)
11046                         nextchar(pRExC_state);
11047                     *flagp = TRYAGAIN;
11048                     return NULL;
11049                 }
11050                 paren = ':';
11051                 nextchar(pRExC_state);
11052                 ret = NULL;
11053                 goto parse_rest;
11054             } /* end switch */
11055 	}
11056 	else if (!(RExC_flags & RXf_PMf_NOCAPTURE)) {   /* (...) */
11057 	  capturing_parens:
11058 	    parno = RExC_npar;
11059 	    RExC_npar++;
11060 
11061 	    ret = reganode(pRExC_state, OPEN, parno);
11062 	    if (!SIZE_ONLY ){
11063 	        if (!RExC_nestroot)
11064 	            RExC_nestroot = parno;
11065                 if (RExC_open_parens && !RExC_open_parens[parno])
11066 	        {
11067                     DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
11068                         "%*s%*s Setting open paren #%"IVdf" to %d\n",
11069                         22, "|    |", (int)(depth * 2 + 1), "",
11070 			(IV)parno, REG_NODE_NUM(ret)));
11071                     RExC_open_parens[parno]= ret;
11072 	        }
11073 	    }
11074             Set_Node_Length(ret, 1); /* MJD */
11075             Set_Node_Offset(ret, RExC_parse); /* MJD */
11076 	    is_open = 1;
11077 	} else {
11078             /* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */
11079             paren = ':';
11080 	    ret = NULL;
11081 	}
11082     }
11083     else                        /* ! paren */
11084 	ret = NULL;
11085 
11086    parse_rest:
11087     /* Pick up the branches, linking them together. */
11088     parse_start = RExC_parse;   /* MJD */
11089     br = regbranch(pRExC_state, &flags, 1,depth+1);
11090 
11091     /*     branch_len = (paren != 0); */
11092 
11093     if (br == NULL) {
11094         if (flags & (RESTART_PASS1|NEED_UTF8)) {
11095             *flagp = flags & (RESTART_PASS1|NEED_UTF8);
11096             return NULL;
11097         }
11098         FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
11099     }
11100     if (*RExC_parse == '|') {
11101 	if (!SIZE_ONLY && RExC_extralen) {
11102 	    reginsert(pRExC_state, BRANCHJ, br, depth+1);
11103 	}
11104 	else {                  /* MJD */
11105 	    reginsert(pRExC_state, BRANCH, br, depth+1);
11106             Set_Node_Length(br, paren != 0);
11107             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
11108         }
11109 	have_branch = 1;
11110 	if (SIZE_ONLY)
11111 	    RExC_extralen += 1;		/* For BRANCHJ-BRANCH. */
11112     }
11113     else if (paren == ':') {
11114 	*flagp |= flags&SIMPLE;
11115     }
11116     if (is_open) {				/* Starts with OPEN. */
11117         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
11118     }
11119     else if (paren != '?')		/* Not Conditional */
11120 	ret = br;
11121     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
11122     lastbr = br;
11123     while (*RExC_parse == '|') {
11124 	if (!SIZE_ONLY && RExC_extralen) {
11125 	    ender = reganode(pRExC_state, LONGJMP,0);
11126 
11127             /* Append to the previous. */
11128             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
11129 	}
11130 	if (SIZE_ONLY)
11131 	    RExC_extralen += 2;		/* Account for LONGJMP. */
11132 	nextchar(pRExC_state);
11133 	if (freeze_paren) {
11134 	    if (RExC_npar > after_freeze)
11135 	        after_freeze = RExC_npar;
11136             RExC_npar = freeze_paren;
11137         }
11138         br = regbranch(pRExC_state, &flags, 0, depth+1);
11139 
11140 	if (br == NULL) {
11141             if (flags & (RESTART_PASS1|NEED_UTF8)) {
11142                 *flagp = flags & (RESTART_PASS1|NEED_UTF8);
11143                 return NULL;
11144             }
11145             FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
11146         }
11147         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
11148 	lastbr = br;
11149 	*flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
11150     }
11151 
11152     if (have_branch || paren != ':') {
11153 	/* Make a closing node, and hook it on the end. */
11154 	switch (paren) {
11155 	case ':':
11156 	    ender = reg_node(pRExC_state, TAIL);
11157 	    break;
11158 	case 1: case 2:
11159 	    ender = reganode(pRExC_state, CLOSE, parno);
11160             if ( RExC_close_parens ) {
11161                 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
11162                         "%*s%*s Setting close paren #%"IVdf" to %d\n",
11163                         22, "|    |", (int)(depth * 2 + 1), "", (IV)parno, REG_NODE_NUM(ender)));
11164                 RExC_close_parens[parno]= ender;
11165 	        if (RExC_nestroot == parno)
11166 	            RExC_nestroot = 0;
11167 	    }
11168             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
11169             Set_Node_Length(ender,1); /* MJD */
11170 	    break;
11171 	case '<':
11172 	case ',':
11173 	case '=':
11174 	case '!':
11175 	    *flagp &= ~HASWIDTH;
11176 	    /* FALLTHROUGH */
11177 	case '>':
11178 	    ender = reg_node(pRExC_state, SUCCEED);
11179 	    break;
11180 	case 0:
11181 	    ender = reg_node(pRExC_state, END);
11182 	    if (!SIZE_ONLY) {
11183                 assert(!RExC_end_op); /* there can only be one! */
11184                 RExC_end_op = ender;
11185                 if (RExC_close_parens) {
11186                     DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
11187                         "%*s%*s Setting close paren #0 (END) to %d\n",
11188                         22, "|    |", (int)(depth * 2 + 1), "", REG_NODE_NUM(ender)));
11189 
11190                     RExC_close_parens[0]= ender;
11191                 }
11192             }
11193 	    break;
11194 	}
11195         DEBUG_PARSE_r(if (!SIZE_ONLY) {
11196             DEBUG_PARSE_MSG("lsbr");
11197             regprop(RExC_rx, RExC_mysv1, lastbr, NULL, pRExC_state);
11198             regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state);
11199             Perl_re_printf( aTHX_  "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
11200                           SvPV_nolen_const(RExC_mysv1),
11201                           (IV)REG_NODE_NUM(lastbr),
11202                           SvPV_nolen_const(RExC_mysv2),
11203                           (IV)REG_NODE_NUM(ender),
11204                           (IV)(ender - lastbr)
11205             );
11206         });
11207         REGTAIL(pRExC_state, lastbr, ender);
11208 
11209 	if (have_branch && !SIZE_ONLY) {
11210             char is_nothing= 1;
11211 	    if (depth==1)
11212                 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
11213 
11214 	    /* Hook the tails of the branches to the closing node. */
11215 	    for (br = ret; br; br = regnext(br)) {
11216 		const U8 op = PL_regkind[OP(br)];
11217 		if (op == BRANCH) {
11218                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
11219                     if ( OP(NEXTOPER(br)) != NOTHING
11220                          || regnext(NEXTOPER(br)) != ender)
11221                         is_nothing= 0;
11222 		}
11223 		else if (op == BRANCHJ) {
11224                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
11225                     /* for now we always disable this optimisation * /
11226                     if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
11227                          || regnext(NEXTOPER(NEXTOPER(br))) != ender)
11228                     */
11229                         is_nothing= 0;
11230 		}
11231 	    }
11232             if (is_nothing) {
11233                 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
11234                 DEBUG_PARSE_r(if (!SIZE_ONLY) {
11235                     DEBUG_PARSE_MSG("NADA");
11236                     regprop(RExC_rx, RExC_mysv1, ret, NULL, pRExC_state);
11237                     regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state);
11238                     Perl_re_printf( aTHX_  "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
11239                                   SvPV_nolen_const(RExC_mysv1),
11240                                   (IV)REG_NODE_NUM(ret),
11241                                   SvPV_nolen_const(RExC_mysv2),
11242                                   (IV)REG_NODE_NUM(ender),
11243                                   (IV)(ender - ret)
11244                     );
11245                 });
11246                 OP(br)= NOTHING;
11247                 if (OP(ender) == TAIL) {
11248                     NEXT_OFF(br)= 0;
11249                     RExC_emit= br + 1;
11250                 } else {
11251                     regnode *opt;
11252                     for ( opt= br + 1; opt < ender ; opt++ )
11253                         OP(opt)= OPTIMIZED;
11254                     NEXT_OFF(br)= ender - br;
11255                 }
11256             }
11257 	}
11258     }
11259 
11260     {
11261         const char *p;
11262         static const char parens[] = "=!<,>";
11263 
11264 	if (paren && (p = strchr(parens, paren))) {
11265 	    U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
11266 	    int flag = (p - parens) > 1;
11267 
11268 	    if (paren == '>')
11269 		node = SUSPEND, flag = 0;
11270 	    reginsert(pRExC_state, node,ret, depth+1);
11271             Set_Node_Cur_Length(ret, parse_start);
11272 	    Set_Node_Offset(ret, parse_start + 1);
11273 	    ret->flags = flag;
11274             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
11275 	}
11276     }
11277 
11278     /* Check for proper termination. */
11279     if (paren) {
11280         /* restore original flags, but keep (?p) and, if we've changed from /d
11281          * rules to /u, keep the /u */
11282 	RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
11283         if (DEPENDS_SEMANTICS && RExC_uni_semantics) {
11284             set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
11285         }
11286 	if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') {
11287 	    RExC_parse = oregcomp_parse;
11288 	    vFAIL("Unmatched (");
11289 	}
11290 	nextchar(pRExC_state);
11291     }
11292     else if (!paren && RExC_parse < RExC_end) {
11293 	if (*RExC_parse == ')') {
11294 	    RExC_parse++;
11295 	    vFAIL("Unmatched )");
11296 	}
11297 	else
11298 	    FAIL("Junk on end of regexp");	/* "Can't happen". */
11299 	NOT_REACHED; /* NOTREACHED */
11300     }
11301 
11302     if (RExC_in_lookbehind) {
11303 	RExC_in_lookbehind--;
11304     }
11305     if (after_freeze > RExC_npar)
11306         RExC_npar = after_freeze;
11307     return(ret);
11308 }
11309 
11310 /*
11311  - regbranch - one alternative of an | operator
11312  *
11313  * Implements the concatenation operator.
11314  *
11315  * Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs to be
11316  * restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
11317  */
11318 STATIC regnode *
11319 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
11320 {
11321     regnode *ret;
11322     regnode *chain = NULL;
11323     regnode *latest;
11324     I32 flags = 0, c = 0;
11325     GET_RE_DEBUG_FLAGS_DECL;
11326 
11327     PERL_ARGS_ASSERT_REGBRANCH;
11328 
11329     DEBUG_PARSE("brnc");
11330 
11331     if (first)
11332 	ret = NULL;
11333     else {
11334 	if (!SIZE_ONLY && RExC_extralen)
11335 	    ret = reganode(pRExC_state, BRANCHJ,0);
11336 	else {
11337 	    ret = reg_node(pRExC_state, BRANCH);
11338             Set_Node_Length(ret, 1);
11339         }
11340     }
11341 
11342     if (!first && SIZE_ONLY)
11343 	RExC_extralen += 1;			/* BRANCHJ */
11344 
11345     *flagp = WORST;			/* Tentatively. */
11346 
11347     skip_to_be_ignored_text(pRExC_state, &RExC_parse,
11348                             FALSE /* Don't force to /x */ );
11349     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
11350 	flags &= ~TRYAGAIN;
11351         latest = regpiece(pRExC_state, &flags,depth+1);
11352 	if (latest == NULL) {
11353 	    if (flags & TRYAGAIN)
11354 		continue;
11355             if (flags & (RESTART_PASS1|NEED_UTF8)) {
11356                 *flagp = flags & (RESTART_PASS1|NEED_UTF8);
11357                 return NULL;
11358             }
11359             FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
11360 	}
11361 	else if (ret == NULL)
11362 	    ret = latest;
11363 	*flagp |= flags&(HASWIDTH|POSTPONED);
11364 	if (chain == NULL) 	/* First piece. */
11365 	    *flagp |= flags&SPSTART;
11366 	else {
11367 	    /* FIXME adding one for every branch after the first is probably
11368 	     * excessive now we have TRIE support. (hv) */
11369 	    MARK_NAUGHTY(1);
11370             REGTAIL(pRExC_state, chain, latest);
11371 	}
11372 	chain = latest;
11373 	c++;
11374     }
11375     if (chain == NULL) {	/* Loop ran zero times. */
11376 	chain = reg_node(pRExC_state, NOTHING);
11377 	if (ret == NULL)
11378 	    ret = chain;
11379     }
11380     if (c == 1) {
11381 	*flagp |= flags&SIMPLE;
11382     }
11383 
11384     return ret;
11385 }
11386 
11387 /*
11388  - regpiece - something followed by possible [*+?]
11389  *
11390  * Note that the branching code sequences used for ? and the general cases
11391  * of * and + are somewhat optimized:  they use the same NOTHING node as
11392  * both the endmarker for their branch list and the body of the last branch.
11393  * It might seem that this node could be dispensed with entirely, but the
11394  * endmarker role is not redundant.
11395  *
11396  * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
11397  * TRYAGAIN.
11398  * Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs to be
11399  * restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
11400  */
11401 STATIC regnode *
11402 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11403 {
11404     regnode *ret;
11405     char op;
11406     char *next;
11407     I32 flags;
11408     const char * const origparse = RExC_parse;
11409     I32 min;
11410     I32 max = REG_INFTY;
11411 #ifdef RE_TRACK_PATTERN_OFFSETS
11412     char *parse_start;
11413 #endif
11414     const char *maxpos = NULL;
11415     UV uv;
11416 
11417     /* Save the original in case we change the emitted regop to a FAIL. */
11418     regnode * const orig_emit = RExC_emit;
11419 
11420     GET_RE_DEBUG_FLAGS_DECL;
11421 
11422     PERL_ARGS_ASSERT_REGPIECE;
11423 
11424     DEBUG_PARSE("piec");
11425 
11426     ret = regatom(pRExC_state, &flags,depth+1);
11427     if (ret == NULL) {
11428 	if (flags & (TRYAGAIN|RESTART_PASS1|NEED_UTF8))
11429 	    *flagp |= flags & (TRYAGAIN|RESTART_PASS1|NEED_UTF8);
11430         else
11431             FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
11432 	return(NULL);
11433     }
11434 
11435     op = *RExC_parse;
11436 
11437     if (op == '{' && regcurly(RExC_parse)) {
11438 	maxpos = NULL;
11439 #ifdef RE_TRACK_PATTERN_OFFSETS
11440         parse_start = RExC_parse; /* MJD */
11441 #endif
11442 	next = RExC_parse + 1;
11443 	while (isDIGIT(*next) || *next == ',') {
11444 	    if (*next == ',') {
11445 		if (maxpos)
11446 		    break;
11447 		else
11448 		    maxpos = next;
11449 	    }
11450 	    next++;
11451 	}
11452 	if (*next == '}') {		/* got one */
11453             const char* endptr;
11454 	    if (!maxpos)
11455 		maxpos = next;
11456 	    RExC_parse++;
11457             if (isDIGIT(*RExC_parse)) {
11458                 if (!grok_atoUV(RExC_parse, &uv, &endptr))
11459                     vFAIL("Invalid quantifier in {,}");
11460                 if (uv >= REG_INFTY)
11461                     vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
11462                 min = (I32)uv;
11463             } else {
11464                 min = 0;
11465             }
11466 	    if (*maxpos == ',')
11467 		maxpos++;
11468 	    else
11469 		maxpos = RExC_parse;
11470             if (isDIGIT(*maxpos)) {
11471                 if (!grok_atoUV(maxpos, &uv, &endptr))
11472                     vFAIL("Invalid quantifier in {,}");
11473                 if (uv >= REG_INFTY)
11474                     vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
11475                 max = (I32)uv;
11476             } else {
11477 		max = REG_INFTY;		/* meaning "infinity" */
11478             }
11479 	    RExC_parse = next;
11480 	    nextchar(pRExC_state);
11481             if (max < min) {    /* If can't match, warn and optimize to fail
11482                                    unconditionally */
11483                 if (SIZE_ONLY) {
11484 
11485                     /* We can't back off the size because we have to reserve
11486                      * enough space for all the things we are about to throw
11487                      * away, but we can shrink it by the amount we are about
11488                      * to re-use here */
11489                     RExC_size += PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
11490                 }
11491                 else {
11492                     ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
11493                     RExC_emit = orig_emit;
11494                 }
11495                 ret = reganode(pRExC_state, OPFAIL, 0);
11496                 return ret;
11497             }
11498             else if (min == max && *RExC_parse == '?')
11499             {
11500                 if (PASS2) {
11501                     ckWARN2reg(RExC_parse + 1,
11502                                "Useless use of greediness modifier '%c'",
11503                                *RExC_parse);
11504                 }
11505             }
11506 
11507 	  do_curly:
11508 	    if ((flags&SIMPLE)) {
11509                 if (min == 0 && max == REG_INFTY) {
11510                     reginsert(pRExC_state, STAR, ret, depth+1);
11511                     ret->flags = 0;
11512                     MARK_NAUGHTY(4);
11513                     RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
11514                     goto nest_check;
11515                 }
11516                 if (min == 1 && max == REG_INFTY) {
11517                     reginsert(pRExC_state, PLUS, ret, depth+1);
11518                     ret->flags = 0;
11519                     MARK_NAUGHTY(3);
11520                     RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
11521                     goto nest_check;
11522                 }
11523                 MARK_NAUGHTY_EXP(2, 2);
11524 		reginsert(pRExC_state, CURLY, ret, depth+1);
11525                 Set_Node_Offset(ret, parse_start+1); /* MJD */
11526                 Set_Node_Cur_Length(ret, parse_start);
11527 	    }
11528 	    else {
11529 		regnode * const w = reg_node(pRExC_state, WHILEM);
11530 
11531 		w->flags = 0;
11532                 REGTAIL(pRExC_state, ret, w);
11533 		if (!SIZE_ONLY && RExC_extralen) {
11534 		    reginsert(pRExC_state, LONGJMP,ret, depth+1);
11535 		    reginsert(pRExC_state, NOTHING,ret, depth+1);
11536 		    NEXT_OFF(ret) = 3;	/* Go over LONGJMP. */
11537 		}
11538 		reginsert(pRExC_state, CURLYX,ret, depth+1);
11539                                 /* MJD hk */
11540                 Set_Node_Offset(ret, parse_start+1);
11541                 Set_Node_Length(ret,
11542                                 op == '{' ? (RExC_parse - parse_start) : 1);
11543 
11544 		if (!SIZE_ONLY && RExC_extralen)
11545 		    NEXT_OFF(ret) = 3;	/* Go over NOTHING to LONGJMP. */
11546                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
11547 		if (SIZE_ONLY)
11548 		    RExC_whilem_seen++, RExC_extralen += 3;
11549                 MARK_NAUGHTY_EXP(1, 4);     /* compound interest */
11550 	    }
11551 	    ret->flags = 0;
11552 
11553 	    if (min > 0)
11554 		*flagp = WORST;
11555 	    if (max > 0)
11556 		*flagp |= HASWIDTH;
11557 	    if (!SIZE_ONLY) {
11558 		ARG1_SET(ret, (U16)min);
11559 		ARG2_SET(ret, (U16)max);
11560 	    }
11561             if (max == REG_INFTY)
11562                 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
11563 
11564 	    goto nest_check;
11565 	}
11566     }
11567 
11568     if (!ISMULT1(op)) {
11569 	*flagp = flags;
11570 	return(ret);
11571     }
11572 
11573 #if 0				/* Now runtime fix should be reliable. */
11574 
11575     /* if this is reinstated, don't forget to put this back into perldiag:
11576 
11577 	    =item Regexp *+ operand could be empty at {#} in regex m/%s/
11578 
11579 	   (F) The part of the regexp subject to either the * or + quantifier
11580            could match an empty string. The {#} shows in the regular
11581            expression about where the problem was discovered.
11582 
11583     */
11584 
11585     if (!(flags&HASWIDTH) && op != '?')
11586       vFAIL("Regexp *+ operand could be empty");
11587 #endif
11588 
11589 #ifdef RE_TRACK_PATTERN_OFFSETS
11590     parse_start = RExC_parse;
11591 #endif
11592     nextchar(pRExC_state);
11593 
11594     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
11595 
11596     if (op == '*') {
11597 	min = 0;
11598 	goto do_curly;
11599     }
11600     else if (op == '+') {
11601 	min = 1;
11602 	goto do_curly;
11603     }
11604     else if (op == '?') {
11605 	min = 0; max = 1;
11606 	goto do_curly;
11607     }
11608   nest_check:
11609     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
11610 	SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
11611 	ckWARN2reg(RExC_parse,
11612 		   "%"UTF8f" matches null string many times",
11613 		   UTF8fARG(UTF, (RExC_parse >= origparse
11614                                  ? RExC_parse - origparse
11615                                  : 0),
11616 		   origparse));
11617 	(void)ReREFCNT_inc(RExC_rx_sv);
11618     }
11619 
11620     if (*RExC_parse == '?') {
11621 	nextchar(pRExC_state);
11622 	reginsert(pRExC_state, MINMOD, ret, depth+1);
11623         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
11624     }
11625     else if (*RExC_parse == '+') {
11626         regnode *ender;
11627         nextchar(pRExC_state);
11628         ender = reg_node(pRExC_state, SUCCEED);
11629         REGTAIL(pRExC_state, ret, ender);
11630         reginsert(pRExC_state, SUSPEND, ret, depth+1);
11631         ret->flags = 0;
11632         ender = reg_node(pRExC_state, TAIL);
11633         REGTAIL(pRExC_state, ret, ender);
11634     }
11635 
11636     if (ISMULT2(RExC_parse)) {
11637 	RExC_parse++;
11638 	vFAIL("Nested quantifiers");
11639     }
11640 
11641     return(ret);
11642 }
11643 
11644 STATIC bool
11645 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
11646                 regnode ** node_p,
11647                 UV * code_point_p,
11648                 int * cp_count,
11649                 I32 * flagp,
11650                 const bool strict,
11651                 const U32 depth
11652     )
11653 {
11654  /* This routine teases apart the various meanings of \N and returns
11655   * accordingly.  The input parameters constrain which meaning(s) is/are valid
11656   * in the current context.
11657   *
11658   * Exactly one of <node_p> and <code_point_p> must be non-NULL.
11659   *
11660   * If <code_point_p> is not NULL, the context is expecting the result to be a
11661   * single code point.  If this \N instance turns out to a single code point,
11662   * the function returns TRUE and sets *code_point_p to that code point.
11663   *
11664   * If <node_p> is not NULL, the context is expecting the result to be one of
11665   * the things representable by a regnode.  If this \N instance turns out to be
11666   * one such, the function generates the regnode, returns TRUE and sets *node_p
11667   * to point to that regnode.
11668   *
11669   * If this instance of \N isn't legal in any context, this function will
11670   * generate a fatal error and not return.
11671   *
11672   * On input, RExC_parse should point to the first char following the \N at the
11673   * time of the call.  On successful return, RExC_parse will have been updated
11674   * to point to just after the sequence identified by this routine.  Also
11675   * *flagp has been updated as needed.
11676   *
11677   * When there is some problem with the current context and this \N instance,
11678   * the function returns FALSE, without advancing RExC_parse, nor setting
11679   * *node_p, nor *code_point_p, nor *flagp.
11680   *
11681   * If <cp_count> is not NULL, the caller wants to know the length (in code
11682   * points) that this \N sequence matches.  This is set even if the function
11683   * returns FALSE, as detailed below.
11684   *
11685   * There are 5 possibilities here, as detailed in the next 5 paragraphs.
11686   *
11687   * Probably the most common case is for the \N to specify a single code point.
11688   * *cp_count will be set to 1, and *code_point_p will be set to that code
11689   * point.
11690   *
11691   * Another possibility is for the input to be an empty \N{}, which for
11692   * backwards compatibility we accept.  *cp_count will be set to 0. *node_p
11693   * will be set to a generated NOTHING node.
11694   *
11695   * Still another possibility is for the \N to mean [^\n]. *cp_count will be
11696   * set to 0. *node_p will be set to a generated REG_ANY node.
11697   *
11698   * The fourth possibility is that \N resolves to a sequence of more than one
11699   * code points.  *cp_count will be set to the number of code points in the
11700   * sequence. *node_p * will be set to a generated node returned by this
11701   * function calling S_reg().
11702   *
11703   * The final possibility is that it is premature to be calling this function;
11704   * that pass1 needs to be restarted.  This can happen when this changes from
11705   * /d to /u rules, or when the pattern needs to be upgraded to UTF-8.  The
11706   * latter occurs only when the fourth possibility would otherwise be in
11707   * effect, and is because one of those code points requires the pattern to be
11708   * recompiled as UTF-8.  The function returns FALSE, and sets the
11709   * RESTART_PASS1 and NEED_UTF8 flags in *flagp, as appropriate.  When this
11710   * happens, the caller needs to desist from continuing parsing, and return
11711   * this information to its caller.  This is not set for when there is only one
11712   * code point, as this can be called as part of an ANYOF node, and they can
11713   * store above-Latin1 code points without the pattern having to be in UTF-8.
11714   *
11715   * For non-single-quoted regexes, the tokenizer has resolved character and
11716   * sequence names inside \N{...} into their Unicode values, normalizing the
11717   * result into what we should see here: '\N{U+c1.c2...}', where c1... are the
11718   * hex-represented code points in the sequence.  This is done there because
11719   * the names can vary based on what charnames pragma is in scope at the time,
11720   * so we need a way to take a snapshot of what they resolve to at the time of
11721   * the original parse. [perl #56444].
11722   *
11723   * That parsing is skipped for single-quoted regexes, so we may here get
11724   * '\N{NAME}'.  This is a fatal error.  These names have to be resolved by the
11725   * parser.  But if the single-quoted regex is something like '\N{U+41}', that
11726   * is legal and handled here.  The code point is Unicode, and has to be
11727   * translated into the native character set for non-ASCII platforms.
11728   */
11729 
11730     char * endbrace;    /* points to '}' following the name */
11731     char *endchar;	/* Points to '.' or '}' ending cur char in the input
11732                            stream */
11733     char* p = RExC_parse; /* Temporary */
11734 
11735     GET_RE_DEBUG_FLAGS_DECL;
11736 
11737     PERL_ARGS_ASSERT_GROK_BSLASH_N;
11738 
11739     GET_RE_DEBUG_FLAGS;
11740 
11741     assert(cBOOL(node_p) ^ cBOOL(code_point_p));  /* Exactly one should be set */
11742     assert(! (node_p && cp_count));               /* At most 1 should be set */
11743 
11744     if (cp_count) {     /* Initialize return for the most common case */
11745         *cp_count = 1;
11746     }
11747 
11748     /* The [^\n] meaning of \N ignores spaces and comments under the /x
11749      * modifier.  The other meanings do not, so use a temporary until we find
11750      * out which we are being called with */
11751     skip_to_be_ignored_text(pRExC_state, &p,
11752                             FALSE /* Don't force to /x */ );
11753 
11754     /* Disambiguate between \N meaning a named character versus \N meaning
11755      * [^\n].  The latter is assumed when the {...} following the \N is a legal
11756      * quantifier, or there is no '{' at all */
11757     if (*p != '{' || regcurly(p)) {
11758 	RExC_parse = p;
11759         if (cp_count) {
11760             *cp_count = -1;
11761         }
11762 
11763 	if (! node_p) {
11764             return FALSE;
11765         }
11766 
11767 	*node_p = reg_node(pRExC_state, REG_ANY);
11768 	*flagp |= HASWIDTH|SIMPLE;
11769 	MARK_NAUGHTY(1);
11770         Set_Node_Length(*node_p, 1); /* MJD */
11771 	return TRUE;
11772     }
11773 
11774     /* Here, we have decided it should be a named character or sequence */
11775 
11776     /* The test above made sure that the next real character is a '{', but
11777      * under the /x modifier, it could be separated by space (or a comment and
11778      * \n) and this is not allowed (for consistency with \x{...} and the
11779      * tokenizer handling of \N{NAME}). */
11780     if (*RExC_parse != '{') {
11781 	vFAIL("Missing braces on \\N{}");
11782     }
11783 
11784     RExC_parse++;	/* Skip past the '{' */
11785 
11786     if (! (endbrace = strchr(RExC_parse, '}'))  /* no trailing brace */
11787 	|| ! (endbrace == RExC_parse		/* nothing between the {} */
11788               || (endbrace - RExC_parse >= 2	/* U+ (bad hex is checked... */
11789                   && strnEQ(RExC_parse, "U+", 2)))) /* ... below for a better
11790                                                        error msg) */
11791     {
11792 	if (endbrace) RExC_parse = endbrace;	/* position msg's '<--HERE' */
11793 	vFAIL("\\N{NAME} must be resolved by the lexer");
11794     }
11795 
11796     REQUIRE_UNI_RULES(flagp, FALSE); /* Unicode named chars imply Unicode
11797                                         semantics */
11798 
11799     if (endbrace == RExC_parse) {   /* empty: \N{} */
11800         if (strict) {
11801             RExC_parse++;   /* Position after the "}" */
11802             vFAIL("Zero length \\N{}");
11803         }
11804         if (cp_count) {
11805             *cp_count = 0;
11806         }
11807         nextchar(pRExC_state);
11808 	if (! node_p) {
11809             return FALSE;
11810         }
11811 
11812         *node_p = reg_node(pRExC_state,NOTHING);
11813         return TRUE;
11814     }
11815 
11816     RExC_parse += 2;	/* Skip past the 'U+' */
11817 
11818     /* Because toke.c has generated a special construct for us guaranteed not
11819      * to have NULs, we can use a str function */
11820     endchar = RExC_parse + strcspn(RExC_parse, ".}");
11821 
11822     /* Code points are separated by dots.  If none, there is only one code
11823      * point, and is terminated by the brace */
11824 
11825     if (endchar >= endbrace) {
11826 	STRLEN length_of_hex;
11827 	I32 grok_hex_flags;
11828 
11829         /* Here, exactly one code point.  If that isn't what is wanted, fail */
11830         if (! code_point_p) {
11831             RExC_parse = p;
11832             return FALSE;
11833         }
11834 
11835         /* Convert code point from hex */
11836 	length_of_hex = (STRLEN)(endchar - RExC_parse);
11837 	grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
11838                            | PERL_SCAN_DISALLOW_PREFIX
11839 
11840                              /* No errors in the first pass (See [perl
11841                               * #122671].)  We let the code below find the
11842                               * errors when there are multiple chars. */
11843                            | ((SIZE_ONLY)
11844                               ? PERL_SCAN_SILENT_ILLDIGIT
11845                               : 0);
11846 
11847         /* This routine is the one place where both single- and double-quotish
11848          * \N{U+xxxx} are evaluated.  The value is a Unicode code point which
11849          * must be converted to native. */
11850 	*code_point_p = UNI_TO_NATIVE(grok_hex(RExC_parse,
11851                                          &length_of_hex,
11852                                          &grok_hex_flags,
11853                                          NULL));
11854 
11855 	/* The tokenizer should have guaranteed validity, but it's possible to
11856          * bypass it by using single quoting, so check.  Don't do the check
11857          * here when there are multiple chars; we do it below anyway. */
11858         if (length_of_hex == 0
11859             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
11860         {
11861             RExC_parse += length_of_hex;	/* Includes all the valid */
11862             RExC_parse += (RExC_orig_utf8)	/* point to after 1st invalid */
11863                             ? UTF8SKIP(RExC_parse)
11864                             : 1;
11865             /* Guard against malformed utf8 */
11866             if (RExC_parse >= endchar) {
11867                 RExC_parse = endchar;
11868             }
11869             vFAIL("Invalid hexadecimal number in \\N{U+...}");
11870         }
11871 
11872         RExC_parse = endbrace + 1;
11873         return TRUE;
11874     }
11875     else {  /* Is a multiple character sequence */
11876 	SV * substitute_parse;
11877 	STRLEN len;
11878 	char *orig_end = RExC_end;
11879 	char *save_start = RExC_start;
11880         I32 flags;
11881 
11882         /* Count the code points, if desired, in the sequence */
11883         if (cp_count) {
11884             *cp_count = 0;
11885             while (RExC_parse < endbrace) {
11886                 /* Point to the beginning of the next character in the sequence. */
11887                 RExC_parse = endchar + 1;
11888                 endchar = RExC_parse + strcspn(RExC_parse, ".}");
11889                 (*cp_count)++;
11890             }
11891         }
11892 
11893         /* Fail if caller doesn't want to handle a multi-code-point sequence.
11894          * But don't backup up the pointer if the caller want to know how many
11895          * code points there are (they can then handle things) */
11896         if (! node_p) {
11897             if (! cp_count) {
11898                 RExC_parse = p;
11899             }
11900             return FALSE;
11901         }
11902 
11903 	/* What is done here is to convert this to a sub-pattern of the form
11904          * \x{char1}\x{char2}...  and then call reg recursively to parse it
11905          * (enclosing in "(?: ... )" ).  That way, it retains its atomicness,
11906          * while not having to worry about special handling that some code
11907          * points may have. */
11908 
11909 	substitute_parse = newSVpvs("?:");
11910 
11911 	while (RExC_parse < endbrace) {
11912 
11913 	    /* Convert to notation the rest of the code understands */
11914 	    sv_catpv(substitute_parse, "\\x{");
11915 	    sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
11916 	    sv_catpv(substitute_parse, "}");
11917 
11918 	    /* Point to the beginning of the next character in the sequence. */
11919 	    RExC_parse = endchar + 1;
11920 	    endchar = RExC_parse + strcspn(RExC_parse, ".}");
11921 
11922 	}
11923         sv_catpv(substitute_parse, ")");
11924 
11925         len = SvCUR(substitute_parse);
11926 
11927 	/* Don't allow empty number */
11928 	if (len < (STRLEN) 8) {
11929             RExC_parse = endbrace;
11930 	    vFAIL("Invalid hexadecimal number in \\N{U+...}");
11931 	}
11932 
11933         RExC_parse = RExC_start = RExC_adjusted_start
11934                                               = SvPV_nolen(substitute_parse);
11935 	RExC_end = RExC_parse + len;
11936 
11937         /* The values are Unicode, and therefore not subject to recoding, but
11938          * have to be converted to native on a non-Unicode (meaning non-ASCII)
11939          * platform. */
11940 	RExC_override_recoding = 1;
11941 #ifdef EBCDIC
11942         RExC_recode_x_to_native = 1;
11943 #endif
11944 
11945         if (node_p) {
11946             if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
11947                 if (flags & (RESTART_PASS1|NEED_UTF8)) {
11948                     *flagp = flags & (RESTART_PASS1|NEED_UTF8);
11949                     return FALSE;
11950                 }
11951                 FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
11952                     (UV) flags);
11953             }
11954             *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11955         }
11956 
11957         /* Restore the saved values */
11958 	RExC_start = RExC_adjusted_start = save_start;
11959 	RExC_parse = endbrace;
11960 	RExC_end = orig_end;
11961 	RExC_override_recoding = 0;
11962 #ifdef EBCDIC
11963         RExC_recode_x_to_native = 0;
11964 #endif
11965 
11966         SvREFCNT_dec_NN(substitute_parse);
11967         nextchar(pRExC_state);
11968 
11969         return TRUE;
11970     }
11971 }
11972 
11973 
11974 /*
11975  * reg_recode
11976  *
11977  * It returns the code point in utf8 for the value in *encp.
11978  *    value: a code value in the source encoding
11979  *    encp:  a pointer to an Encode object
11980  *
11981  * If the result from Encode is not a single character,
11982  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
11983  */
11984 STATIC UV
11985 S_reg_recode(pTHX_ const U8 value, SV **encp)
11986 {
11987     STRLEN numlen = 1;
11988     SV * const sv = newSVpvn_flags((const char *) &value, numlen, SVs_TEMP);
11989     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
11990     const STRLEN newlen = SvCUR(sv);
11991     UV uv = UNICODE_REPLACEMENT;
11992 
11993     PERL_ARGS_ASSERT_REG_RECODE;
11994 
11995     if (newlen)
11996 	uv = SvUTF8(sv)
11997 	     ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
11998 	     : *(U8*)s;
11999 
12000     if (!newlen || numlen != newlen) {
12001 	uv = UNICODE_REPLACEMENT;
12002 	*encp = NULL;
12003     }
12004     return uv;
12005 }
12006 
12007 PERL_STATIC_INLINE U8
12008 S_compute_EXACTish(RExC_state_t *pRExC_state)
12009 {
12010     U8 op;
12011 
12012     PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
12013 
12014     if (! FOLD) {
12015         return (LOC)
12016                 ? EXACTL
12017                 : EXACT;
12018     }
12019 
12020     op = get_regex_charset(RExC_flags);
12021     if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
12022         op--; /* /a is same as /u, and map /aa's offset to what /a's would have
12023                  been, so there is no hole */
12024     }
12025 
12026     return op + EXACTF;
12027 }
12028 
12029 PERL_STATIC_INLINE void
12030 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
12031                          regnode *node, I32* flagp, STRLEN len, UV code_point,
12032                          bool downgradable)
12033 {
12034     /* This knows the details about sizing an EXACTish node, setting flags for
12035      * it (by setting <*flagp>, and potentially populating it with a single
12036      * character.
12037      *
12038      * If <len> (the length in bytes) is non-zero, this function assumes that
12039      * the node has already been populated, and just does the sizing.  In this
12040      * case <code_point> should be the final code point that has already been
12041      * placed into the node.  This value will be ignored except that under some
12042      * circumstances <*flagp> is set based on it.
12043      *
12044      * If <len> is zero, the function assumes that the node is to contain only
12045      * the single character given by <code_point> and calculates what <len>
12046      * should be.  In pass 1, it sizes the node appropriately.  In pass 2, it
12047      * additionally will populate the node's STRING with <code_point> or its
12048      * fold if folding.
12049      *
12050      * In both cases <*flagp> is appropriately set
12051      *
12052      * It knows that under FOLD, the Latin Sharp S and UTF characters above
12053      * 255, must be folded (the former only when the rules indicate it can
12054      * match 'ss')
12055      *
12056      * When it does the populating, it looks at the flag 'downgradable'.  If
12057      * true with a node that folds, it checks if the single code point
12058      * participates in a fold, and if not downgrades the node to an EXACT.
12059      * This helps the optimizer */
12060 
12061     bool len_passed_in = cBOOL(len != 0);
12062     U8 character[UTF8_MAXBYTES_CASE+1];
12063 
12064     PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
12065 
12066     /* Don't bother to check for downgrading in PASS1, as it doesn't make any
12067      * sizing difference, and is extra work that is thrown away */
12068     if (downgradable && ! PASS2) {
12069         downgradable = FALSE;
12070     }
12071 
12072     if (! len_passed_in) {
12073         if (UTF) {
12074             if (UVCHR_IS_INVARIANT(code_point)) {
12075                 if (LOC || ! FOLD) {    /* /l defers folding until runtime */
12076                     *character = (U8) code_point;
12077                 }
12078                 else { /* Here is /i and not /l. (toFOLD() is defined on just
12079                           ASCII, which isn't the same thing as INVARIANT on
12080                           EBCDIC, but it works there, as the extra invariants
12081                           fold to themselves) */
12082                     *character = toFOLD((U8) code_point);
12083 
12084                     /* We can downgrade to an EXACT node if this character
12085                      * isn't a folding one.  Note that this assumes that
12086                      * nothing above Latin1 folds to some other invariant than
12087                      * one of these alphabetics; otherwise we would also have
12088                      * to check:
12089                      *  && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
12090                      *      || ASCII_FOLD_RESTRICTED))
12091                      */
12092                     if (downgradable && PL_fold[code_point] == code_point) {
12093                         OP(node) = EXACT;
12094                     }
12095                 }
12096                 len = 1;
12097             }
12098             else if (FOLD && (! LOC
12099                               || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point)))
12100             {   /* Folding, and ok to do so now */
12101                 UV folded = _to_uni_fold_flags(
12102                                    code_point,
12103                                    character,
12104                                    &len,
12105                                    FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
12106                                                       ? FOLD_FLAGS_NOMIX_ASCII
12107                                                       : 0));
12108                 if (downgradable
12109                     && folded == code_point /* This quickly rules out many
12110                                                cases, avoiding the
12111                                                _invlist_contains_cp() overhead
12112                                                for those.  */
12113                     && ! _invlist_contains_cp(PL_utf8_foldable, code_point))
12114                 {
12115                     OP(node) = (LOC)
12116                                ? EXACTL
12117                                : EXACT;
12118                 }
12119             }
12120             else if (code_point <= MAX_UTF8_TWO_BYTE) {
12121 
12122                 /* Not folding this cp, and can output it directly */
12123                 *character = UTF8_TWO_BYTE_HI(code_point);
12124                 *(character + 1) = UTF8_TWO_BYTE_LO(code_point);
12125                 len = 2;
12126             }
12127             else {
12128                 uvchr_to_utf8( character, code_point);
12129                 len = UTF8SKIP(character);
12130             }
12131         } /* Else pattern isn't UTF8.  */
12132         else if (! FOLD) {
12133             *character = (U8) code_point;
12134             len = 1;
12135         } /* Else is folded non-UTF8 */
12136 #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
12137    || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
12138                                       || UNICODE_DOT_DOT_VERSION > 0)
12139         else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) {
12140 #else
12141         else if (1) {
12142 #endif
12143             /* We don't fold any non-UTF8 except possibly the Sharp s  (see
12144              * comments at join_exact()); */
12145             *character = (U8) code_point;
12146             len = 1;
12147 
12148             /* Can turn into an EXACT node if we know the fold at compile time,
12149              * and it folds to itself and doesn't particpate in other folds */
12150             if (downgradable
12151                 && ! LOC
12152                 && PL_fold_latin1[code_point] == code_point
12153                 && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
12154                     || (isASCII(code_point) && ASCII_FOLD_RESTRICTED)))
12155             {
12156                 OP(node) = EXACT;
12157             }
12158         } /* else is Sharp s.  May need to fold it */
12159         else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) {
12160             *character = 's';
12161             *(character + 1) = 's';
12162             len = 2;
12163         }
12164         else {
12165             *character = LATIN_SMALL_LETTER_SHARP_S;
12166             len = 1;
12167         }
12168     }
12169 
12170     if (SIZE_ONLY) {
12171         RExC_size += STR_SZ(len);
12172     }
12173     else {
12174         RExC_emit += STR_SZ(len);
12175         STR_LEN(node) = len;
12176         if (! len_passed_in) {
12177             Copy((char *) character, STRING(node), len, char);
12178         }
12179     }
12180 
12181     *flagp |= HASWIDTH;
12182 
12183     /* A single character node is SIMPLE, except for the special-cased SHARP S
12184      * under /di. */
12185     if ((len == 1 || (UTF && len == UVCHR_SKIP(code_point)))
12186 #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
12187    || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
12188                                       || UNICODE_DOT_DOT_VERSION > 0)
12189         && ( code_point != LATIN_SMALL_LETTER_SHARP_S
12190             || ! FOLD || ! DEPENDS_SEMANTICS)
12191 #endif
12192     ) {
12193         *flagp |= SIMPLE;
12194     }
12195 
12196     /* The OP may not be well defined in PASS1 */
12197     if (PASS2 && OP(node) == EXACTFL) {
12198         RExC_contains_locale = 1;
12199     }
12200 }
12201 
12202 
12203 /* Parse backref decimal value, unless it's too big to sensibly be a backref,
12204  * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
12205 
12206 static I32
12207 S_backref_value(char *p)
12208 {
12209     const char* endptr;
12210     UV val;
12211     if (grok_atoUV(p, &val, &endptr) && val <= I32_MAX)
12212         return (I32)val;
12213     return I32_MAX;
12214 }
12215 
12216 
12217 /*
12218  - regatom - the lowest level
12219 
12220    Try to identify anything special at the start of the pattern. If there
12221    is, then handle it as required. This may involve generating a single regop,
12222    such as for an assertion; or it may involve recursing, such as to
12223    handle a () structure.
12224 
12225    If the string doesn't start with something special then we gobble up
12226    as much literal text as we can.
12227 
12228    Once we have been able to handle whatever type of thing started the
12229    sequence, we return.
12230 
12231    Note: we have to be careful with escapes, as they can be both literal
12232    and special, and in the case of \10 and friends, context determines which.
12233 
12234    A summary of the code structure is:
12235 
12236    switch (first_byte) {
12237 	cases for each special:
12238 	    handle this special;
12239 	    break;
12240 	case '\\':
12241 	    switch (2nd byte) {
12242 		cases for each unambiguous special:
12243 		    handle this special;
12244 		    break;
12245 		cases for each ambigous special/literal:
12246 		    disambiguate;
12247 		    if (special)  handle here
12248 		    else goto defchar;
12249 		default: // unambiguously literal:
12250 		    goto defchar;
12251 	    }
12252 	default:  // is a literal char
12253 	    // FALL THROUGH
12254 	defchar:
12255 	    create EXACTish node for literal;
12256 	    while (more input and node isn't full) {
12257 		switch (input_byte) {
12258 		   cases for each special;
12259                        make sure parse pointer is set so that the next call to
12260                            regatom will see this special first
12261                        goto loopdone; // EXACTish node terminated by prev. char
12262 		   default:
12263 		       append char to EXACTISH node;
12264 		}
12265 	        get next input byte;
12266 	    }
12267         loopdone:
12268    }
12269    return the generated node;
12270 
12271    Specifically there are two separate switches for handling
12272    escape sequences, with the one for handling literal escapes requiring
12273    a dummy entry for all of the special escapes that are actually handled
12274    by the other.
12275 
12276    Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
12277    TRYAGAIN.
12278    Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs to be
12279    restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
12280    Otherwise does not return NULL.
12281 */
12282 
12283 STATIC regnode *
12284 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
12285 {
12286     regnode *ret = NULL;
12287     I32 flags = 0;
12288     char *parse_start;
12289     U8 op;
12290     int invert = 0;
12291     U8 arg;
12292 
12293     GET_RE_DEBUG_FLAGS_DECL;
12294 
12295     *flagp = WORST;		/* Tentatively. */
12296 
12297     DEBUG_PARSE("atom");
12298 
12299     PERL_ARGS_ASSERT_REGATOM;
12300 
12301   tryagain:
12302     parse_start = RExC_parse;
12303     assert(RExC_parse < RExC_end);
12304     switch ((U8)*RExC_parse) {
12305     case '^':
12306 	RExC_seen_zerolen++;
12307 	nextchar(pRExC_state);
12308 	if (RExC_flags & RXf_PMf_MULTILINE)
12309 	    ret = reg_node(pRExC_state, MBOL);
12310 	else
12311 	    ret = reg_node(pRExC_state, SBOL);
12312         Set_Node_Length(ret, 1); /* MJD */
12313 	break;
12314     case '$':
12315 	nextchar(pRExC_state);
12316 	if (*RExC_parse)
12317 	    RExC_seen_zerolen++;
12318 	if (RExC_flags & RXf_PMf_MULTILINE)
12319 	    ret = reg_node(pRExC_state, MEOL);
12320 	else
12321 	    ret = reg_node(pRExC_state, SEOL);
12322         Set_Node_Length(ret, 1); /* MJD */
12323 	break;
12324     case '.':
12325 	nextchar(pRExC_state);
12326 	if (RExC_flags & RXf_PMf_SINGLELINE)
12327 	    ret = reg_node(pRExC_state, SANY);
12328 	else
12329 	    ret = reg_node(pRExC_state, REG_ANY);
12330 	*flagp |= HASWIDTH|SIMPLE;
12331 	MARK_NAUGHTY(1);
12332         Set_Node_Length(ret, 1); /* MJD */
12333 	break;
12334     case '[':
12335     {
12336 	char * const oregcomp_parse = ++RExC_parse;
12337         ret = regclass(pRExC_state, flagp,depth+1,
12338                        FALSE, /* means parse the whole char class */
12339                        TRUE, /* allow multi-char folds */
12340                        FALSE, /* don't silence non-portable warnings. */
12341                        (bool) RExC_strict,
12342                        TRUE, /* Allow an optimized regnode result */
12343                        NULL,
12344                        NULL);
12345         if (ret == NULL) {
12346             if (*flagp & (RESTART_PASS1|NEED_UTF8))
12347                 return NULL;
12348             FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
12349                   (UV) *flagp);
12350         }
12351 	if (*RExC_parse != ']') {
12352 	    RExC_parse = oregcomp_parse;
12353 	    vFAIL("Unmatched [");
12354 	}
12355 	nextchar(pRExC_state);
12356         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
12357 	break;
12358     }
12359     case '(':
12360 	nextchar(pRExC_state);
12361         ret = reg(pRExC_state, 2, &flags,depth+1);
12362 	if (ret == NULL) {
12363 		if (flags & TRYAGAIN) {
12364 		    if (RExC_parse >= RExC_end) {
12365 			 /* Make parent create an empty node if needed. */
12366 			*flagp |= TRYAGAIN;
12367 			return(NULL);
12368 		    }
12369 		    goto tryagain;
12370 		}
12371                 if (flags & (RESTART_PASS1|NEED_UTF8)) {
12372                     *flagp = flags & (RESTART_PASS1|NEED_UTF8);
12373                     return NULL;
12374                 }
12375                 FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"",
12376                                                                  (UV) flags);
12377 	}
12378 	*flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
12379 	break;
12380     case '|':
12381     case ')':
12382 	if (flags & TRYAGAIN) {
12383 	    *flagp |= TRYAGAIN;
12384 	    return NULL;
12385 	}
12386 	vFAIL("Internal urp");
12387 				/* Supposed to be caught earlier. */
12388 	break;
12389     case '?':
12390     case '+':
12391     case '*':
12392 	RExC_parse++;
12393 	vFAIL("Quantifier follows nothing");
12394 	break;
12395     case '\\':
12396 	/* Special Escapes
12397 
12398 	   This switch handles escape sequences that resolve to some kind
12399 	   of special regop and not to literal text. Escape sequnces that
12400 	   resolve to literal text are handled below in the switch marked
12401 	   "Literal Escapes".
12402 
12403 	   Every entry in this switch *must* have a corresponding entry
12404 	   in the literal escape switch. However, the opposite is not
12405 	   required, as the default for this switch is to jump to the
12406 	   literal text handling code.
12407 	*/
12408 	RExC_parse++;
12409 	switch ((U8)*RExC_parse) {
12410 	/* Special Escapes */
12411 	case 'A':
12412 	    RExC_seen_zerolen++;
12413 	    ret = reg_node(pRExC_state, SBOL);
12414             /* SBOL is shared with /^/ so we set the flags so we can tell
12415              * /\A/ from /^/ in split. We check ret because first pass we
12416              * have no regop struct to set the flags on. */
12417             if (PASS2)
12418                 ret->flags = 1;
12419 	    *flagp |= SIMPLE;
12420 	    goto finish_meta_pat;
12421 	case 'G':
12422 	    ret = reg_node(pRExC_state, GPOS);
12423             RExC_seen |= REG_GPOS_SEEN;
12424 	    *flagp |= SIMPLE;
12425 	    goto finish_meta_pat;
12426 	case 'K':
12427 	    RExC_seen_zerolen++;
12428 	    ret = reg_node(pRExC_state, KEEPS);
12429 	    *flagp |= SIMPLE;
12430 	    /* XXX:dmq : disabling in-place substitution seems to
12431 	     * be necessary here to avoid cases of memory corruption, as
12432 	     * with: C<$_="x" x 80; s/x\K/y/> -- rgs
12433 	     */
12434             RExC_seen |= REG_LOOKBEHIND_SEEN;
12435 	    goto finish_meta_pat;
12436 	case 'Z':
12437 	    ret = reg_node(pRExC_state, SEOL);
12438 	    *flagp |= SIMPLE;
12439 	    RExC_seen_zerolen++;		/* Do not optimize RE away */
12440 	    goto finish_meta_pat;
12441 	case 'z':
12442 	    ret = reg_node(pRExC_state, EOS);
12443 	    *flagp |= SIMPLE;
12444 	    RExC_seen_zerolen++;		/* Do not optimize RE away */
12445 	    goto finish_meta_pat;
12446 	case 'C':
12447 	    vFAIL("\\C no longer supported");
12448 	case 'X':
12449 	    ret = reg_node(pRExC_state, CLUMP);
12450 	    *flagp |= HASWIDTH;
12451 	    goto finish_meta_pat;
12452 
12453 	case 'W':
12454             invert = 1;
12455             /* FALLTHROUGH */
12456 	case 'w':
12457             arg = ANYOF_WORDCHAR;
12458             goto join_posix;
12459 
12460 	case 'B':
12461             invert = 1;
12462             /* FALLTHROUGH */
12463 	case 'b':
12464           {
12465 	    regex_charset charset = get_regex_charset(RExC_flags);
12466 
12467 	    RExC_seen_zerolen++;
12468             RExC_seen |= REG_LOOKBEHIND_SEEN;
12469 	    op = BOUND + charset;
12470 
12471             if (op == BOUNDL) {
12472                 RExC_contains_locale = 1;
12473             }
12474 
12475 	    ret = reg_node(pRExC_state, op);
12476 	    *flagp |= SIMPLE;
12477 	    if (RExC_parse >= RExC_end || *(RExC_parse + 1) != '{') {
12478                 FLAGS(ret) = TRADITIONAL_BOUND;
12479                 if (PASS2 && op > BOUNDA) {  /* /aa is same as /a */
12480                     OP(ret) = BOUNDA;
12481                 }
12482             }
12483             else {
12484                 STRLEN length;
12485                 char name = *RExC_parse;
12486                 char * endbrace;
12487                 RExC_parse += 2;
12488                 endbrace = strchr(RExC_parse, '}');
12489 
12490                 if (! endbrace) {
12491                     vFAIL2("Missing right brace on \\%c{}", name);
12492                 }
12493                 /* XXX Need to decide whether to take spaces or not.  Should be
12494                  * consistent with \p{}, but that currently is SPACE, which
12495                  * means vertical too, which seems wrong
12496                  * while (isBLANK(*RExC_parse)) {
12497                     RExC_parse++;
12498                 }*/
12499                 if (endbrace == RExC_parse) {
12500                     RExC_parse++;  /* After the '}' */
12501                     vFAIL2("Empty \\%c{}", name);
12502                 }
12503                 length = endbrace - RExC_parse;
12504                 /*while (isBLANK(*(RExC_parse + length - 1))) {
12505                     length--;
12506                 }*/
12507                 switch (*RExC_parse) {
12508                     case 'g':
12509                         if (length != 1
12510                             && (length != 3 || strnNE(RExC_parse + 1, "cb", 2)))
12511                         {
12512                             goto bad_bound_type;
12513                         }
12514                         FLAGS(ret) = GCB_BOUND;
12515                         break;
12516                     case 'l':
12517                         if (length != 2 || *(RExC_parse + 1) != 'b') {
12518                             goto bad_bound_type;
12519                         }
12520                         FLAGS(ret) = LB_BOUND;
12521                         break;
12522                     case 's':
12523                         if (length != 2 || *(RExC_parse + 1) != 'b') {
12524                             goto bad_bound_type;
12525                         }
12526                         FLAGS(ret) = SB_BOUND;
12527                         break;
12528                     case 'w':
12529                         if (length != 2 || *(RExC_parse + 1) != 'b') {
12530                             goto bad_bound_type;
12531                         }
12532                         FLAGS(ret) = WB_BOUND;
12533                         break;
12534                     default:
12535                       bad_bound_type:
12536                         RExC_parse = endbrace;
12537 			vFAIL2utf8f(
12538                             "'%"UTF8f"' is an unknown bound type",
12539 			    UTF8fARG(UTF, length, endbrace - length));
12540                         NOT_REACHED; /*NOTREACHED*/
12541                 }
12542                 RExC_parse = endbrace;
12543                 REQUIRE_UNI_RULES(flagp, NULL);
12544 
12545                 if (PASS2 && op >= BOUNDA) {  /* /aa is same as /a */
12546                     OP(ret) = BOUNDU;
12547                     length += 4;
12548 
12549                     /* Don't have to worry about UTF-8, in this message because
12550                      * to get here the contents of the \b must be ASCII */
12551                     ckWARN4reg(RExC_parse + 1,  /* Include the '}' in msg */
12552                               "Using /u for '%.*s' instead of /%s",
12553                               (unsigned) length,
12554                               endbrace - length + 1,
12555                               (charset == REGEX_ASCII_RESTRICTED_CHARSET)
12556                               ? ASCII_RESTRICT_PAT_MODS
12557                               : ASCII_MORE_RESTRICT_PAT_MODS);
12558                 }
12559 	    }
12560 
12561             if (PASS2 && invert) {
12562                 OP(ret) += NBOUND - BOUND;
12563             }
12564 	    goto finish_meta_pat;
12565           }
12566 
12567 	case 'D':
12568             invert = 1;
12569             /* FALLTHROUGH */
12570 	case 'd':
12571             arg = ANYOF_DIGIT;
12572             if (! DEPENDS_SEMANTICS) {
12573                 goto join_posix;
12574             }
12575 
12576             /* \d doesn't have any matches in the upper Latin1 range, hence /d
12577              * is equivalent to /u.  Changing to /u saves some branches at
12578              * runtime */
12579             op = POSIXU;
12580             goto join_posix_op_known;
12581 
12582 	case 'R':
12583 	    ret = reg_node(pRExC_state, LNBREAK);
12584 	    *flagp |= HASWIDTH|SIMPLE;
12585 	    goto finish_meta_pat;
12586 
12587 	case 'H':
12588             invert = 1;
12589             /* FALLTHROUGH */
12590 	case 'h':
12591 	    arg = ANYOF_BLANK;
12592             op = POSIXU;
12593             goto join_posix_op_known;
12594 
12595 	case 'V':
12596             invert = 1;
12597             /* FALLTHROUGH */
12598 	case 'v':
12599 	    arg = ANYOF_VERTWS;
12600             op = POSIXU;
12601             goto join_posix_op_known;
12602 
12603 	case 'S':
12604             invert = 1;
12605             /* FALLTHROUGH */
12606 	case 's':
12607             arg = ANYOF_SPACE;
12608 
12609           join_posix:
12610 
12611 	    op = POSIXD + get_regex_charset(RExC_flags);
12612             if (op > POSIXA) {  /* /aa is same as /a */
12613                 op = POSIXA;
12614             }
12615             else if (op == POSIXL) {
12616                 RExC_contains_locale = 1;
12617             }
12618 
12619           join_posix_op_known:
12620 
12621             if (invert) {
12622                 op += NPOSIXD - POSIXD;
12623             }
12624 
12625 	    ret = reg_node(pRExC_state, op);
12626             if (! SIZE_ONLY) {
12627                 FLAGS(ret) = namedclass_to_classnum(arg);
12628             }
12629 
12630 	    *flagp |= HASWIDTH|SIMPLE;
12631             /* FALLTHROUGH */
12632 
12633           finish_meta_pat:
12634 	    nextchar(pRExC_state);
12635             Set_Node_Length(ret, 2); /* MJD */
12636 	    break;
12637 	case 'p':
12638 	case 'P':
12639             RExC_parse--;
12640 
12641             ret = regclass(pRExC_state, flagp,depth+1,
12642                            TRUE, /* means just parse this element */
12643                            FALSE, /* don't allow multi-char folds */
12644                            FALSE, /* don't silence non-portable warnings.  It
12645                                      would be a bug if these returned
12646                                      non-portables */
12647                            (bool) RExC_strict,
12648                            TRUE, /* Allow an optimized regnode result */
12649                            NULL,
12650                            NULL);
12651             if (*flagp & RESTART_PASS1)
12652                 return NULL;
12653             /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if
12654              * multi-char folds are allowed.  */
12655             if (!ret)
12656                 FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
12657                       (UV) *flagp);
12658 
12659             RExC_parse--;
12660 
12661             Set_Node_Offset(ret, parse_start);
12662             Set_Node_Cur_Length(ret, parse_start - 2);
12663             nextchar(pRExC_state);
12664 	    break;
12665         case 'N':
12666             /* Handle \N, \N{} and \N{NAMED SEQUENCE} (the latter meaning the
12667              * \N{...} evaluates to a sequence of more than one code points).
12668              * The function call below returns a regnode, which is our result.
12669              * The parameters cause it to fail if the \N{} evaluates to a
12670              * single code point; we handle those like any other literal.  The
12671              * reason that the multicharacter case is handled here and not as
12672              * part of the EXACtish code is because of quantifiers.  In
12673              * /\N{BLAH}+/, the '+' applies to the whole thing, and doing it
12674              * this way makes that Just Happen. dmq.
12675              * join_exact() will join this up with adjacent EXACTish nodes
12676              * later on, if appropriate. */
12677             ++RExC_parse;
12678             if (grok_bslash_N(pRExC_state,
12679                               &ret,     /* Want a regnode returned */
12680                               NULL,     /* Fail if evaluates to a single code
12681                                            point */
12682                               NULL,     /* Don't need a count of how many code
12683                                            points */
12684                               flagp,
12685                               RExC_strict,
12686                               depth)
12687             ) {
12688                 break;
12689             }
12690 
12691             if (*flagp & RESTART_PASS1)
12692                 return NULL;
12693 
12694             /* Here, evaluates to a single code point.  Go get that */
12695             RExC_parse = parse_start;
12696             goto defchar;
12697 
12698 	case 'k':    /* Handle \k<NAME> and \k'NAME' */
12699       parse_named_seq:
12700         {
12701             char ch;
12702             if (   RExC_parse >= RExC_end - 1
12703                 || ((   ch = RExC_parse[1]) != '<'
12704                                       && ch != '\''
12705                                       && ch != '{'))
12706             {
12707 	        RExC_parse++;
12708 		/* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
12709 	        vFAIL2("Sequence %.2s... not terminated",parse_start);
12710 	    } else {
12711 		RExC_parse += 2;
12712                 ret = handle_named_backref(pRExC_state,
12713                                            flagp,
12714                                            parse_start,
12715                                            (ch == '<')
12716                                            ? '>'
12717                                            : (ch == '{')
12718                                              ? '}'
12719                                              : '\'');
12720             }
12721             break;
12722 	}
12723 	case 'g':
12724 	case '1': case '2': case '3': case '4':
12725 	case '5': case '6': case '7': case '8': case '9':
12726 	    {
12727 		I32 num;
12728 		bool hasbrace = 0;
12729 
12730 		if (*RExC_parse == 'g') {
12731                     bool isrel = 0;
12732 
12733 		    RExC_parse++;
12734 		    if (*RExC_parse == '{') {
12735 		        RExC_parse++;
12736 		        hasbrace = 1;
12737 		    }
12738 		    if (*RExC_parse == '-') {
12739 		        RExC_parse++;
12740 		        isrel = 1;
12741 		    }
12742 		    if (hasbrace && !isDIGIT(*RExC_parse)) {
12743 		        if (isrel) RExC_parse--;
12744                         RExC_parse -= 2;
12745 		        goto parse_named_seq;
12746                     }
12747 
12748                     if (RExC_parse >= RExC_end) {
12749                         goto unterminated_g;
12750                     }
12751                     num = S_backref_value(RExC_parse);
12752                     if (num == 0)
12753                         vFAIL("Reference to invalid group 0");
12754                     else if (num == I32_MAX) {
12755                          if (isDIGIT(*RExC_parse))
12756 			    vFAIL("Reference to nonexistent group");
12757                         else
12758                           unterminated_g:
12759                             vFAIL("Unterminated \\g... pattern");
12760                     }
12761 
12762                     if (isrel) {
12763                         num = RExC_npar - num;
12764                         if (num < 1)
12765                             vFAIL("Reference to nonexistent or unclosed group");
12766                     }
12767                 }
12768                 else {
12769                     num = S_backref_value(RExC_parse);
12770                     /* bare \NNN might be backref or octal - if it is larger
12771                      * than or equal RExC_npar then it is assumed to be an
12772                      * octal escape. Note RExC_npar is +1 from the actual
12773                      * number of parens. */
12774                     /* Note we do NOT check if num == I32_MAX here, as that is
12775                      * handled by the RExC_npar check */
12776 
12777                     if (
12778                         /* any numeric escape < 10 is always a backref */
12779                         num > 9
12780                         /* any numeric escape < RExC_npar is a backref */
12781                         && num >= RExC_npar
12782                         /* cannot be an octal escape if it starts with 8 */
12783                         && *RExC_parse != '8'
12784                         /* cannot be an octal escape it it starts with 9 */
12785                         && *RExC_parse != '9'
12786                     )
12787                     {
12788                         /* Probably not a backref, instead likely to be an
12789                          * octal character escape, e.g. \35 or \777.
12790                          * The above logic should make it obvious why using
12791                          * octal escapes in patterns is problematic. - Yves */
12792                         RExC_parse = parse_start;
12793                         goto defchar;
12794                     }
12795                 }
12796 
12797                 /* At this point RExC_parse points at a numeric escape like
12798                  * \12 or \88 or something similar, which we should NOT treat
12799                  * as an octal escape. It may or may not be a valid backref
12800                  * escape. For instance \88888888 is unlikely to be a valid
12801                  * backref. */
12802                 while (isDIGIT(*RExC_parse))
12803                     RExC_parse++;
12804                 if (hasbrace) {
12805                     if (*RExC_parse != '}')
12806                         vFAIL("Unterminated \\g{...} pattern");
12807                     RExC_parse++;
12808                 }
12809                 if (!SIZE_ONLY) {
12810                     if (num > (I32)RExC_rx->nparens)
12811                         vFAIL("Reference to nonexistent group");
12812                 }
12813                 RExC_sawback = 1;
12814                 ret = reganode(pRExC_state,
12815                                ((! FOLD)
12816                                  ? REF
12817                                  : (ASCII_FOLD_RESTRICTED)
12818                                    ? REFFA
12819                                    : (AT_LEAST_UNI_SEMANTICS)
12820                                      ? REFFU
12821                                      : (LOC)
12822                                        ? REFFL
12823                                        : REFF),
12824                                 num);
12825                 *flagp |= HASWIDTH;
12826 
12827                 /* override incorrect value set in reganode MJD */
12828                 Set_Node_Offset(ret, parse_start);
12829                 Set_Node_Cur_Length(ret, parse_start-1);
12830                 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
12831                                         FALSE /* Don't force to /x */ );
12832 	    }
12833 	    break;
12834 	case '\0':
12835 	    if (RExC_parse >= RExC_end)
12836 		FAIL("Trailing \\");
12837 	    /* FALLTHROUGH */
12838 	default:
12839 	    /* Do not generate "unrecognized" warnings here, we fall
12840 	       back into the quick-grab loop below */
12841             RExC_parse = parse_start;
12842 	    goto defchar;
12843 	} /* end of switch on a \foo sequence */
12844 	break;
12845 
12846     case '#':
12847 
12848         /* '#' comments should have been spaced over before this function was
12849          * called */
12850         assert((RExC_flags & RXf_PMf_EXTENDED) == 0);
12851 	/*
12852         if (RExC_flags & RXf_PMf_EXTENDED) {
12853 	    RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
12854 	    if (RExC_parse < RExC_end)
12855 		goto tryagain;
12856 	}
12857         */
12858 
12859 	/* FALLTHROUGH */
12860 
12861     default:
12862 	  defchar: {
12863 
12864             /* Here, we have determined that the next thing is probably a
12865              * literal character.  RExC_parse points to the first byte of its
12866              * definition.  (It still may be an escape sequence that evaluates
12867              * to a single character) */
12868 
12869 	    STRLEN len = 0;
12870 	    UV ender = 0;
12871 	    char *p;
12872 	    char *s;
12873 #define MAX_NODE_STRING_SIZE 127
12874 	    char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
12875 	    char *s0;
12876 	    U8 upper_parse = MAX_NODE_STRING_SIZE;
12877             U8 node_type = compute_EXACTish(pRExC_state);
12878             bool next_is_quantifier;
12879             char * oldp = NULL;
12880 
12881             /* We can convert EXACTF nodes to EXACTFU if they contain only
12882              * characters that match identically regardless of the target
12883              * string's UTF8ness.  The reason to do this is that EXACTF is not
12884              * trie-able, EXACTFU is.
12885              *
12886              * Similarly, we can convert EXACTFL nodes to EXACTFLU8 if they
12887              * contain only above-Latin1 characters (hence must be in UTF8),
12888              * which don't participate in folds with Latin1-range characters,
12889              * as the latter's folds aren't known until runtime.  (We don't
12890              * need to figure this out until pass 2) */
12891             bool maybe_exactfu = PASS2
12892                                && (node_type == EXACTF || node_type == EXACTFL);
12893 
12894             /* If a folding node contains only code points that don't
12895              * participate in folds, it can be changed into an EXACT node,
12896              * which allows the optimizer more things to look for */
12897             bool maybe_exact;
12898 
12899 	    ret = reg_node(pRExC_state, node_type);
12900 
12901             /* In pass1, folded, we use a temporary buffer instead of the
12902              * actual node, as the node doesn't exist yet */
12903 	    s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
12904 
12905             s0 = s;
12906 
12907 	  reparse:
12908 
12909             /* We look for the EXACTFish to EXACT node optimizaton only if
12910              * folding.  (And we don't need to figure this out until pass 2).
12911              * XXX It might actually make sense to split the node into portions
12912              * that are exact and ones that aren't, so that we could later use
12913              * the exact ones to find the longest fixed and floating strings.
12914              * One would want to join them back into a larger node.  One could
12915              * use a pseudo regnode like 'EXACT_ORIG_FOLD' */
12916             maybe_exact = FOLD && PASS2;
12917 
12918 	    /* XXX The node can hold up to 255 bytes, yet this only goes to
12919              * 127.  I (khw) do not know why.  Keeping it somewhat less than
12920              * 255 allows us to not have to worry about overflow due to
12921              * converting to utf8 and fold expansion, but that value is
12922              * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
12923              * split up by this limit into a single one using the real max of
12924              * 255.  Even at 127, this breaks under rare circumstances.  If
12925              * folding, we do not want to split a node at a character that is a
12926              * non-final in a multi-char fold, as an input string could just
12927              * happen to want to match across the node boundary.  The join
12928              * would solve that problem if the join actually happens.  But a
12929              * series of more than two nodes in a row each of 127 would cause
12930              * the first join to succeed to get to 254, but then there wouldn't
12931              * be room for the next one, which could at be one of those split
12932              * multi-char folds.  I don't know of any fool-proof solution.  One
12933              * could back off to end with only a code point that isn't such a
12934              * non-final, but it is possible for there not to be any in the
12935              * entire node. */
12936 
12937             assert(   ! UTF     /* Is at the beginning of a character */
12938                    || UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
12939                    || UTF8_IS_START(UCHARAT(RExC_parse)));
12940 
12941 	    for (p = RExC_parse;
12942 	         len < upper_parse && p < RExC_end;
12943 	         len++)
12944 	    {
12945 		oldp = p;
12946 
12947                 /* White space has already been ignored */
12948                 assert(   (RExC_flags & RXf_PMf_EXTENDED) == 0
12949                        || ! is_PATWS_safe((p), RExC_end, UTF));
12950 
12951 		switch ((U8)*p) {
12952 		case '^':
12953 		case '$':
12954 		case '.':
12955 		case '[':
12956 		case '(':
12957 		case ')':
12958 		case '|':
12959 		    goto loopdone;
12960 		case '\\':
12961 		    /* Literal Escapes Switch
12962 
12963 		       This switch is meant to handle escape sequences that
12964 		       resolve to a literal character.
12965 
12966 		       Every escape sequence that represents something
12967 		       else, like an assertion or a char class, is handled
12968 		       in the switch marked 'Special Escapes' above in this
12969 		       routine, but also has an entry here as anything that
12970 		       isn't explicitly mentioned here will be treated as
12971 		       an unescaped equivalent literal.
12972 		    */
12973 
12974 		    switch ((U8)*++p) {
12975 		    /* These are all the special escapes. */
12976 		    case 'A':             /* Start assertion */
12977 		    case 'b': case 'B':   /* Word-boundary assertion*/
12978 		    case 'C':             /* Single char !DANGEROUS! */
12979 		    case 'd': case 'D':   /* digit class */
12980 		    case 'g': case 'G':   /* generic-backref, pos assertion */
12981 		    case 'h': case 'H':   /* HORIZWS */
12982 		    case 'k': case 'K':   /* named backref, keep marker */
12983 		    case 'p': case 'P':   /* Unicode property */
12984 		              case 'R':   /* LNBREAK */
12985 		    case 's': case 'S':   /* space class */
12986 		    case 'v': case 'V':   /* VERTWS */
12987 		    case 'w': case 'W':   /* word class */
12988                     case 'X':             /* eXtended Unicode "combining
12989                                              character sequence" */
12990 		    case 'z': case 'Z':   /* End of line/string assertion */
12991 			--p;
12992 			goto loopdone;
12993 
12994 	            /* Anything after here is an escape that resolves to a
12995 	               literal. (Except digits, which may or may not)
12996 	             */
12997 		    case 'n':
12998 			ender = '\n';
12999 			p++;
13000 			break;
13001 		    case 'N': /* Handle a single-code point named character. */
13002                         RExC_parse = p + 1;
13003                         if (! grok_bslash_N(pRExC_state,
13004                                             NULL,   /* Fail if evaluates to
13005                                                        anything other than a
13006                                                        single code point */
13007                                             &ender, /* The returned single code
13008                                                        point */
13009                                             NULL,   /* Don't need a count of
13010                                                        how many code points */
13011                                             flagp,
13012                                             RExC_strict,
13013                                             depth)
13014                         ) {
13015                             if (*flagp & NEED_UTF8)
13016                                 FAIL("panic: grok_bslash_N set NEED_UTF8");
13017                             if (*flagp & RESTART_PASS1)
13018                                 return NULL;
13019 
13020                             /* Here, it wasn't a single code point.  Go close
13021                              * up this EXACTish node.  The switch() prior to
13022                              * this switch handles the other cases */
13023                             RExC_parse = p = oldp;
13024                             goto loopdone;
13025                         }
13026                         p = RExC_parse;
13027                         RExC_parse = parse_start;
13028                         if (ender > 0xff) {
13029                             REQUIRE_UTF8(flagp);
13030                         }
13031                         break;
13032 		    case 'r':
13033 			ender = '\r';
13034 			p++;
13035 			break;
13036 		    case 't':
13037 			ender = '\t';
13038 			p++;
13039 			break;
13040 		    case 'f':
13041 			ender = '\f';
13042 			p++;
13043 			break;
13044 		    case 'e':
13045 			ender = ESC_NATIVE;
13046 			p++;
13047 			break;
13048 		    case 'a':
13049 			ender = '\a';
13050 			p++;
13051 			break;
13052 		    case 'o':
13053 			{
13054 			    UV result;
13055 			    const char* error_msg;
13056 
13057 			    bool valid = grok_bslash_o(&p,
13058 						       &result,
13059 						       &error_msg,
13060 						       PASS2, /* out warnings */
13061                                                        (bool) RExC_strict,
13062                                                        TRUE, /* Output warnings
13063                                                                 for non-
13064                                                                 portables */
13065                                                        UTF);
13066 			    if (! valid) {
13067 				RExC_parse = p;	/* going to die anyway; point
13068 						   to exact spot of failure */
13069 				vFAIL(error_msg);
13070 			    }
13071                             ender = result;
13072 			    if (IN_ENCODING && ender < 0x100) {
13073 				goto recode_encoding;
13074 			    }
13075 			    if (ender > 0xff) {
13076 				REQUIRE_UTF8(flagp);
13077 			    }
13078 			    break;
13079 			}
13080 		    case 'x':
13081 			{
13082                             UV result = UV_MAX; /* initialize to erroneous
13083                                                    value */
13084 			    const char* error_msg;
13085 
13086 			    bool valid = grok_bslash_x(&p,
13087 						       &result,
13088 						       &error_msg,
13089 						       PASS2, /* out warnings */
13090                                                        (bool) RExC_strict,
13091                                                        TRUE, /* Silence warnings
13092                                                                 for non-
13093                                                                 portables */
13094                                                        UTF);
13095 			    if (! valid) {
13096 				RExC_parse = p;	/* going to die anyway; point
13097 						   to exact spot of failure */
13098 				vFAIL(error_msg);
13099 			    }
13100                             ender = result;
13101 
13102                             if (ender < 0x100) {
13103 #ifdef EBCDIC
13104                                 if (RExC_recode_x_to_native) {
13105                                     ender = LATIN1_TO_NATIVE(ender);
13106                                 }
13107                                 else
13108 #endif
13109                                 if (IN_ENCODING) {
13110                                     goto recode_encoding;
13111                                 }
13112 			    }
13113                             else {
13114 				REQUIRE_UTF8(flagp);
13115 			    }
13116 			    break;
13117 			}
13118 		    case 'c':
13119 			p++;
13120 			ender = grok_bslash_c(*p++, PASS2);
13121 			break;
13122                     case '8': case '9': /* must be a backreference */
13123                         --p;
13124                         /* we have an escape like \8 which cannot be an octal escape
13125                          * so we exit the loop, and let the outer loop handle this
13126                          * escape which may or may not be a legitimate backref. */
13127                         goto loopdone;
13128                     case '1': case '2': case '3':case '4':
13129 		    case '5': case '6': case '7':
13130                         /* When we parse backslash escapes there is ambiguity
13131                          * between backreferences and octal escapes. Any escape
13132                          * from \1 - \9 is a backreference, any multi-digit
13133                          * escape which does not start with 0 and which when
13134                          * evaluated as decimal could refer to an already
13135                          * parsed capture buffer is a back reference. Anything
13136                          * else is octal.
13137                          *
13138                          * Note this implies that \118 could be interpreted as
13139                          * 118 OR as "\11" . "8" depending on whether there
13140                          * were 118 capture buffers defined already in the
13141                          * pattern.  */
13142 
13143                         /* NOTE, RExC_npar is 1 more than the actual number of
13144                          * parens we have seen so far, hence the < RExC_npar below. */
13145 
13146                         if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar)
13147                         {  /* Not to be treated as an octal constant, go
13148                                    find backref */
13149                             --p;
13150                             goto loopdone;
13151                         }
13152                         /* FALLTHROUGH */
13153                     case '0':
13154 			{
13155 			    I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
13156 			    STRLEN numlen = 3;
13157 			    ender = grok_oct(p, &numlen, &flags, NULL);
13158 			    if (ender > 0xff) {
13159 				REQUIRE_UTF8(flagp);
13160 			    }
13161 			    p += numlen;
13162                             if (PASS2   /* like \08, \178 */
13163                                 && numlen < 3
13164                                 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
13165                             {
13166 				reg_warn_non_literal_string(
13167                                          p + 1,
13168                                          form_short_octal_warning(p, numlen));
13169                             }
13170 			}
13171 			if (IN_ENCODING && ender < 0x100)
13172 			    goto recode_encoding;
13173 			break;
13174 		      recode_encoding:
13175 			if (! RExC_override_recoding) {
13176 			    SV* enc = _get_encoding();
13177 			    ender = reg_recode((U8)ender, &enc);
13178 			    if (!enc && PASS2)
13179 				ckWARNreg(p, "Invalid escape in the specified encoding");
13180 			    REQUIRE_UTF8(flagp);
13181 			}
13182 			break;
13183 		    case '\0':
13184 			if (p >= RExC_end)
13185 			    FAIL("Trailing \\");
13186 			/* FALLTHROUGH */
13187 		    default:
13188 			if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
13189 			    /* Include any left brace following the alpha to emphasize
13190 			     * that it could be part of an escape at some point
13191 			     * in the future */
13192 			    int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
13193 			    ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
13194 			}
13195 			goto normal_default;
13196 		    } /* End of switch on '\' */
13197 		    break;
13198 		case '{':
13199 		    /* Currently we don't warn when the lbrace is at the start
13200 		     * of a construct.  This catches it in the middle of a
13201 		     * literal string, or when it's the first thing after
13202 		     * something like "\b" */
13203 		    if (! SIZE_ONLY
13204 			&& (len || (p > RExC_start && isALPHA_A(*(p -1)))))
13205 		    {
13206 			ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
13207 		    }
13208 		    /*FALLTHROUGH*/
13209 		default:    /* A literal character */
13210 		  normal_default:
13211 		    if (! UTF8_IS_INVARIANT(*p) && UTF) {
13212 			STRLEN numlen;
13213 			ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
13214 					       &numlen, UTF8_ALLOW_DEFAULT);
13215 			p += numlen;
13216 		    }
13217 		    else
13218 			ender = (U8) *p++;
13219 		    break;
13220 		} /* End of switch on the literal */
13221 
13222 		/* Here, have looked at the literal character and <ender>
13223                  * contains its ordinal, <p> points to the character after it.
13224                  * We need to check if the next non-ignored thing is a
13225                  * quantifier.  Move <p> to after anything that should be
13226                  * ignored, which, as a side effect, positions <p> for the next
13227                  * loop iteration */
13228                 skip_to_be_ignored_text(pRExC_state, &p,
13229                                         FALSE /* Don't force to /x */ );
13230 
13231                 /* If the next thing is a quantifier, it applies to this
13232                  * character only, which means that this character has to be in
13233                  * its own node and can't just be appended to the string in an
13234                  * existing node, so if there are already other characters in
13235                  * the node, close the node with just them, and set up to do
13236                  * this character again next time through, when it will be the
13237                  * only thing in its new node */
13238                 if ((next_is_quantifier = (   LIKELY(p < RExC_end)
13239                                            && UNLIKELY(ISMULT2(p))))
13240                     && LIKELY(len))
13241 		{
13242                     p = oldp;
13243                     goto loopdone;
13244                 }
13245 
13246                 /* Ready to add 'ender' to the node */
13247 
13248                 if (! FOLD) {  /* The simple case, just append the literal */
13249 
13250                     /* In the sizing pass, we need only the size of the
13251                      * character we are appending, hence we can delay getting
13252                      * its representation until PASS2. */
13253                     if (SIZE_ONLY) {
13254                         if (UTF) {
13255                             const STRLEN unilen = UVCHR_SKIP(ender);
13256                             s += unilen;
13257 
13258                             /* We have to subtract 1 just below (and again in
13259                              * the corresponding PASS2 code) because the loop
13260                              * increments <len> each time, as all but this path
13261                              * (and one other) through it add a single byte to
13262                              * the EXACTish node.  But these paths would change
13263                              * len to be the correct final value, so cancel out
13264                              * the increment that follows */
13265                             len += unilen - 1;
13266                         }
13267                         else {
13268                             s++;
13269                         }
13270                     } else { /* PASS2 */
13271                       not_fold_common:
13272                         if (UTF) {
13273                             U8 * new_s = uvchr_to_utf8((U8*)s, ender);
13274                             len += (char *) new_s - s - 1;
13275                             s = (char *) new_s;
13276                         }
13277                         else {
13278                             *(s++) = (char) ender;
13279                         }
13280                     }
13281                 }
13282                 else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
13283 
13284                     /* Here are folding under /l, and the code point is
13285                      * problematic.  First, we know we can't simplify things */
13286                     maybe_exact = FALSE;
13287                     maybe_exactfu = FALSE;
13288 
13289                     /* A problematic code point in this context means that its
13290                      * fold isn't known until runtime, so we can't fold it now.
13291                      * (The non-problematic code points are the above-Latin1
13292                      * ones that fold to also all above-Latin1.  Their folds
13293                      * don't vary no matter what the locale is.) But here we
13294                      * have characters whose fold depends on the locale.
13295                      * Unlike the non-folding case above, we have to keep track
13296                      * of these in the sizing pass, so that we can make sure we
13297                      * don't split too-long nodes in the middle of a potential
13298                      * multi-char fold.  And unlike the regular fold case
13299                      * handled in the else clauses below, we don't actually
13300                      * fold and don't have special cases to consider.  What we
13301                      * do for both passes is the PASS2 code for non-folding */
13302                     goto not_fold_common;
13303                 }
13304                 else /* A regular FOLD code point */
13305                     if (! (   UTF
13306 #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
13307    || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
13308                                       || UNICODE_DOT_DOT_VERSION > 0)
13309                             /* See comments for join_exact() as to why we fold
13310                              * this non-UTF at compile time */
13311                             || (   node_type == EXACTFU
13312                                 && ender == LATIN_SMALL_LETTER_SHARP_S)
13313 #endif
13314                 )) {
13315                     /* Here, are folding and are not UTF-8 encoded; therefore
13316                      * the character must be in the range 0-255, and is not /l
13317                      * (Not /l because we already handled these under /l in
13318                      * is_PROBLEMATIC_LOCALE_FOLD_cp) */
13319                     if (IS_IN_SOME_FOLD_L1(ender)) {
13320                         maybe_exact = FALSE;
13321 
13322                         /* See if the character's fold differs between /d and
13323                          * /u.  This includes the multi-char fold SHARP S to
13324                          * 'ss' */
13325                         if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
13326                             RExC_seen_unfolded_sharp_s = 1;
13327                             maybe_exactfu = FALSE;
13328                         }
13329                         else if (maybe_exactfu
13330                             && (PL_fold[ender] != PL_fold_latin1[ender]
13331 #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
13332    || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
13333                                       || UNICODE_DOT_DOT_VERSION > 0)
13334                                 || (   len > 0
13335                                     && isALPHA_FOLD_EQ(ender, 's')
13336                                     && isALPHA_FOLD_EQ(*(s-1), 's'))
13337 #endif
13338                         )) {
13339                             maybe_exactfu = FALSE;
13340                         }
13341                     }
13342 
13343                     /* Even when folding, we store just the input character, as
13344                      * we have an array that finds its fold quickly */
13345                     *(s++) = (char) ender;
13346                 }
13347                 else {  /* FOLD, and UTF (or sharp s) */
13348                     /* Unlike the non-fold case, we do actually have to
13349                      * calculate the results here in pass 1.  This is for two
13350                      * reasons, the folded length may be longer than the
13351                      * unfolded, and we have to calculate how many EXACTish
13352                      * nodes it will take; and we may run out of room in a node
13353                      * in the middle of a potential multi-char fold, and have
13354                      * to back off accordingly.  */
13355 
13356                     UV folded;
13357                     if (isASCII_uni(ender)) {
13358                         folded = toFOLD(ender);
13359                         *(s)++ = (U8) folded;
13360                     }
13361                     else {
13362                         STRLEN foldlen;
13363 
13364                         folded = _to_uni_fold_flags(
13365                                      ender,
13366                                      (U8 *) s,
13367                                      &foldlen,
13368                                      FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
13369                                                         ? FOLD_FLAGS_NOMIX_ASCII
13370                                                         : 0));
13371                         s += foldlen;
13372 
13373                         /* The loop increments <len> each time, as all but this
13374                          * path (and one other) through it add a single byte to
13375                          * the EXACTish node.  But this one has changed len to
13376                          * be the correct final value, so subtract one to
13377                          * cancel out the increment that follows */
13378                         len += foldlen - 1;
13379                     }
13380                     /* If this node only contains non-folding code points so
13381                      * far, see if this new one is also non-folding */
13382                     if (maybe_exact) {
13383                         if (folded != ender) {
13384                             maybe_exact = FALSE;
13385                         }
13386                         else {
13387                             /* Here the fold is the original; we have to check
13388                              * further to see if anything folds to it */
13389                             if (_invlist_contains_cp(PL_utf8_foldable,
13390                                                         ender))
13391                             {
13392                                 maybe_exact = FALSE;
13393                             }
13394                         }
13395                     }
13396                     ender = folded;
13397 		}
13398 
13399 		if (next_is_quantifier) {
13400 
13401                     /* Here, the next input is a quantifier, and to get here,
13402                      * the current character is the only one in the node.
13403                      * Also, here <len> doesn't include the final byte for this
13404                      * character */
13405                     len++;
13406                     goto loopdone;
13407 		}
13408 
13409 	    } /* End of loop through literal characters */
13410 
13411             /* Here we have either exhausted the input or ran out of room in
13412              * the node.  (If we encountered a character that can't be in the
13413              * node, transfer is made directly to <loopdone>, and so we
13414              * wouldn't have fallen off the end of the loop.)  In the latter
13415              * case, we artificially have to split the node into two, because
13416              * we just don't have enough space to hold everything.  This
13417              * creates a problem if the final character participates in a
13418              * multi-character fold in the non-final position, as a match that
13419              * should have occurred won't, due to the way nodes are matched,
13420              * and our artificial boundary.  So back off until we find a non-
13421              * problematic character -- one that isn't at the beginning or
13422              * middle of such a fold.  (Either it doesn't participate in any
13423              * folds, or appears only in the final position of all the folds it
13424              * does participate in.)  A better solution with far fewer false
13425              * positives, and that would fill the nodes more completely, would
13426              * be to actually have available all the multi-character folds to
13427              * test against, and to back-off only far enough to be sure that
13428              * this node isn't ending with a partial one.  <upper_parse> is set
13429              * further below (if we need to reparse the node) to include just
13430              * up through that final non-problematic character that this code
13431              * identifies, so when it is set to less than the full node, we can
13432              * skip the rest of this */
13433             if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
13434 
13435                 const STRLEN full_len = len;
13436 
13437 		assert(len >= MAX_NODE_STRING_SIZE);
13438 
13439                 /* Here, <s> points to the final byte of the final character.
13440                  * Look backwards through the string until find a non-
13441                  * problematic character */
13442 
13443 		if (! UTF) {
13444 
13445                     /* This has no multi-char folds to non-UTF characters */
13446                     if (ASCII_FOLD_RESTRICTED) {
13447                         goto loopdone;
13448                     }
13449 
13450                     while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
13451                     len = s - s0 + 1;
13452 		}
13453                 else {
13454                     if (!  PL_NonL1NonFinalFold) {
13455                         PL_NonL1NonFinalFold = _new_invlist_C_array(
13456                                         NonL1_Perl_Non_Final_Folds_invlist);
13457                     }
13458 
13459                     /* Point to the first byte of the final character */
13460                     s = (char *) utf8_hop((U8 *) s, -1);
13461 
13462                     while (s >= s0) {   /* Search backwards until find
13463                                            non-problematic char */
13464                         if (UTF8_IS_INVARIANT(*s)) {
13465 
13466                             /* There are no ascii characters that participate
13467                              * in multi-char folds under /aa.  In EBCDIC, the
13468                              * non-ascii invariants are all control characters,
13469                              * so don't ever participate in any folds. */
13470                             if (ASCII_FOLD_RESTRICTED
13471                                 || ! IS_NON_FINAL_FOLD(*s))
13472                             {
13473                                 break;
13474                             }
13475                         }
13476                         else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
13477                             if (! IS_NON_FINAL_FOLD(EIGHT_BIT_UTF8_TO_NATIVE(
13478                                                                   *s, *(s+1))))
13479                             {
13480                                 break;
13481                             }
13482                         }
13483                         else if (! _invlist_contains_cp(
13484                                         PL_NonL1NonFinalFold,
13485                                         valid_utf8_to_uvchr((U8 *) s, NULL)))
13486                         {
13487                             break;
13488                         }
13489 
13490                         /* Here, the current character is problematic in that
13491                          * it does occur in the non-final position of some
13492                          * fold, so try the character before it, but have to
13493                          * special case the very first byte in the string, so
13494                          * we don't read outside the string */
13495                         s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
13496                     } /* End of loop backwards through the string */
13497 
13498                     /* If there were only problematic characters in the string,
13499                      * <s> will point to before s0, in which case the length
13500                      * should be 0, otherwise include the length of the
13501                      * non-problematic character just found */
13502                     len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
13503 		}
13504 
13505                 /* Here, have found the final character, if any, that is
13506                  * non-problematic as far as ending the node without splitting
13507                  * it across a potential multi-char fold.  <len> contains the
13508                  * number of bytes in the node up-to and including that
13509                  * character, or is 0 if there is no such character, meaning
13510                  * the whole node contains only problematic characters.  In
13511                  * this case, give up and just take the node as-is.  We can't
13512                  * do any better */
13513                 if (len == 0) {
13514                     len = full_len;
13515 
13516                     /* If the node ends in an 's' we make sure it stays EXACTF,
13517                      * as if it turns into an EXACTFU, it could later get
13518                      * joined with another 's' that would then wrongly match
13519                      * the sharp s */
13520                     if (maybe_exactfu && isALPHA_FOLD_EQ(ender, 's'))
13521                     {
13522                         maybe_exactfu = FALSE;
13523                     }
13524                 } else {
13525 
13526                     /* Here, the node does contain some characters that aren't
13527                      * problematic.  If one such is the final character in the
13528                      * node, we are done */
13529                     if (len == full_len) {
13530                         goto loopdone;
13531                     }
13532                     else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
13533 
13534                         /* If the final character is problematic, but the
13535                          * penultimate is not, back-off that last character to
13536                          * later start a new node with it */
13537                         p = oldp;
13538                         goto loopdone;
13539                     }
13540 
13541                     /* Here, the final non-problematic character is earlier
13542                      * in the input than the penultimate character.  What we do
13543                      * is reparse from the beginning, going up only as far as
13544                      * this final ok one, thus guaranteeing that the node ends
13545                      * in an acceptable character.  The reason we reparse is
13546                      * that we know how far in the character is, but we don't
13547                      * know how to correlate its position with the input parse.
13548                      * An alternate implementation would be to build that
13549                      * correlation as we go along during the original parse,
13550                      * but that would entail extra work for every node, whereas
13551                      * this code gets executed only when the string is too
13552                      * large for the node, and the final two characters are
13553                      * problematic, an infrequent occurrence.  Yet another
13554                      * possible strategy would be to save the tail of the
13555                      * string, and the next time regatom is called, initialize
13556                      * with that.  The problem with this is that unless you
13557                      * back off one more character, you won't be guaranteed
13558                      * regatom will get called again, unless regbranch,
13559                      * regpiece ... are also changed.  If you do back off that
13560                      * extra character, so that there is input guaranteed to
13561                      * force calling regatom, you can't handle the case where
13562                      * just the first character in the node is acceptable.  I
13563                      * (khw) decided to try this method which doesn't have that
13564                      * pitfall; if performance issues are found, we can do a
13565                      * combination of the current approach plus that one */
13566                     upper_parse = len;
13567                     len = 0;
13568                     s = s0;
13569                     goto reparse;
13570                 }
13571 	    }   /* End of verifying node ends with an appropriate char */
13572 
13573           loopdone:   /* Jumped to when encounters something that shouldn't be
13574                          in the node */
13575 
13576             /* I (khw) don't know if you can get here with zero length, but the
13577              * old code handled this situation by creating a zero-length EXACT
13578              * node.  Might as well be NOTHING instead */
13579             if (len == 0) {
13580                 OP(ret) = NOTHING;
13581             }
13582             else {
13583                 if (FOLD) {
13584                     /* If 'maybe_exact' is still set here, means there are no
13585                      * code points in the node that participate in folds;
13586                      * similarly for 'maybe_exactfu' and code points that match
13587                      * differently depending on UTF8ness of the target string
13588                      * (for /u), or depending on locale for /l */
13589                     if (maybe_exact) {
13590                         OP(ret) = (LOC)
13591                                   ? EXACTL
13592                                   : EXACT;
13593                     }
13594                     else if (maybe_exactfu) {
13595                         OP(ret) = (LOC)
13596                                   ? EXACTFLU8
13597                                   : EXACTFU;
13598                     }
13599                 }
13600                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
13601                                            FALSE /* Don't look to see if could
13602                                                     be turned into an EXACT
13603                                                     node, as we have already
13604                                                     computed that */
13605                                           );
13606             }
13607 
13608 	    RExC_parse = p - 1;
13609             Set_Node_Cur_Length(ret, parse_start);
13610 	    RExC_parse = p;
13611             skip_to_be_ignored_text(pRExC_state, &RExC_parse,
13612                                     FALSE /* Don't force to /x */ );
13613 	    {
13614 		/* len is STRLEN which is unsigned, need to copy to signed */
13615 		IV iv = len;
13616 		if (iv < 0)
13617 		    vFAIL("Internal disaster");
13618 	    }
13619 
13620 	} /* End of label 'defchar:' */
13621 	break;
13622     } /* End of giant switch on input character */
13623 
13624     return(ret);
13625 }
13626 
13627 
13628 STATIC void
13629 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
13630 {
13631     /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'.  It
13632      * sets up the bitmap and any flags, removing those code points from the
13633      * inversion list, setting it to NULL should it become completely empty */
13634 
13635     PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
13636     assert(PL_regkind[OP(node)] == ANYOF);
13637 
13638     ANYOF_BITMAP_ZERO(node);
13639     if (*invlist_ptr) {
13640 
13641 	/* This gets set if we actually need to modify things */
13642 	bool change_invlist = FALSE;
13643 
13644 	UV start, end;
13645 
13646 	/* Start looking through *invlist_ptr */
13647 	invlist_iterinit(*invlist_ptr);
13648 	while (invlist_iternext(*invlist_ptr, &start, &end)) {
13649 	    UV high;
13650 	    int i;
13651 
13652             if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) {
13653                 ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP;
13654             }
13655 
13656 	    /* Quit if are above what we should change */
13657 	    if (start >= NUM_ANYOF_CODE_POINTS) {
13658 		break;
13659 	    }
13660 
13661 	    change_invlist = TRUE;
13662 
13663 	    /* Set all the bits in the range, up to the max that we are doing */
13664 	    high = (end < NUM_ANYOF_CODE_POINTS - 1)
13665                    ? end
13666                    : NUM_ANYOF_CODE_POINTS - 1;
13667 	    for (i = start; i <= (int) high; i++) {
13668 		if (! ANYOF_BITMAP_TEST(node, i)) {
13669 		    ANYOF_BITMAP_SET(node, i);
13670 		}
13671 	    }
13672 	}
13673 	invlist_iterfinish(*invlist_ptr);
13674 
13675         /* Done with loop; remove any code points that are in the bitmap from
13676          * *invlist_ptr; similarly for code points above the bitmap if we have
13677          * a flag to match all of them anyways */
13678 	if (change_invlist) {
13679 	    _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
13680 	}
13681         if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
13682 	    _invlist_intersection(*invlist_ptr, PL_InBitmap, invlist_ptr);
13683 	}
13684 
13685 	/* If have completely emptied it, remove it completely */
13686 	if (_invlist_len(*invlist_ptr) == 0) {
13687 	    SvREFCNT_dec_NN(*invlist_ptr);
13688 	    *invlist_ptr = NULL;
13689 	}
13690     }
13691 }
13692 
13693 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
13694    Character classes ([:foo:]) can also be negated ([:^foo:]).
13695    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
13696    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
13697    but trigger failures because they are currently unimplemented. */
13698 
13699 #define POSIXCC_DONE(c)   ((c) == ':')
13700 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
13701 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
13702 #define MAYBE_POSIXCC(c) (POSIXCC(c) || (c) == '^' || (c) == ';')
13703 
13704 #define WARNING_PREFIX              "Assuming NOT a POSIX class since "
13705 #define NO_BLANKS_POSIX_WARNING     "no blanks are allowed in one"
13706 #define SEMI_COLON_POSIX_WARNING    "a semi-colon was found instead of a colon"
13707 
13708 #define NOT_MEANT_TO_BE_A_POSIX_CLASS (OOB_NAMEDCLASS - 1)
13709 
13710 /* 'posix_warnings' and 'warn_text' are names of variables in the following
13711  * routine. q.v. */
13712 #define ADD_POSIX_WARNING(p, text)  STMT_START {                            \
13713         if (posix_warnings) {                                               \
13714             if (! RExC_warn_text ) RExC_warn_text = (AV *) sv_2mortal((SV *) newAV()); \
13715             av_push(RExC_warn_text, Perl_newSVpvf(aTHX_                          \
13716                                              WARNING_PREFIX                 \
13717                                              text                           \
13718                                              REPORT_LOCATION,               \
13719                                              REPORT_LOCATION_ARGS(p)));     \
13720         }                                                                   \
13721     } STMT_END
13722 
13723 STATIC int
13724 S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state,
13725 
13726     const char * const s,      /* Where the putative posix class begins.
13727                                   Normally, this is one past the '['.  This
13728                                   parameter exists so it can be somewhere
13729                                   besides RExC_parse. */
13730     char ** updated_parse_ptr, /* Where to set the updated parse pointer, or
13731                                   NULL */
13732     AV ** posix_warnings,      /* Where to place any generated warnings, or
13733                                   NULL */
13734     const bool check_only      /* Don't die if error */
13735 )
13736 {
13737     /* This parses what the caller thinks may be one of the three POSIX
13738      * constructs:
13739      *  1) a character class, like [:blank:]
13740      *  2) a collating symbol, like [. .]
13741      *  3) an equivalence class, like [= =]
13742      * In the latter two cases, it croaks if it finds a syntactically legal
13743      * one, as these are not handled by Perl.
13744      *
13745      * The main purpose is to look for a POSIX character class.  It returns:
13746      *  a) the class number
13747      *      if it is a completely syntactically and semantically legal class.
13748      *      'updated_parse_ptr', if not NULL, is set to point to just after the
13749      *      closing ']' of the class
13750      *  b) OOB_NAMEDCLASS
13751      *      if it appears that one of the three POSIX constructs was meant, but
13752      *      its specification was somehow defective.  'updated_parse_ptr', if
13753      *      not NULL, is set to point to the character just after the end
13754      *      character of the class.  See below for handling of warnings.
13755      *  c) NOT_MEANT_TO_BE_A_POSIX_CLASS
13756      *      if it  doesn't appear that a POSIX construct was intended.
13757      *      'updated_parse_ptr' is not changed.  No warnings nor errors are
13758      *      raised.
13759      *
13760      * In b) there may be errors or warnings generated.  If 'check_only' is
13761      * TRUE, then any errors are discarded.  Warnings are returned to the
13762      * caller via an AV* created into '*posix_warnings' if it is not NULL.  If
13763      * instead it is NULL, warnings are suppressed.  This is done in all
13764      * passes.  The reason for this is that the rest of the parsing is heavily
13765      * dependent on whether this routine found a valid posix class or not.  If
13766      * it did, the closing ']' is absorbed as part of the class.  If no class,
13767      * or an invalid one is found, any ']' will be considered the terminator of
13768      * the outer bracketed character class, leading to very different results.
13769      * In particular, a '(?[ ])' construct will likely have a syntax error if
13770      * the class is parsed other than intended, and this will happen in pass1,
13771      * before the warnings would normally be output.  This mechanism allows the
13772      * caller to output those warnings in pass1 just before dieing, giving a
13773      * much better clue as to what is wrong.
13774      *
13775      * The reason for this function, and its complexity is that a bracketed
13776      * character class can contain just about anything.  But it's easy to
13777      * mistype the very specific posix class syntax but yielding a valid
13778      * regular bracketed class, so it silently gets compiled into something
13779      * quite unintended.
13780      *
13781      * The solution adopted here maintains backward compatibility except that
13782      * it adds a warning if it looks like a posix class was intended but
13783      * improperly specified.  The warning is not raised unless what is input
13784      * very closely resembles one of the 14 legal posix classes.  To do this,
13785      * it uses fuzzy parsing.  It calculates how many single-character edits it
13786      * would take to transform what was input into a legal posix class.  Only
13787      * if that number is quite small does it think that the intention was a
13788      * posix class.  Obviously these are heuristics, and there will be cases
13789      * where it errs on one side or another, and they can be tweaked as
13790      * experience informs.
13791      *
13792      * The syntax for a legal posix class is:
13793      *
13794      * qr/(?xa: \[ : \^? [:lower:]{4,6} : \] )/
13795      *
13796      * What this routine considers syntactically to be an intended posix class
13797      * is this (the comments indicate some restrictions that the pattern
13798      * doesn't show):
13799      *
13800      *  qr/(?x: \[?                         # The left bracket, possibly
13801      *                                      # omitted
13802      *          \h*                         # possibly followed by blanks
13803      *          (?: \^ \h* )?               # possibly a misplaced caret
13804      *          [:;]?                       # The opening class character,
13805      *                                      # possibly omitted.  A typo
13806      *                                      # semi-colon can also be used.
13807      *          \h*
13808      *          \^?                         # possibly a correctly placed
13809      *                                      # caret, but not if there was also
13810      *                                      # a misplaced one
13811      *          \h*
13812      *          .{3,15}                     # The class name.  If there are
13813      *                                      # deviations from the legal syntax,
13814      *                                      # its edit distance must be close
13815      *                                      # to a real class name in order
13816      *                                      # for it to be considered to be
13817      *                                      # an intended posix class.
13818      *          \h*
13819      *          [:punct:]?                  # The closing class character,
13820      *                                      # possibly omitted.  If not a colon
13821      *                                      # nor semi colon, the class name
13822      *                                      # must be even closer to a valid
13823      *                                      # one
13824      *          \h*
13825      *          \]?                         # The right bracket, possibly
13826      *                                      # omitted.
13827      *     )/
13828      *
13829      * In the above, \h must be ASCII-only.
13830      *
13831      * These are heuristics, and can be tweaked as field experience dictates.
13832      * There will be cases when someone didn't intend to specify a posix class
13833      * that this warns as being so.  The goal is to minimize these, while
13834      * maximizing the catching of things intended to be a posix class that
13835      * aren't parsed as such.
13836      */
13837 
13838     const char* p             = s;
13839     const char * const e      = RExC_end;
13840     unsigned complement       = 0;      /* If to complement the class */
13841     bool found_problem        = FALSE;  /* Assume OK until proven otherwise */
13842     bool has_opening_bracket  = FALSE;
13843     bool has_opening_colon    = FALSE;
13844     int class_number          = OOB_NAMEDCLASS; /* Out-of-bounds until find
13845                                                    valid class */
13846     const char * possible_end = NULL;   /* used for a 2nd parse pass */
13847     const char* name_start;             /* ptr to class name first char */
13848 
13849     /* If the number of single-character typos the input name is away from a
13850      * legal name is no more than this number, it is considered to have meant
13851      * the legal name */
13852     int max_distance          = 2;
13853 
13854     /* to store the name.  The size determines the maximum length before we
13855      * decide that no posix class was intended.  Should be at least
13856      * sizeof("alphanumeric") */
13857     UV input_text[15];
13858 
13859     PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX;
13860 
13861     if (posix_warnings && RExC_warn_text)
13862         av_clear(RExC_warn_text);
13863 
13864     if (p >= e) {
13865         return NOT_MEANT_TO_BE_A_POSIX_CLASS;
13866     }
13867 
13868     if (*(p - 1) != '[') {
13869         ADD_POSIX_WARNING(p, "it doesn't start with a '['");
13870         found_problem = TRUE;
13871     }
13872     else {
13873         has_opening_bracket = TRUE;
13874     }
13875 
13876     /* They could be confused and think you can put spaces between the
13877      * components */
13878     if (isBLANK(*p)) {
13879         found_problem = TRUE;
13880 
13881         do {
13882             p++;
13883         } while (p < e && isBLANK(*p));
13884 
13885         ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
13886     }
13887 
13888     /* For [. .] and [= =].  These are quite different internally from [: :],
13889      * so they are handled separately.  */
13890     if (POSIXCC_NOTYET(*p) && p < e - 3) /* 1 for the close, and 1 for the ']'
13891                                             and 1 for at least one char in it
13892                                           */
13893     {
13894         const char open_char  = *p;
13895         const char * temp_ptr = p + 1;
13896 
13897         /* These two constructs are not handled by perl, and if we find a
13898          * syntactically valid one, we croak.  khw, who wrote this code, finds
13899          * this explanation of them very unclear:
13900          * http://pubs.opengroup.org/onlinepubs/009696899/basedefs/xbd_chap09.html
13901          * And searching the rest of the internet wasn't very helpful either.
13902          * It looks like just about any byte can be in these constructs,
13903          * depending on the locale.  But unless the pattern is being compiled
13904          * under /l, which is very rare, Perl runs under the C or POSIX locale.
13905          * In that case, it looks like [= =] isn't allowed at all, and that
13906          * [. .] could be any single code point, but for longer strings the
13907          * constituent characters would have to be the ASCII alphabetics plus
13908          * the minus-hyphen.  Any sensible locale definition would limit itself
13909          * to these.  And any portable one definitely should.  Trying to parse
13910          * the general case is a nightmare (see [perl #127604]).  So, this code
13911          * looks only for interiors of these constructs that match:
13912          *      qr/.|[-\w]{2,}/
13913          * Using \w relaxes the apparent rules a little, without adding much
13914          * danger of mistaking something else for one of these constructs.
13915          *
13916          * [. .] in some implementations described on the internet is usable to
13917          * escape a character that otherwise is special in bracketed character
13918          * classes.  For example [.].] means a literal right bracket instead of
13919          * the ending of the class
13920          *
13921          * [= =] can legitimately contain a [. .] construct, but we don't
13922          * handle this case, as that [. .] construct will later get parsed
13923          * itself and croak then.  And [= =] is checked for even when not under
13924          * /l, as Perl has long done so.
13925          *
13926          * The code below relies on there being a trailing NUL, so it doesn't
13927          * have to keep checking if the parse ptr < e.
13928          */
13929         if (temp_ptr[1] == open_char) {
13930             temp_ptr++;
13931         }
13932         else while (    temp_ptr < e
13933                     && (isWORDCHAR(*temp_ptr) || *temp_ptr == '-'))
13934         {
13935             temp_ptr++;
13936         }
13937 
13938         if (*temp_ptr == open_char) {
13939             temp_ptr++;
13940             if (*temp_ptr == ']') {
13941                 temp_ptr++;
13942                 if (! found_problem && ! check_only) {
13943                     RExC_parse = (char *) temp_ptr;
13944                     vFAIL3("POSIX syntax [%c %c] is reserved for future "
13945                             "extensions", open_char, open_char);
13946                 }
13947 
13948                 /* Here, the syntax wasn't completely valid, or else the call
13949                  * is to check-only */
13950                 if (updated_parse_ptr) {
13951                     *updated_parse_ptr = (char *) temp_ptr;
13952                 }
13953 
13954                 return OOB_NAMEDCLASS;
13955             }
13956         }
13957 
13958         /* If we find something that started out to look like one of these
13959          * constructs, but isn't, we continue below so that it can be checked
13960          * for being a class name with a typo of '.' or '=' instead of a colon.
13961          * */
13962     }
13963 
13964     /* Here, we think there is a possibility that a [: :] class was meant, and
13965      * we have the first real character.  It could be they think the '^' comes
13966      * first */
13967     if (*p == '^') {
13968         found_problem = TRUE;
13969         ADD_POSIX_WARNING(p + 1, "the '^' must come after the colon");
13970         complement = 1;
13971         p++;
13972 
13973         if (isBLANK(*p)) {
13974             found_problem = TRUE;
13975 
13976             do {
13977                 p++;
13978             } while (p < e && isBLANK(*p));
13979 
13980             ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
13981         }
13982     }
13983 
13984     /* But the first character should be a colon, which they could have easily
13985      * mistyped on a qwerty keyboard as a semi-colon (and which may be hard to
13986      * distinguish from a colon, so treat that as a colon).  */
13987     if (*p == ':') {
13988         p++;
13989         has_opening_colon = TRUE;
13990     }
13991     else if (*p == ';') {
13992         found_problem = TRUE;
13993         p++;
13994         ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
13995         has_opening_colon = TRUE;
13996     }
13997     else {
13998         found_problem = TRUE;
13999         ADD_POSIX_WARNING(p, "there must be a starting ':'");
14000 
14001         /* Consider an initial punctuation (not one of the recognized ones) to
14002          * be a left terminator */
14003         if (*p != '^' && *p != ']' && isPUNCT(*p)) {
14004             p++;
14005         }
14006     }
14007 
14008     /* They may think that you can put spaces between the components */
14009     if (isBLANK(*p)) {
14010         found_problem = TRUE;
14011 
14012         do {
14013             p++;
14014         } while (p < e && isBLANK(*p));
14015 
14016         ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
14017     }
14018 
14019     if (*p == '^') {
14020 
14021         /* We consider something like [^:^alnum:]] to not have been intended to
14022          * be a posix class, but XXX maybe we should */
14023         if (complement) {
14024             return NOT_MEANT_TO_BE_A_POSIX_CLASS;
14025         }
14026 
14027         complement = 1;
14028         p++;
14029     }
14030 
14031     /* Again, they may think that you can put spaces between the components */
14032     if (isBLANK(*p)) {
14033         found_problem = TRUE;
14034 
14035         do {
14036             p++;
14037         } while (p < e && isBLANK(*p));
14038 
14039         ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
14040     }
14041 
14042     if (*p == ']') {
14043 
14044         /* XXX This ']' may be a typo, and something else was meant.  But
14045          * treating it as such creates enough complications, that that
14046          * possibility isn't currently considered here.  So we assume that the
14047          * ']' is what is intended, and if we've already found an initial '[',
14048          * this leaves this construct looking like [:] or [:^], which almost
14049          * certainly weren't intended to be posix classes */
14050         if (has_opening_bracket) {
14051             return NOT_MEANT_TO_BE_A_POSIX_CLASS;
14052         }
14053 
14054         /* But this function can be called when we parse the colon for
14055          * something like qr/[alpha:]]/, so we back up to look for the
14056          * beginning */
14057         p--;
14058 
14059         if (*p == ';') {
14060             found_problem = TRUE;
14061             ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
14062         }
14063         else if (*p != ':') {
14064 
14065             /* XXX We are currently very restrictive here, so this code doesn't
14066              * consider the possibility that, say, /[alpha.]]/ was intended to
14067              * be a posix class. */
14068             return NOT_MEANT_TO_BE_A_POSIX_CLASS;
14069         }
14070 
14071         /* Here we have something like 'foo:]'.  There was no initial colon,
14072          * and we back up over 'foo.  XXX Unlike the going forward case, we
14073          * don't handle typos of non-word chars in the middle */
14074         has_opening_colon = FALSE;
14075         p--;
14076 
14077         while (p > RExC_start && isWORDCHAR(*p)) {
14078             p--;
14079         }
14080         p++;
14081 
14082         /* Here, we have positioned ourselves to where we think the first
14083          * character in the potential class is */
14084     }
14085 
14086     /* Now the interior really starts.  There are certain key characters that
14087      * can end the interior, or these could just be typos.  To catch both
14088      * cases, we may have to do two passes.  In the first pass, we keep on
14089      * going unless we come to a sequence that matches
14090      *      qr/ [[:punct:]] [[:blank:]]* \] /xa
14091      * This means it takes a sequence to end the pass, so two typos in a row if
14092      * that wasn't what was intended.  If the class is perfectly formed, just
14093      * this one pass is needed.  We also stop if there are too many characters
14094      * being accumulated, but this number is deliberately set higher than any
14095      * real class.  It is set high enough so that someone who thinks that
14096      * 'alphanumeric' is a correct name would get warned that it wasn't.
14097      * While doing the pass, we keep track of where the key characters were in
14098      * it.  If we don't find an end to the class, and one of the key characters
14099      * was found, we redo the pass, but stop when we get to that character.
14100      * Thus the key character was considered a typo in the first pass, but a
14101      * terminator in the second.  If two key characters are found, we stop at
14102      * the second one in the first pass.  Again this can miss two typos, but
14103      * catches a single one
14104      *
14105      * In the first pass, 'possible_end' starts as NULL, and then gets set to
14106      * point to the first key character.  For the second pass, it starts as -1.
14107      * */
14108 
14109     name_start = p;
14110   parse_name:
14111     {
14112         bool has_blank               = FALSE;
14113         bool has_upper               = FALSE;
14114         bool has_terminating_colon   = FALSE;
14115         bool has_terminating_bracket = FALSE;
14116         bool has_semi_colon          = FALSE;
14117         unsigned int name_len        = 0;
14118         int punct_count              = 0;
14119 
14120         while (p < e) {
14121 
14122             /* Squeeze out blanks when looking up the class name below */
14123             if (isBLANK(*p) ) {
14124                 has_blank = TRUE;
14125                 found_problem = TRUE;
14126                 p++;
14127                 continue;
14128             }
14129 
14130             /* The name will end with a punctuation */
14131             if (isPUNCT(*p)) {
14132                 const char * peek = p + 1;
14133 
14134                 /* Treat any non-']' punctuation followed by a ']' (possibly
14135                  * with intervening blanks) as trying to terminate the class.
14136                  * ']]' is very likely to mean a class was intended (but
14137                  * missing the colon), but the warning message that gets
14138                  * generated shows the error position better if we exit the
14139                  * loop at the bottom (eventually), so skip it here. */
14140                 if (*p != ']') {
14141                     if (peek < e && isBLANK(*peek)) {
14142                         has_blank = TRUE;
14143                         found_problem = TRUE;
14144                         do {
14145                             peek++;
14146                         } while (peek < e && isBLANK(*peek));
14147                     }
14148 
14149                     if (peek < e && *peek == ']') {
14150                         has_terminating_bracket = TRUE;
14151                         if (*p == ':') {
14152                             has_terminating_colon = TRUE;
14153                         }
14154                         else if (*p == ';') {
14155                             has_semi_colon = TRUE;
14156                             has_terminating_colon = TRUE;
14157                         }
14158                         else {
14159                             found_problem = TRUE;
14160                         }
14161                         p = peek + 1;
14162                         goto try_posix;
14163                     }
14164                 }
14165 
14166                 /* Here we have punctuation we thought didn't end the class.
14167                  * Keep track of the position of the key characters that are
14168                  * more likely to have been class-enders */
14169                 if (*p == ']' || *p == '[' || *p == ':' || *p == ';') {
14170 
14171                     /* Allow just one such possible class-ender not actually
14172                      * ending the class. */
14173                     if (possible_end) {
14174                         break;
14175                     }
14176                     possible_end = p;
14177                 }
14178 
14179                 /* If we have too many punctuation characters, no use in
14180                  * keeping going */
14181                 if (++punct_count > max_distance) {
14182                     break;
14183                 }
14184 
14185                 /* Treat the punctuation as a typo. */
14186                 input_text[name_len++] = *p;
14187                 p++;
14188             }
14189             else if (isUPPER(*p)) { /* Use lowercase for lookup */
14190                 input_text[name_len++] = toLOWER(*p);
14191                 has_upper = TRUE;
14192                 found_problem = TRUE;
14193                 p++;
14194             } else if (! UTF || UTF8_IS_INVARIANT(*p)) {
14195                 input_text[name_len++] = *p;
14196                 p++;
14197             }
14198             else {
14199                 input_text[name_len++] = utf8_to_uvchr_buf((U8 *) p, e, NULL);
14200                 p+= UTF8SKIP(p);
14201             }
14202 
14203             /* The declaration of 'input_text' is how long we allow a potential
14204              * class name to be, before saying they didn't mean a class name at
14205              * all */
14206             if (name_len >= C_ARRAY_LENGTH(input_text)) {
14207                 break;
14208             }
14209         }
14210 
14211         /* We get to here when the possible class name hasn't been properly
14212          * terminated before:
14213          *   1) we ran off the end of the pattern; or
14214          *   2) found two characters, each of which might have been intended to
14215          *      be the name's terminator
14216          *   3) found so many punctuation characters in the purported name,
14217          *      that the edit distance to a valid one is exceeded
14218          *   4) we decided it was more characters than anyone could have
14219          *      intended to be one. */
14220 
14221         found_problem = TRUE;
14222 
14223         /* In the final two cases, we know that looking up what we've
14224          * accumulated won't lead to a match, even a fuzzy one. */
14225         if (   name_len >= C_ARRAY_LENGTH(input_text)
14226             || punct_count > max_distance)
14227         {
14228             /* If there was an intermediate key character that could have been
14229              * an intended end, redo the parse, but stop there */
14230             if (possible_end && possible_end != (char *) -1) {
14231                 possible_end = (char *) -1; /* Special signal value to say
14232                                                we've done a first pass */
14233                 p = name_start;
14234                 goto parse_name;
14235             }
14236 
14237             /* Otherwise, it can't have meant to have been a class */
14238             return NOT_MEANT_TO_BE_A_POSIX_CLASS;
14239         }
14240 
14241         /* If we ran off the end, and the final character was a punctuation
14242          * one, back up one, to look at that final one just below.  Later, we
14243          * will restore the parse pointer if appropriate */
14244         if (name_len && p == e && isPUNCT(*(p-1))) {
14245             p--;
14246             name_len--;
14247         }
14248 
14249         if (p < e && isPUNCT(*p)) {
14250             if (*p == ']') {
14251                 has_terminating_bracket = TRUE;
14252 
14253                 /* If this is a 2nd ']', and the first one is just below this
14254                  * one, consider that to be the real terminator.  This gives a
14255                  * uniform and better positioning for the warning message  */
14256                 if (   possible_end
14257                     && possible_end != (char *) -1
14258                     && *possible_end == ']'
14259                     && name_len && input_text[name_len - 1] == ']')
14260                 {
14261                     name_len--;
14262                     p = possible_end;
14263 
14264                     /* And this is actually equivalent to having done the 2nd
14265                      * pass now, so set it to not try again */
14266                     possible_end = (char *) -1;
14267                 }
14268             }
14269             else {
14270                 if (*p == ':') {
14271                     has_terminating_colon = TRUE;
14272                 }
14273                 else if (*p == ';') {
14274                     has_semi_colon = TRUE;
14275                     has_terminating_colon = TRUE;
14276                 }
14277                 p++;
14278             }
14279         }
14280 
14281     try_posix:
14282 
14283         /* Here, we have a class name to look up.  We can short circuit the
14284          * stuff below for short names that can't possibly be meant to be a
14285          * class name.  (We can do this on the first pass, as any second pass
14286          * will yield an even shorter name) */
14287         if (name_len < 3) {
14288             return NOT_MEANT_TO_BE_A_POSIX_CLASS;
14289         }
14290 
14291         /* Find which class it is.  Initially switch on the length of the name.
14292          * */
14293         switch (name_len) {
14294             case 4:
14295                 if (memEQ(name_start, "word", 4)) {
14296                     /* this is not POSIX, this is the Perl \w */
14297                     class_number = ANYOF_WORDCHAR;
14298                 }
14299                 break;
14300             case 5:
14301                 /* Names all of length 5: alnum alpha ascii blank cntrl digit
14302                  *                        graph lower print punct space upper
14303                  * Offset 4 gives the best switch position.  */
14304                 switch (name_start[4]) {
14305                     case 'a':
14306                         if (memEQ(name_start, "alph", 4)) /* alpha */
14307                             class_number = ANYOF_ALPHA;
14308                         break;
14309                     case 'e':
14310                         if (memEQ(name_start, "spac", 4)) /* space */
14311                             class_number = ANYOF_SPACE;
14312                         break;
14313                     case 'h':
14314                         if (memEQ(name_start, "grap", 4)) /* graph */
14315                             class_number = ANYOF_GRAPH;
14316                         break;
14317                     case 'i':
14318                         if (memEQ(name_start, "asci", 4)) /* ascii */
14319                             class_number = ANYOF_ASCII;
14320                         break;
14321                     case 'k':
14322                         if (memEQ(name_start, "blan", 4)) /* blank */
14323                             class_number = ANYOF_BLANK;
14324                         break;
14325                     case 'l':
14326                         if (memEQ(name_start, "cntr", 4)) /* cntrl */
14327                             class_number = ANYOF_CNTRL;
14328                         break;
14329                     case 'm':
14330                         if (memEQ(name_start, "alnu", 4)) /* alnum */
14331                             class_number = ANYOF_ALPHANUMERIC;
14332                         break;
14333                     case 'r':
14334                         if (memEQ(name_start, "lowe", 4)) /* lower */
14335                             class_number = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
14336                         else if (memEQ(name_start, "uppe", 4)) /* upper */
14337                             class_number = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
14338                         break;
14339                     case 't':
14340                         if (memEQ(name_start, "digi", 4)) /* digit */
14341                             class_number = ANYOF_DIGIT;
14342                         else if (memEQ(name_start, "prin", 4)) /* print */
14343                             class_number = ANYOF_PRINT;
14344                         else if (memEQ(name_start, "punc", 4)) /* punct */
14345                             class_number = ANYOF_PUNCT;
14346                         break;
14347                 }
14348                 break;
14349             case 6:
14350                 if (memEQ(name_start, "xdigit", 6))
14351                     class_number = ANYOF_XDIGIT;
14352                 break;
14353         }
14354 
14355         /* If the name exactly matches a posix class name the class number will
14356          * here be set to it, and the input almost certainly was meant to be a
14357          * posix class, so we can skip further checking.  If instead the syntax
14358          * is exactly correct, but the name isn't one of the legal ones, we
14359          * will return that as an error below.  But if neither of these apply,
14360          * it could be that no posix class was intended at all, or that one
14361          * was, but there was a typo.  We tease these apart by doing fuzzy
14362          * matching on the name */
14363         if (class_number == OOB_NAMEDCLASS && found_problem) {
14364             const UV posix_names[][6] = {
14365                                                 { 'a', 'l', 'n', 'u', 'm' },
14366                                                 { 'a', 'l', 'p', 'h', 'a' },
14367                                                 { 'a', 's', 'c', 'i', 'i' },
14368                                                 { 'b', 'l', 'a', 'n', 'k' },
14369                                                 { 'c', 'n', 't', 'r', 'l' },
14370                                                 { 'd', 'i', 'g', 'i', 't' },
14371                                                 { 'g', 'r', 'a', 'p', 'h' },
14372                                                 { 'l', 'o', 'w', 'e', 'r' },
14373                                                 { 'p', 'r', 'i', 'n', 't' },
14374                                                 { 'p', 'u', 'n', 'c', 't' },
14375                                                 { 's', 'p', 'a', 'c', 'e' },
14376                                                 { 'u', 'p', 'p', 'e', 'r' },
14377                                                 { 'w', 'o', 'r', 'd' },
14378                                                 { 'x', 'd', 'i', 'g', 'i', 't' }
14379                                             };
14380             /* The names of the above all have added NULs to make them the same
14381              * size, so we need to also have the real lengths */
14382             const UV posix_name_lengths[] = {
14383                                                 sizeof("alnum") - 1,
14384                                                 sizeof("alpha") - 1,
14385                                                 sizeof("ascii") - 1,
14386                                                 sizeof("blank") - 1,
14387                                                 sizeof("cntrl") - 1,
14388                                                 sizeof("digit") - 1,
14389                                                 sizeof("graph") - 1,
14390                                                 sizeof("lower") - 1,
14391                                                 sizeof("print") - 1,
14392                                                 sizeof("punct") - 1,
14393                                                 sizeof("space") - 1,
14394                                                 sizeof("upper") - 1,
14395                                                 sizeof("word")  - 1,
14396                                                 sizeof("xdigit")- 1
14397                                             };
14398             unsigned int i;
14399             int temp_max = max_distance;    /* Use a temporary, so if we
14400                                                reparse, we haven't changed the
14401                                                outer one */
14402 
14403             /* Use a smaller max edit distance if we are missing one of the
14404              * delimiters */
14405             if (   has_opening_bracket + has_opening_colon < 2
14406                 || has_terminating_bracket + has_terminating_colon < 2)
14407             {
14408                 temp_max--;
14409             }
14410 
14411             /* See if the input name is close to a legal one */
14412             for (i = 0; i < C_ARRAY_LENGTH(posix_names); i++) {
14413 
14414                 /* Short circuit call if the lengths are too far apart to be
14415                  * able to match */
14416                 if (abs( (int) (name_len - posix_name_lengths[i]))
14417                     > temp_max)
14418                 {
14419                     continue;
14420                 }
14421 
14422                 if (edit_distance(input_text,
14423                                   posix_names[i],
14424                                   name_len,
14425                                   posix_name_lengths[i],
14426                                   temp_max
14427                                  )
14428                     > -1)
14429                 { /* If it is close, it probably was intended to be a class */
14430                     goto probably_meant_to_be;
14431                 }
14432             }
14433 
14434             /* Here the input name is not close enough to a valid class name
14435              * for us to consider it to be intended to be a posix class.  If
14436              * we haven't already done so, and the parse found a character that
14437              * could have been terminators for the name, but which we absorbed
14438              * as typos during the first pass, repeat the parse, signalling it
14439              * to stop at that character */
14440             if (possible_end && possible_end != (char *) -1) {
14441                 possible_end = (char *) -1;
14442                 p = name_start;
14443                 goto parse_name;
14444             }
14445 
14446             /* Here neither pass found a close-enough class name */
14447             return NOT_MEANT_TO_BE_A_POSIX_CLASS;
14448         }
14449 
14450     probably_meant_to_be:
14451 
14452         /* Here we think that a posix specification was intended.  Update any
14453          * parse pointer */
14454         if (updated_parse_ptr) {
14455             *updated_parse_ptr = (char *) p;
14456         }
14457 
14458         /* If a posix class name was intended but incorrectly specified, we
14459          * output or return the warnings */
14460         if (found_problem) {
14461 
14462             /* We set flags for these issues in the parse loop above instead of
14463              * adding them to the list of warnings, because we can parse it
14464              * twice, and we only want one warning instance */
14465             if (has_upper) {
14466                 ADD_POSIX_WARNING(p, "the name must be all lowercase letters");
14467             }
14468             if (has_blank) {
14469                 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
14470             }
14471             if (has_semi_colon) {
14472                 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
14473             }
14474             else if (! has_terminating_colon) {
14475                 ADD_POSIX_WARNING(p, "there is no terminating ':'");
14476             }
14477             if (! has_terminating_bracket) {
14478                 ADD_POSIX_WARNING(p, "there is no terminating ']'");
14479             }
14480 
14481             if (posix_warnings && RExC_warn_text && av_top_index(RExC_warn_text) > -1) {
14482                 *posix_warnings = RExC_warn_text;
14483             }
14484         }
14485         else if (class_number != OOB_NAMEDCLASS) {
14486             /* If it is a known class, return the class.  The class number
14487              * #defines are structured so each complement is +1 to the normal
14488              * one */
14489             return class_number + complement;
14490         }
14491         else if (! check_only) {
14492 
14493             /* Here, it is an unrecognized class.  This is an error (unless the
14494             * call is to check only, which we've already handled above) */
14495             const char * const complement_string = (complement)
14496                                                    ? "^"
14497                                                    : "";
14498             RExC_parse = (char *) p;
14499             vFAIL3utf8f("POSIX class [:%s%"UTF8f":] unknown",
14500                         complement_string,
14501                         UTF8fARG(UTF, RExC_parse - name_start - 2, name_start));
14502         }
14503     }
14504 
14505     return OOB_NAMEDCLASS;
14506 }
14507 #undef ADD_POSIX_WARNING
14508 
14509 STATIC unsigned  int
14510 S_regex_set_precedence(const U8 my_operator) {
14511 
14512     /* Returns the precedence in the (?[...]) construct of the input operator,
14513      * specified by its character representation.  The precedence follows
14514      * general Perl rules, but it extends this so that ')' and ']' have (low)
14515      * precedence even though they aren't really operators */
14516 
14517     switch (my_operator) {
14518         case '!':
14519             return 5;
14520         case '&':
14521             return 4;
14522         case '^':
14523         case '|':
14524         case '+':
14525         case '-':
14526             return 3;
14527         case ')':
14528             return 2;
14529         case ']':
14530             return 1;
14531     }
14532 
14533     NOT_REACHED; /* NOTREACHED */
14534     return 0;   /* Silence compiler warning */
14535 }
14536 
14537 STATIC regnode *
14538 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
14539                     I32 *flagp, U32 depth,
14540                     char * const oregcomp_parse)
14541 {
14542     /* Handle the (?[...]) construct to do set operations */
14543 
14544     U8 curchar;                     /* Current character being parsed */
14545     UV start, end;	            /* End points of code point ranges */
14546     SV* final = NULL;               /* The end result inversion list */
14547     SV* result_string;              /* 'final' stringified */
14548     AV* stack;                      /* stack of operators and operands not yet
14549                                        resolved */
14550     AV* fence_stack = NULL;         /* A stack containing the positions in
14551                                        'stack' of where the undealt-with left
14552                                        parens would be if they were actually
14553                                        put there */
14554     /* The 'VOL' (expanding to 'volatile') is a workaround for an optimiser bug
14555      * in Solaris Studio 12.3. See RT #127455 */
14556     VOL IV fence = 0;               /* Position of where most recent undealt-
14557                                        with left paren in stack is; -1 if none.
14558                                      */
14559     STRLEN len;                     /* Temporary */
14560     regnode* node;                  /* Temporary, and final regnode returned by
14561                                        this function */
14562     const bool save_fold = FOLD;    /* Temporary */
14563     char *save_end, *save_parse;    /* Temporaries */
14564     const bool in_locale = LOC;     /* we turn off /l during processing */
14565     AV* posix_warnings = NULL;
14566 
14567     GET_RE_DEBUG_FLAGS_DECL;
14568 
14569     PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
14570 
14571     if (in_locale) {
14572         set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
14573     }
14574 
14575     REQUIRE_UNI_RULES(flagp, NULL);   /* The use of this operator implies /u.
14576                                          This is required so that the compile
14577                                          time values are valid in all runtime
14578                                          cases */
14579 
14580     /* This will return only an ANYOF regnode, or (unlikely) something smaller
14581      * (such as EXACT).  Thus we can skip most everything if just sizing.  We
14582      * call regclass to handle '[]' so as to not have to reinvent its parsing
14583      * rules here (throwing away the size it computes each time).  And, we exit
14584      * upon an unescaped ']' that isn't one ending a regclass.  To do both
14585      * these things, we need to realize that something preceded by a backslash
14586      * is escaped, so we have to keep track of backslashes */
14587     if (SIZE_ONLY) {
14588         UV depth = 0; /* how many nested (?[...]) constructs */
14589 
14590         while (RExC_parse < RExC_end) {
14591             SV* current = NULL;
14592 
14593             skip_to_be_ignored_text(pRExC_state, &RExC_parse,
14594                                     TRUE /* Force /x */ );
14595 
14596             switch (*RExC_parse) {
14597                 case '?':
14598                     if (RExC_parse[1] == '[') depth++, RExC_parse++;
14599                     /* FALLTHROUGH */
14600                 default:
14601                     break;
14602                 case '\\':
14603                     /* Skip past this, so the next character gets skipped, after
14604                      * the switch */
14605                     RExC_parse++;
14606                     if (*RExC_parse == 'c') {
14607                             /* Skip the \cX notation for control characters */
14608                             RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
14609                     }
14610                     break;
14611 
14612                 case '[':
14613                 {
14614                     /* See if this is a [:posix:] class. */
14615                     bool is_posix_class = (OOB_NAMEDCLASS
14616                             < handle_possible_posix(pRExC_state,
14617                                                 RExC_parse + 1,
14618                                                 NULL,
14619                                                 NULL,
14620                                                 TRUE /* checking only */));
14621                     /* If it is a posix class, leave the parse pointer at the
14622                      * '[' to fool regclass() into thinking it is part of a
14623                      * '[[:posix:]]'. */
14624                     if (! is_posix_class) {
14625                         RExC_parse++;
14626                     }
14627 
14628                     /* regclass() can only return RESTART_PASS1 and NEED_UTF8
14629                      * if multi-char folds are allowed.  */
14630                     if (!regclass(pRExC_state, flagp,depth+1,
14631                                   is_posix_class, /* parse the whole char
14632                                                      class only if not a
14633                                                      posix class */
14634                                   FALSE, /* don't allow multi-char folds */
14635                                   TRUE, /* silence non-portable warnings. */
14636                                   TRUE, /* strict */
14637                                   FALSE, /* Require return to be an ANYOF */
14638                                   &current,
14639                                   &posix_warnings
14640                                  ))
14641                         FAIL2("panic: regclass returned NULL to handle_sets, "
14642                               "flags=%#"UVxf"", (UV) *flagp);
14643 
14644                     /* function call leaves parse pointing to the ']', except
14645                      * if we faked it */
14646                     if (is_posix_class) {
14647                         RExC_parse--;
14648                     }
14649 
14650                     SvREFCNT_dec(current);   /* In case it returned something */
14651                     break;
14652                 }
14653 
14654                 case ']':
14655                     if (depth--) break;
14656                     RExC_parse++;
14657                     if (*RExC_parse == ')') {
14658                         node = reganode(pRExC_state, ANYOF, 0);
14659                         RExC_size += ANYOF_SKIP;
14660                         nextchar(pRExC_state);
14661                         Set_Node_Length(node,
14662                                 RExC_parse - oregcomp_parse + 1); /* MJD */
14663                         if (in_locale) {
14664                             set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
14665                         }
14666 
14667                         return node;
14668                     }
14669                     goto no_close;
14670             }
14671 
14672             RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
14673         }
14674 
14675       no_close:
14676         /* We output the messages even if warnings are off, because we'll fail
14677          * the very next thing, and these give a likely diagnosis for that */
14678         if (posix_warnings && av_tindex_nomg(posix_warnings) >= 0) {
14679             output_or_return_posix_warnings(pRExC_state, posix_warnings, NULL);
14680         }
14681 
14682         FAIL("Syntax error in (?[...])");
14683     }
14684 
14685     /* Pass 2 only after this. */
14686     Perl_ck_warner_d(aTHX_
14687         packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
14688         "The regex_sets feature is experimental" REPORT_LOCATION,
14689         REPORT_LOCATION_ARGS(RExC_parse));
14690 
14691     /* Everything in this construct is a metacharacter.  Operands begin with
14692      * either a '\' (for an escape sequence), or a '[' for a bracketed
14693      * character class.  Any other character should be an operator, or
14694      * parenthesis for grouping.  Both types of operands are handled by calling
14695      * regclass() to parse them.  It is called with a parameter to indicate to
14696      * return the computed inversion list.  The parsing here is implemented via
14697      * a stack.  Each entry on the stack is a single character representing one
14698      * of the operators; or else a pointer to an operand inversion list. */
14699 
14700 #define IS_OPERATOR(a) SvIOK(a)
14701 #define IS_OPERAND(a)  (! IS_OPERATOR(a))
14702 
14703     /* The stack is kept in Łukasiewicz order.  (That's pronounced similar
14704      * to luke-a-shave-itch (or -itz), but people who didn't want to bother
14705      * with pronouncing it called it Reverse Polish instead, but now that YOU
14706      * know how to pronounce it you can use the correct term, thus giving due
14707      * credit to the person who invented it, and impressing your geek friends.
14708      * Wikipedia says that the pronounciation of "Ł" has been changing so that
14709      * it is now more like an English initial W (as in wonk) than an L.)
14710      *
14711      * This means that, for example, 'a | b & c' is stored on the stack as
14712      *
14713      * c  [4]
14714      * b  [3]
14715      * &  [2]
14716      * a  [1]
14717      * |  [0]
14718      *
14719      * where the numbers in brackets give the stack [array] element number.
14720      * In this implementation, parentheses are not stored on the stack.
14721      * Instead a '(' creates a "fence" so that the part of the stack below the
14722      * fence is invisible except to the corresponding ')' (this allows us to
14723      * replace testing for parens, by using instead subtraction of the fence
14724      * position).  As new operands are processed they are pushed onto the stack
14725      * (except as noted in the next paragraph).  New operators of higher
14726      * precedence than the current final one are inserted on the stack before
14727      * the lhs operand (so that when the rhs is pushed next, everything will be
14728      * in the correct positions shown above.  When an operator of equal or
14729      * lower precedence is encountered in parsing, all the stacked operations
14730      * of equal or higher precedence are evaluated, leaving the result as the
14731      * top entry on the stack.  This makes higher precedence operations
14732      * evaluate before lower precedence ones, and causes operations of equal
14733      * precedence to left associate.
14734      *
14735      * The only unary operator '!' is immediately pushed onto the stack when
14736      * encountered.  When an operand is encountered, if the top of the stack is
14737      * a '!", the complement is immediately performed, and the '!' popped.  The
14738      * resulting value is treated as a new operand, and the logic in the
14739      * previous paragraph is executed.  Thus in the expression
14740      *      [a] + ! [b]
14741      * the stack looks like
14742      *
14743      * !
14744      * a
14745      * +
14746      *
14747      * as 'b' gets parsed, the latter gets evaluated to '!b', and the stack
14748      * becomes
14749      *
14750      * !b
14751      * a
14752      * +
14753      *
14754      * A ')' is treated as an operator with lower precedence than all the
14755      * aforementioned ones, which causes all operations on the stack above the
14756      * corresponding '(' to be evaluated down to a single resultant operand.
14757      * Then the fence for the '(' is removed, and the operand goes through the
14758      * algorithm above, without the fence.
14759      *
14760      * A separate stack is kept of the fence positions, so that the position of
14761      * the latest so-far unbalanced '(' is at the top of it.
14762      *
14763      * The ']' ending the construct is treated as the lowest operator of all,
14764      * so that everything gets evaluated down to a single operand, which is the
14765      * result */
14766 
14767     sv_2mortal((SV *)(stack = newAV()));
14768     sv_2mortal((SV *)(fence_stack = newAV()));
14769 
14770     while (RExC_parse < RExC_end) {
14771         I32 top_index;              /* Index of top-most element in 'stack' */
14772         SV** top_ptr;               /* Pointer to top 'stack' element */
14773         SV* current = NULL;         /* To contain the current inversion list
14774                                        operand */
14775         SV* only_to_avoid_leaks;
14776 
14777         skip_to_be_ignored_text(pRExC_state, &RExC_parse,
14778                                 TRUE /* Force /x */ );
14779         if (RExC_parse >= RExC_end) {
14780             Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
14781         }
14782 
14783         curchar = UCHARAT(RExC_parse);
14784 
14785 redo_curchar:
14786 
14787         top_index = av_tindex_nomg(stack);
14788 
14789         switch (curchar) {
14790             SV** stacked_ptr;       /* Ptr to something already on 'stack' */
14791             char stacked_operator;  /* The topmost operator on the 'stack'. */
14792             SV* lhs;                /* Operand to the left of the operator */
14793             SV* rhs;                /* Operand to the right of the operator */
14794             SV* fence_ptr;          /* Pointer to top element of the fence
14795                                        stack */
14796 
14797             case '(':
14798 
14799                 if (   RExC_parse < RExC_end - 1
14800                     && (UCHARAT(RExC_parse + 1) == '?'))
14801                 {
14802                     /* If is a '(?', could be an embedded '(?flags:(?[...])'.
14803                      * This happens when we have some thing like
14804                      *
14805                      *   my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
14806                      *   ...
14807                      *   qr/(?[ \p{Digit} & $thai_or_lao ])/;
14808                      *
14809                      * Here we would be handling the interpolated
14810                      * '$thai_or_lao'.  We handle this by a recursive call to
14811                      * ourselves which returns the inversion list the
14812                      * interpolated expression evaluates to.  We use the flags
14813                      * from the interpolated pattern. */
14814                     U32 save_flags = RExC_flags;
14815                     const char * save_parse;
14816 
14817                     RExC_parse += 2;        /* Skip past the '(?' */
14818                     save_parse = RExC_parse;
14819 
14820                     /* Parse any flags for the '(?' */
14821                     parse_lparen_question_flags(pRExC_state);
14822 
14823                     if (RExC_parse == save_parse  /* Makes sure there was at
14824                                                      least one flag (or else
14825                                                      this embedding wasn't
14826                                                      compiled) */
14827                         || RExC_parse >= RExC_end - 4
14828                         || UCHARAT(RExC_parse) != ':'
14829                         || UCHARAT(++RExC_parse) != '('
14830                         || UCHARAT(++RExC_parse) != '?'
14831                         || UCHARAT(++RExC_parse) != '[')
14832                     {
14833 
14834                         /* In combination with the above, this moves the
14835                          * pointer to the point just after the first erroneous
14836                          * character (or if there are no flags, to where they
14837                          * should have been) */
14838                         if (RExC_parse >= RExC_end - 4) {
14839                             RExC_parse = RExC_end;
14840                         }
14841                         else if (RExC_parse != save_parse) {
14842                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
14843                         }
14844                         vFAIL("Expecting '(?flags:(?[...'");
14845                     }
14846 
14847                     /* Recurse, with the meat of the embedded expression */
14848                     RExC_parse++;
14849                     (void) handle_regex_sets(pRExC_state, &current, flagp,
14850                                                     depth+1, oregcomp_parse);
14851 
14852                     /* Here, 'current' contains the embedded expression's
14853                      * inversion list, and RExC_parse points to the trailing
14854                      * ']'; the next character should be the ')' */
14855                     RExC_parse++;
14856                     assert(UCHARAT(RExC_parse) == ')');
14857 
14858                     /* Then the ')' matching the original '(' handled by this
14859                      * case: statement */
14860                     RExC_parse++;
14861                     assert(UCHARAT(RExC_parse) == ')');
14862 
14863                     RExC_parse++;
14864                     RExC_flags = save_flags;
14865                     goto handle_operand;
14866                 }
14867 
14868                 /* A regular '('.  Look behind for illegal syntax */
14869                 if (top_index - fence >= 0) {
14870                     /* If the top entry on the stack is an operator, it had
14871                      * better be a '!', otherwise the entry below the top
14872                      * operand should be an operator */
14873                     if (   ! (top_ptr = av_fetch(stack, top_index, FALSE))
14874                         || (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) != '!')
14875                         || (   IS_OPERAND(*top_ptr)
14876                             && (   top_index - fence < 1
14877                                 || ! (stacked_ptr = av_fetch(stack,
14878                                                              top_index - 1,
14879                                                              FALSE))
14880                                 || ! IS_OPERATOR(*stacked_ptr))))
14881                     {
14882                         RExC_parse++;
14883                         vFAIL("Unexpected '(' with no preceding operator");
14884                     }
14885                 }
14886 
14887                 /* Stack the position of this undealt-with left paren */
14888                 av_push(fence_stack, newSViv(fence));
14889                 fence = top_index + 1;
14890                 break;
14891 
14892             case '\\':
14893                 /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if
14894                  * multi-char folds are allowed.  */
14895                 if (!regclass(pRExC_state, flagp,depth+1,
14896                               TRUE, /* means parse just the next thing */
14897                               FALSE, /* don't allow multi-char folds */
14898                               FALSE, /* don't silence non-portable warnings.  */
14899                               TRUE,  /* strict */
14900                               FALSE, /* Require return to be an ANYOF */
14901                               &current,
14902                               NULL))
14903                 {
14904                     FAIL2("panic: regclass returned NULL to handle_sets, "
14905                           "flags=%#"UVxf"", (UV) *flagp);
14906                 }
14907 
14908                 /* regclass() will return with parsing just the \ sequence,
14909                  * leaving the parse pointer at the next thing to parse */
14910                 RExC_parse--;
14911                 goto handle_operand;
14912 
14913             case '[':   /* Is a bracketed character class */
14914             {
14915                 /* See if this is a [:posix:] class. */
14916                 bool is_posix_class = (OOB_NAMEDCLASS
14917                             < handle_possible_posix(pRExC_state,
14918                                                 RExC_parse + 1,
14919                                                 NULL,
14920                                                 NULL,
14921                                                 TRUE /* checking only */));
14922                 /* If it is a posix class, leave the parse pointer at the '['
14923                  * to fool regclass() into thinking it is part of a
14924                  * '[[:posix:]]'. */
14925                 if (! is_posix_class) {
14926                     RExC_parse++;
14927                 }
14928 
14929                 /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if
14930                  * multi-char folds are allowed.  */
14931                 if (!regclass(pRExC_state, flagp,depth+1,
14932                                 is_posix_class, /* parse the whole char
14933                                                     class only if not a
14934                                                     posix class */
14935                                 FALSE, /* don't allow multi-char folds */
14936                                 TRUE, /* silence non-portable warnings. */
14937                                 TRUE, /* strict */
14938                                 FALSE, /* Require return to be an ANYOF */
14939                                 &current,
14940                                 NULL
14941                                 ))
14942                 {
14943                     FAIL2("panic: regclass returned NULL to handle_sets, "
14944                           "flags=%#"UVxf"", (UV) *flagp);
14945                 }
14946 
14947                 /* function call leaves parse pointing to the ']', except if we
14948                  * faked it */
14949                 if (is_posix_class) {
14950                     RExC_parse--;
14951                 }
14952 
14953                 goto handle_operand;
14954             }
14955 
14956             case ']':
14957                 if (top_index >= 1) {
14958                     goto join_operators;
14959                 }
14960 
14961                 /* Only a single operand on the stack: are done */
14962                 goto done;
14963 
14964             case ')':
14965                 if (av_tindex_nomg(fence_stack) < 0) {
14966                     RExC_parse++;
14967                     vFAIL("Unexpected ')'");
14968                 }
14969 
14970                 /* If nothing after the fence, is missing an operand */
14971                 if (top_index - fence < 0) {
14972                     RExC_parse++;
14973                     goto bad_syntax;
14974                 }
14975                 /* If at least two things on the stack, treat this as an
14976                   * operator */
14977                 if (top_index - fence >= 1) {
14978                     goto join_operators;
14979                 }
14980 
14981                 /* Here only a single thing on the fenced stack, and there is a
14982                  * fence.  Get rid of it */
14983                 fence_ptr = av_pop(fence_stack);
14984                 assert(fence_ptr);
14985                 fence = SvIV(fence_ptr) - 1;
14986                 SvREFCNT_dec_NN(fence_ptr);
14987                 fence_ptr = NULL;
14988 
14989                 if (fence < 0) {
14990                     fence = 0;
14991                 }
14992 
14993                 /* Having gotten rid of the fence, we pop the operand at the
14994                  * stack top and process it as a newly encountered operand */
14995                 current = av_pop(stack);
14996                 if (IS_OPERAND(current)) {
14997                     goto handle_operand;
14998                 }
14999 
15000                 RExC_parse++;
15001                 goto bad_syntax;
15002 
15003             case '&':
15004             case '|':
15005             case '+':
15006             case '-':
15007             case '^':
15008 
15009                 /* These binary operators should have a left operand already
15010                  * parsed */
15011                 if (   top_index - fence < 0
15012                     || top_index - fence == 1
15013                     || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
15014                     || ! IS_OPERAND(*top_ptr))
15015                 {
15016                     goto unexpected_binary;
15017                 }
15018 
15019                 /* If only the one operand is on the part of the stack visible
15020                  * to us, we just place this operator in the proper position */
15021                 if (top_index - fence < 2) {
15022 
15023                     /* Place the operator before the operand */
15024 
15025                     SV* lhs = av_pop(stack);
15026                     av_push(stack, newSVuv(curchar));
15027                     av_push(stack, lhs);
15028                     break;
15029                 }
15030 
15031                 /* But if there is something else on the stack, we need to
15032                  * process it before this new operator if and only if the
15033                  * stacked operation has equal or higher precedence than the
15034                  * new one */
15035 
15036              join_operators:
15037 
15038                 /* The operator on the stack is supposed to be below both its
15039                  * operands */
15040                 if (   ! (stacked_ptr = av_fetch(stack, top_index - 2, FALSE))
15041                     || IS_OPERAND(*stacked_ptr))
15042                 {
15043                     /* But if not, it's legal and indicates we are completely
15044                      * done if and only if we're currently processing a ']',
15045                      * which should be the final thing in the expression */
15046                     if (curchar == ']') {
15047                         goto done;
15048                     }
15049 
15050                   unexpected_binary:
15051                     RExC_parse++;
15052                     vFAIL2("Unexpected binary operator '%c' with no "
15053                            "preceding operand", curchar);
15054                 }
15055                 stacked_operator = (char) SvUV(*stacked_ptr);
15056 
15057                 if (regex_set_precedence(curchar)
15058                     > regex_set_precedence(stacked_operator))
15059                 {
15060                     /* Here, the new operator has higher precedence than the
15061                      * stacked one.  This means we need to add the new one to
15062                      * the stack to await its rhs operand (and maybe more
15063                      * stuff).  We put it before the lhs operand, leaving
15064                      * untouched the stacked operator and everything below it
15065                      * */
15066                     lhs = av_pop(stack);
15067                     assert(IS_OPERAND(lhs));
15068 
15069                     av_push(stack, newSVuv(curchar));
15070                     av_push(stack, lhs);
15071                     break;
15072                 }
15073 
15074                 /* Here, the new operator has equal or lower precedence than
15075                  * what's already there.  This means the operation already
15076                  * there should be performed now, before the new one. */
15077 
15078                 rhs = av_pop(stack);
15079                 if (! IS_OPERAND(rhs)) {
15080 
15081                     /* This can happen when a ! is not followed by an operand,
15082                      * like in /(?[\t &!])/ */
15083                     goto bad_syntax;
15084                 }
15085 
15086                 lhs = av_pop(stack);
15087 
15088                 if (! IS_OPERAND(lhs)) {
15089 
15090                     /* This can happen when there is an empty (), like in
15091                      * /(?[[0]+()+])/ */
15092                     goto bad_syntax;
15093                 }
15094 
15095                 switch (stacked_operator) {
15096                     case '&':
15097                         _invlist_intersection(lhs, rhs, &rhs);
15098                         break;
15099 
15100                     case '|':
15101                     case '+':
15102                         _invlist_union(lhs, rhs, &rhs);
15103                         break;
15104 
15105                     case '-':
15106                         _invlist_subtract(lhs, rhs, &rhs);
15107                         break;
15108 
15109                     case '^':   /* The union minus the intersection */
15110                     {
15111                         SV* i = NULL;
15112                         SV* u = NULL;
15113                         SV* element;
15114 
15115                         _invlist_union(lhs, rhs, &u);
15116                         _invlist_intersection(lhs, rhs, &i);
15117                         /* _invlist_subtract will overwrite rhs
15118                             without freeing what it already contains */
15119                         element = rhs;
15120                         _invlist_subtract(u, i, &rhs);
15121                         SvREFCNT_dec_NN(i);
15122                         SvREFCNT_dec_NN(u);
15123                         SvREFCNT_dec_NN(element);
15124                         break;
15125                     }
15126                 }
15127                 SvREFCNT_dec(lhs);
15128 
15129                 /* Here, the higher precedence operation has been done, and the
15130                  * result is in 'rhs'.  We overwrite the stacked operator with
15131                  * the result.  Then we redo this code to either push the new
15132                  * operator onto the stack or perform any higher precedence
15133                  * stacked operation */
15134                 only_to_avoid_leaks = av_pop(stack);
15135                 SvREFCNT_dec(only_to_avoid_leaks);
15136                 av_push(stack, rhs);
15137                 goto redo_curchar;
15138 
15139             case '!':   /* Highest priority, right associative */
15140 
15141                 /* If what's already at the top of the stack is another '!",
15142                  * they just cancel each other out */
15143                 if (   (top_ptr = av_fetch(stack, top_index, FALSE))
15144                     && (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) == '!'))
15145                 {
15146                     only_to_avoid_leaks = av_pop(stack);
15147                     SvREFCNT_dec(only_to_avoid_leaks);
15148                 }
15149                 else { /* Otherwise, since it's right associative, just push
15150                           onto the stack */
15151                     av_push(stack, newSVuv(curchar));
15152                 }
15153                 break;
15154 
15155             default:
15156                 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
15157                 vFAIL("Unexpected character");
15158 
15159           handle_operand:
15160 
15161             /* Here 'current' is the operand.  If something is already on the
15162              * stack, we have to check if it is a !.  But first, the code above
15163              * may have altered the stack in the time since we earlier set
15164              * 'top_index'.  */
15165 
15166             top_index = av_tindex_nomg(stack);
15167             if (top_index - fence >= 0) {
15168                 /* If the top entry on the stack is an operator, it had better
15169                  * be a '!', otherwise the entry below the top operand should
15170                  * be an operator */
15171                 top_ptr = av_fetch(stack, top_index, FALSE);
15172                 assert(top_ptr);
15173                 if (IS_OPERATOR(*top_ptr)) {
15174 
15175                     /* The only permissible operator at the top of the stack is
15176                      * '!', which is applied immediately to this operand. */
15177                     curchar = (char) SvUV(*top_ptr);
15178                     if (curchar != '!') {
15179                         SvREFCNT_dec(current);
15180                         vFAIL2("Unexpected binary operator '%c' with no "
15181                                 "preceding operand", curchar);
15182                     }
15183 
15184                     _invlist_invert(current);
15185 
15186                     only_to_avoid_leaks = av_pop(stack);
15187                     SvREFCNT_dec(only_to_avoid_leaks);
15188 
15189                     /* And we redo with the inverted operand.  This allows
15190                      * handling multiple ! in a row */
15191                     goto handle_operand;
15192                 }
15193                           /* Single operand is ok only for the non-binary ')'
15194                            * operator */
15195                 else if ((top_index - fence == 0 && curchar != ')')
15196                          || (top_index - fence > 0
15197                              && (! (stacked_ptr = av_fetch(stack,
15198                                                            top_index - 1,
15199                                                            FALSE))
15200                                  || IS_OPERAND(*stacked_ptr))))
15201                 {
15202                     SvREFCNT_dec(current);
15203                     vFAIL("Operand with no preceding operator");
15204                 }
15205             }
15206 
15207             /* Here there was nothing on the stack or the top element was
15208              * another operand.  Just add this new one */
15209             av_push(stack, current);
15210 
15211         } /* End of switch on next parse token */
15212 
15213         RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
15214     } /* End of loop parsing through the construct */
15215 
15216   done:
15217     if (av_tindex_nomg(fence_stack) >= 0) {
15218         vFAIL("Unmatched (");
15219     }
15220 
15221     if (av_tindex_nomg(stack) < 0   /* Was empty */
15222         || ((final = av_pop(stack)) == NULL)
15223         || ! IS_OPERAND(final)
15224         || SvTYPE(final) != SVt_INVLIST
15225         || av_tindex_nomg(stack) >= 0)  /* More left on stack */
15226     {
15227       bad_syntax:
15228         SvREFCNT_dec(final);
15229         vFAIL("Incomplete expression within '(?[ ])'");
15230     }
15231 
15232     /* Here, 'final' is the resultant inversion list from evaluating the
15233      * expression.  Return it if so requested */
15234     if (return_invlist) {
15235         *return_invlist = final;
15236         return END;
15237     }
15238 
15239     /* Otherwise generate a resultant node, based on 'final'.  regclass() is
15240      * expecting a string of ranges and individual code points */
15241     invlist_iterinit(final);
15242     result_string = newSVpvs("");
15243     while (invlist_iternext(final, &start, &end)) {
15244         if (start == end) {
15245             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
15246         }
15247         else {
15248             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
15249                                                      start,          end);
15250         }
15251     }
15252 
15253     /* About to generate an ANYOF (or similar) node from the inversion list we
15254      * have calculated */
15255     save_parse = RExC_parse;
15256     RExC_parse = SvPV(result_string, len);
15257     save_end = RExC_end;
15258     RExC_end = RExC_parse + len;
15259 
15260     /* We turn off folding around the call, as the class we have constructed
15261      * already has all folding taken into consideration, and we don't want
15262      * regclass() to add to that */
15263     RExC_flags &= ~RXf_PMf_FOLD;
15264     /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if multi-char
15265      * folds are allowed.  */
15266     node = regclass(pRExC_state, flagp,depth+1,
15267                     FALSE, /* means parse the whole char class */
15268                     FALSE, /* don't allow multi-char folds */
15269                     TRUE, /* silence non-portable warnings.  The above may very
15270                              well have generated non-portable code points, but
15271                              they're valid on this machine */
15272                     FALSE, /* similarly, no need for strict */
15273                     FALSE, /* Require return to be an ANYOF */
15274                     NULL,
15275                     NULL
15276                 );
15277     if (!node)
15278         FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
15279                     PTR2UV(flagp));
15280 
15281     /* Fix up the node type if we are in locale.  (We have pretended we are
15282      * under /u for the purposes of regclass(), as this construct will only
15283      * work under UTF-8 locales.  But now we change the opcode to be ANYOFL (so
15284      * as to cause any warnings about bad locales to be output in regexec.c),
15285      * and add the flag that indicates to check if not in a UTF-8 locale.  The
15286      * reason we above forbid optimization into something other than an ANYOF
15287      * node is simply to minimize the number of code changes in regexec.c.
15288      * Otherwise we would have to create new EXACTish node types and deal with
15289      * them.  This decision could be revisited should this construct become
15290      * popular.
15291      *
15292      * (One might think we could look at the resulting ANYOF node and suppress
15293      * the flag if everything is above 255, as those would be UTF-8 only,
15294      * but this isn't true, as the components that led to that result could
15295      * have been locale-affected, and just happen to cancel each other out
15296      * under UTF-8 locales.) */
15297     if (in_locale) {
15298         set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
15299 
15300         assert(OP(node) == ANYOF);
15301 
15302         OP(node) = ANYOFL;
15303         ANYOF_FLAGS(node)
15304                 |= ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
15305     }
15306 
15307     if (save_fold) {
15308         RExC_flags |= RXf_PMf_FOLD;
15309     }
15310 
15311     RExC_parse = save_parse + 1;
15312     RExC_end = save_end;
15313     SvREFCNT_dec_NN(final);
15314     SvREFCNT_dec_NN(result_string);
15315 
15316     nextchar(pRExC_state);
15317     Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
15318     return node;
15319 }
15320 #undef IS_OPERATOR
15321 #undef IS_OPERAND
15322 
15323 STATIC void
15324 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
15325 {
15326     /* This hard-codes the Latin1/above-Latin1 folding rules, so that an
15327      * innocent-looking character class, like /[ks]/i won't have to go out to
15328      * disk to find the possible matches.
15329      *
15330      * This should be called only for a Latin1-range code points, cp, which is
15331      * known to be involved in a simple fold with other code points above
15332      * Latin1.  It would give false results if /aa has been specified.
15333      * Multi-char folds are outside the scope of this, and must be handled
15334      * specially.
15335      *
15336      * XXX It would be better to generate these via regen, in case a new
15337      * version of the Unicode standard adds new mappings, though that is not
15338      * really likely, and may be caught by the default: case of the switch
15339      * below. */
15340 
15341     PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
15342 
15343     assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
15344 
15345     switch (cp) {
15346         case 'k':
15347         case 'K':
15348           *invlist =
15349              add_cp_to_invlist(*invlist, KELVIN_SIGN);
15350             break;
15351         case 's':
15352         case 'S':
15353           *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
15354             break;
15355         case MICRO_SIGN:
15356           *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
15357           *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
15358             break;
15359         case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
15360         case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
15361           *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
15362             break;
15363         case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
15364           *invlist = add_cp_to_invlist(*invlist,
15365                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
15366             break;
15367 
15368 #ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */
15369 
15370         case LATIN_SMALL_LETTER_SHARP_S:
15371           *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S);
15372             break;
15373 
15374 #endif
15375 
15376 #if    UNICODE_MAJOR_VERSION < 3                                        \
15377    || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0)
15378 
15379         /* In 3.0 and earlier, U+0130 folded simply to 'i'; and in 3.0.1 so did
15380          * U+0131.  */
15381         case 'i':
15382         case 'I':
15383           *invlist =
15384              add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
15385 #   if UNICODE_DOT_DOT_VERSION == 1
15386           *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_DOTLESS_I);
15387 #   endif
15388             break;
15389 #endif
15390 
15391         default:
15392             /* Use deprecated warning to increase the chances of this being
15393              * output */
15394             if (PASS2) {
15395                 ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp);
15396             }
15397             break;
15398     }
15399 }
15400 
15401 STATIC void
15402 S_output_or_return_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings, AV** return_posix_warnings)
15403 {
15404     /* If the final parameter is NULL, output the elements of the array given
15405      * by '*posix_warnings' as REGEXP warnings.  Otherwise, the elements are
15406      * pushed onto it, (creating if necessary) */
15407 
15408     SV * msg;
15409     const bool first_is_fatal =  ! return_posix_warnings
15410                                 && ckDEAD(packWARN(WARN_REGEXP));
15411 
15412     PERL_ARGS_ASSERT_OUTPUT_OR_RETURN_POSIX_WARNINGS;
15413 
15414     while ((msg = av_shift(posix_warnings)) != &PL_sv_undef) {
15415         if (return_posix_warnings) {
15416             if (! *return_posix_warnings) { /* mortalize to not leak if
15417                                                warnings are fatal */
15418                 *return_posix_warnings = (AV *) sv_2mortal((SV *) newAV());
15419             }
15420             av_push(*return_posix_warnings, msg);
15421         }
15422         else {
15423             if (first_is_fatal) {           /* Avoid leaking this */
15424                 av_undef(posix_warnings);   /* This isn't necessary if the
15425                                                array is mortal, but is a
15426                                                fail-safe */
15427                 (void) sv_2mortal(msg);
15428                 if (PASS2) {
15429                     SAVEFREESV(RExC_rx_sv);
15430                 }
15431             }
15432             Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s", SvPVX(msg));
15433             SvREFCNT_dec_NN(msg);
15434         }
15435     }
15436 }
15437 
15438 STATIC AV *
15439 S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
15440 {
15441     /* This adds the string scalar <multi_string> to the array
15442      * <multi_char_matches>.  <multi_string> is known to have exactly
15443      * <cp_count> code points in it.  This is used when constructing a
15444      * bracketed character class and we find something that needs to match more
15445      * than a single character.
15446      *
15447      * <multi_char_matches> is actually an array of arrays.  Each top-level
15448      * element is an array that contains all the strings known so far that are
15449      * the same length.  And that length (in number of code points) is the same
15450      * as the index of the top-level array.  Hence, the [2] element is an
15451      * array, each element thereof is a string containing TWO code points;
15452      * while element [3] is for strings of THREE characters, and so on.  Since
15453      * this is for multi-char strings there can never be a [0] nor [1] element.
15454      *
15455      * When we rewrite the character class below, we will do so such that the
15456      * longest strings are written first, so that it prefers the longest
15457      * matching strings first.  This is done even if it turns out that any
15458      * quantifier is non-greedy, out of this programmer's (khw) laziness.  Tom
15459      * Christiansen has agreed that this is ok.  This makes the test for the
15460      * ligature 'ffi' come before the test for 'ff', for example */
15461 
15462     AV* this_array;
15463     AV** this_array_ptr;
15464 
15465     PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
15466 
15467     if (! multi_char_matches) {
15468         multi_char_matches = newAV();
15469     }
15470 
15471     if (av_exists(multi_char_matches, cp_count)) {
15472         this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE);
15473         this_array = *this_array_ptr;
15474     }
15475     else {
15476         this_array = newAV();
15477         av_store(multi_char_matches, cp_count,
15478                  (SV*) this_array);
15479     }
15480     av_push(this_array, multi_string);
15481 
15482     return multi_char_matches;
15483 }
15484 
15485 /* The names of properties whose definitions are not known at compile time are
15486  * stored in this SV, after a constant heading.  So if the length has been
15487  * changed since initialization, then there is a run-time definition. */
15488 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION                            \
15489                                         (SvCUR(listsv) != initial_listsv_len)
15490 
15491 /* There is a restricted set of white space characters that are legal when
15492  * ignoring white space in a bracketed character class.  This generates the
15493  * code to skip them.
15494  *
15495  * There is a line below that uses the same white space criteria but is outside
15496  * this macro.  Both here and there must use the same definition */
15497 #define SKIP_BRACKETED_WHITE_SPACE(do_skip, p)                          \
15498     STMT_START {                                                        \
15499         if (do_skip) {                                                  \
15500             while (isBLANK_A(UCHARAT(p)))                               \
15501             {                                                           \
15502                 p++;                                                    \
15503             }                                                           \
15504         }                                                               \
15505     } STMT_END
15506 
15507 STATIC regnode *
15508 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
15509                  const bool stop_at_1,  /* Just parse the next thing, don't
15510                                            look for a full character class */
15511                  bool allow_multi_folds,
15512                  const bool silence_non_portable,   /* Don't output warnings
15513                                                        about too large
15514                                                        characters */
15515                  const bool strict,
15516                  bool optimizable,                  /* ? Allow a non-ANYOF return
15517                                                        node */
15518                  SV** ret_invlist, /* Return an inversion list, not a node */
15519                  AV** return_posix_warnings
15520           )
15521 {
15522     /* parse a bracketed class specification.  Most of these will produce an
15523      * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
15524      * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
15525      * under /i with multi-character folds: it will be rewritten following the
15526      * paradigm of this example, where the <multi-fold>s are characters which
15527      * fold to multiple character sequences:
15528      *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
15529      * gets effectively rewritten as:
15530      *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
15531      * reg() gets called (recursively) on the rewritten version, and this
15532      * function will return what it constructs.  (Actually the <multi-fold>s
15533      * aren't physically removed from the [abcdefghi], it's just that they are
15534      * ignored in the recursion by means of a flag:
15535      * <RExC_in_multi_char_class>.)
15536      *
15537      * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
15538      * characters, with the corresponding bit set if that character is in the
15539      * list.  For characters above this, a range list or swash is used.  There
15540      * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
15541      * determinable at compile time
15542      *
15543      * Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs
15544      * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded
15545      * to UTF-8.  This can only happen if ret_invlist is non-NULL.
15546      */
15547 
15548     UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
15549     IV range = 0;
15550     UV value = OOB_UNICODE, save_value = OOB_UNICODE;
15551     regnode *ret;
15552     STRLEN numlen;
15553     int namedclass = OOB_NAMEDCLASS;
15554     char *rangebegin = NULL;
15555     bool need_class = 0;
15556     SV *listsv = NULL;
15557     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
15558 				      than just initialized.  */
15559     SV* properties = NULL;    /* Code points that match \p{} \P{} */
15560     SV* posixes = NULL;     /* Code points that match classes like [:word:],
15561                                extended beyond the Latin1 range.  These have to
15562                                be kept separate from other code points for much
15563                                of this function because their handling  is
15564                                different under /i, and for most classes under
15565                                /d as well */
15566     SV* nposixes = NULL;    /* Similarly for [:^word:].  These are kept
15567                                separate for a while from the non-complemented
15568                                versions because of complications with /d
15569                                matching */
15570     SV* simple_posixes = NULL; /* But under some conditions, the classes can be
15571                                   treated more simply than the general case,
15572                                   leading to less compilation and execution
15573                                   work */
15574     UV element_count = 0;   /* Number of distinct elements in the class.
15575 			       Optimizations may be possible if this is tiny */
15576     AV * multi_char_matches = NULL; /* Code points that fold to more than one
15577                                        character; used under /i */
15578     UV n;
15579     char * stop_ptr = RExC_end;    /* where to stop parsing */
15580     const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
15581                                                    space? */
15582 
15583     /* Unicode properties are stored in a swash; this holds the current one
15584      * being parsed.  If this swash is the only above-latin1 component of the
15585      * character class, an optimization is to pass it directly on to the
15586      * execution engine.  Otherwise, it is set to NULL to indicate that there
15587      * are other things in the class that have to be dealt with at execution
15588      * time */
15589     SV* swash = NULL;		/* Code points that match \p{} \P{} */
15590 
15591     /* Set if a component of this character class is user-defined; just passed
15592      * on to the engine */
15593     bool has_user_defined_property = FALSE;
15594 
15595     /* inversion list of code points this node matches only when the target
15596      * string is in UTF-8.  These are all non-ASCII, < 256.  (Because is under
15597      * /d) */
15598     SV* has_upper_latin1_only_utf8_matches = NULL;
15599 
15600     /* Inversion list of code points this node matches regardless of things
15601      * like locale, folding, utf8ness of the target string */
15602     SV* cp_list = NULL;
15603 
15604     /* Like cp_list, but code points on this list need to be checked for things
15605      * that fold to/from them under /i */
15606     SV* cp_foldable_list = NULL;
15607 
15608     /* Like cp_list, but code points on this list are valid only when the
15609      * runtime locale is UTF-8 */
15610     SV* only_utf8_locale_list = NULL;
15611 
15612     /* In a range, if one of the endpoints is non-character-set portable,
15613      * meaning that it hard-codes a code point that may mean a different
15614      * charactger in ASCII vs. EBCDIC, as opposed to, say, a literal 'A' or a
15615      * mnemonic '\t' which each mean the same character no matter which
15616      * character set the platform is on. */
15617     unsigned int non_portable_endpoint = 0;
15618 
15619     /* Is the range unicode? which means on a platform that isn't 1-1 native
15620      * to Unicode (i.e. non-ASCII), each code point in it should be considered
15621      * to be a Unicode value.  */
15622     bool unicode_range = FALSE;
15623     bool invert = FALSE;    /* Is this class to be complemented */
15624 
15625     bool warn_super = ALWAYS_WARN_SUPER;
15626 
15627     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
15628         case we need to change the emitted regop to an EXACT. */
15629     const char * orig_parse = RExC_parse;
15630     const SSize_t orig_size = RExC_size;
15631     bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
15632 
15633     /* This variable is used to mark where the end in the input is of something
15634      * that looks like a POSIX construct but isn't.  During the parse, when
15635      * something looks like it could be such a construct is encountered, it is
15636      * checked for being one, but not if we've already checked this area of the
15637      * input.  Only after this position is reached do we check again */
15638     char *not_posix_region_end = RExC_parse - 1;
15639 
15640     AV* posix_warnings = NULL;
15641     const bool do_posix_warnings =     return_posix_warnings
15642                                    || (PASS2 && ckWARN(WARN_REGEXP));
15643 
15644     GET_RE_DEBUG_FLAGS_DECL;
15645 
15646     PERL_ARGS_ASSERT_REGCLASS;
15647 #ifndef DEBUGGING
15648     PERL_UNUSED_ARG(depth);
15649 #endif
15650 
15651     DEBUG_PARSE("clas");
15652 
15653 #if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */      \
15654     || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0          \
15655                                    && UNICODE_DOT_DOT_VERSION == 0)
15656     allow_multi_folds = FALSE;
15657 #endif
15658 
15659     /* Assume we are going to generate an ANYOF node. */
15660     ret = reganode(pRExC_state,
15661                    (LOC)
15662                     ? ANYOFL
15663                     : ANYOF,
15664                    0);
15665 
15666     if (SIZE_ONLY) {
15667 	RExC_size += ANYOF_SKIP;
15668 	listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
15669     }
15670     else {
15671         ANYOF_FLAGS(ret) = 0;
15672 
15673  	RExC_emit += ANYOF_SKIP;
15674 	listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
15675 	initial_listsv_len = SvCUR(listsv);
15676         SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated.  */
15677     }
15678 
15679     SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
15680 
15681     assert(RExC_parse <= RExC_end);
15682 
15683     if (UCHARAT(RExC_parse) == '^') {	/* Complement the class */
15684 	RExC_parse++;
15685         invert = TRUE;
15686         allow_multi_folds = FALSE;
15687         MARK_NAUGHTY(1);
15688         SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
15689     }
15690 
15691     /* Check that they didn't say [:posix:] instead of [[:posix:]] */
15692     if (! ret_invlist && MAYBE_POSIXCC(UCHARAT(RExC_parse))) {
15693         int maybe_class = handle_possible_posix(pRExC_state,
15694                                                 RExC_parse,
15695                                                 &not_posix_region_end,
15696                                                 NULL,
15697                                                 TRUE /* checking only */);
15698         if (PASS2 && maybe_class >= OOB_NAMEDCLASS && do_posix_warnings) {
15699             SAVEFREESV(RExC_rx_sv);
15700             ckWARN4reg(not_posix_region_end,
15701                     "POSIX syntax [%c %c] belongs inside character classes%s",
15702                     *RExC_parse, *RExC_parse,
15703                     (maybe_class == OOB_NAMEDCLASS)
15704                     ? ((POSIXCC_NOTYET(*RExC_parse))
15705                         ? " (but this one isn't implemented)"
15706                         : " (but this one isn't fully valid)")
15707                     : ""
15708                     );
15709             (void)ReREFCNT_inc(RExC_rx_sv);
15710         }
15711     }
15712 
15713     /* If the caller wants us to just parse a single element, accomplish this
15714      * by faking the loop ending condition */
15715     if (stop_at_1 && RExC_end > RExC_parse) {
15716         stop_ptr = RExC_parse + 1;
15717     }
15718 
15719     /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
15720     if (UCHARAT(RExC_parse) == ']')
15721 	goto charclassloop;
15722 
15723     while (1) {
15724 
15725         if (   posix_warnings
15726             && av_tindex_nomg(posix_warnings) >= 0
15727             && RExC_parse > not_posix_region_end)
15728         {
15729             /* Warnings about posix class issues are considered tentative until
15730              * we are far enough along in the parse that we can no longer
15731              * change our mind, at which point we either output them or add
15732              * them, if it has so specified, to what gets returned to the
15733              * caller.  This is done each time through the loop so that a later
15734              * class won't zap them before they have been dealt with. */
15735             output_or_return_posix_warnings(pRExC_state, posix_warnings,
15736                                             return_posix_warnings);
15737         }
15738 
15739         if  (RExC_parse >= stop_ptr) {
15740             break;
15741         }
15742 
15743         SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
15744 
15745         if  (UCHARAT(RExC_parse) == ']') {
15746             break;
15747         }
15748 
15749       charclassloop:
15750 
15751 	namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
15752         save_value = value;
15753         save_prevvalue = prevvalue;
15754 
15755 	if (!range) {
15756 	    rangebegin = RExC_parse;
15757 	    element_count++;
15758             non_portable_endpoint = 0;
15759 	}
15760 	if (UTF && ! UTF8_IS_INVARIANT(* RExC_parse)) {
15761 	    value = utf8n_to_uvchr((U8*)RExC_parse,
15762 				   RExC_end - RExC_parse,
15763 				   &numlen, UTF8_ALLOW_DEFAULT);
15764 	    RExC_parse += numlen;
15765 	}
15766 	else
15767 	    value = UCHARAT(RExC_parse++);
15768 
15769         if (value == '[') {
15770             char * posix_class_end;
15771             namedclass = handle_possible_posix(pRExC_state,
15772                                                RExC_parse,
15773                                                &posix_class_end,
15774                                                do_posix_warnings ? &posix_warnings : NULL,
15775                                                FALSE    /* die if error */);
15776             if (namedclass > OOB_NAMEDCLASS) {
15777 
15778                 /* If there was an earlier attempt to parse this particular
15779                  * posix class, and it failed, it was a false alarm, as this
15780                  * successful one proves */
15781                 if (   posix_warnings
15782                     && av_tindex_nomg(posix_warnings) >= 0
15783                     && not_posix_region_end >= RExC_parse
15784                     && not_posix_region_end <= posix_class_end)
15785                 {
15786                     av_undef(posix_warnings);
15787                 }
15788 
15789                 RExC_parse = posix_class_end;
15790             }
15791             else if (namedclass == OOB_NAMEDCLASS) {
15792                 not_posix_region_end = posix_class_end;
15793             }
15794             else {
15795                 namedclass = OOB_NAMEDCLASS;
15796             }
15797         }
15798         else if (   RExC_parse - 1 > not_posix_region_end
15799                  && MAYBE_POSIXCC(value))
15800         {
15801             (void) handle_possible_posix(
15802                         pRExC_state,
15803                         RExC_parse - 1,  /* -1 because parse has already been
15804                                             advanced */
15805                         &not_posix_region_end,
15806                         do_posix_warnings ? &posix_warnings : NULL,
15807                         TRUE /* checking only */);
15808         }
15809         else if (value == '\\') {
15810             /* Is a backslash; get the code point of the char after it */
15811 
15812             if (RExC_parse >= RExC_end) {
15813                 vFAIL("Unmatched [");
15814             }
15815 
15816 	    if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
15817 		value = utf8n_to_uvchr((U8*)RExC_parse,
15818 				   RExC_end - RExC_parse,
15819 				   &numlen, UTF8_ALLOW_DEFAULT);
15820 		RExC_parse += numlen;
15821 	    }
15822 	    else
15823 		value = UCHARAT(RExC_parse++);
15824 
15825 	    /* Some compilers cannot handle switching on 64-bit integer
15826 	     * values, therefore value cannot be an UV.  Yes, this will
15827 	     * be a problem later if we want switch on Unicode.
15828 	     * A similar issue a little bit later when switching on
15829 	     * namedclass. --jhi */
15830 
15831             /* If the \ is escaping white space when white space is being
15832              * skipped, it means that that white space is wanted literally, and
15833              * is already in 'value'.  Otherwise, need to translate the escape
15834              * into what it signifies. */
15835             if (! skip_white || ! isBLANK_A(value)) switch ((I32)value) {
15836 
15837 	    case 'w':	namedclass = ANYOF_WORDCHAR;	break;
15838 	    case 'W':	namedclass = ANYOF_NWORDCHAR;	break;
15839 	    case 's':	namedclass = ANYOF_SPACE;	break;
15840 	    case 'S':	namedclass = ANYOF_NSPACE;	break;
15841 	    case 'd':	namedclass = ANYOF_DIGIT;	break;
15842 	    case 'D':	namedclass = ANYOF_NDIGIT;	break;
15843 	    case 'v':	namedclass = ANYOF_VERTWS;	break;
15844 	    case 'V':	namedclass = ANYOF_NVERTWS;	break;
15845 	    case 'h':	namedclass = ANYOF_HORIZWS;	break;
15846 	    case 'H':	namedclass = ANYOF_NHORIZWS;	break;
15847             case 'N':  /* Handle \N{NAME} in class */
15848                 {
15849                     const char * const backslash_N_beg = RExC_parse - 2;
15850                     int cp_count;
15851 
15852                     if (! grok_bslash_N(pRExC_state,
15853                                         NULL,      /* No regnode */
15854                                         &value,    /* Yes single value */
15855                                         &cp_count, /* Multiple code pt count */
15856                                         flagp,
15857                                         strict,
15858                                         depth)
15859                     ) {
15860 
15861                         if (*flagp & NEED_UTF8)
15862                             FAIL("panic: grok_bslash_N set NEED_UTF8");
15863                         if (*flagp & RESTART_PASS1)
15864                             return NULL;
15865 
15866                         if (cp_count < 0) {
15867                             vFAIL("\\N in a character class must be a named character: \\N{...}");
15868                         }
15869                         else if (cp_count == 0) {
15870                             if (PASS2) {
15871                                 ckWARNreg(RExC_parse,
15872                                         "Ignoring zero length \\N{} in character class");
15873                             }
15874                         }
15875                         else { /* cp_count > 1 */
15876                             if (! RExC_in_multi_char_class) {
15877                                 if (invert || range || *RExC_parse == '-') {
15878                                     if (strict) {
15879                                         RExC_parse--;
15880                                         vFAIL("\\N{} in inverted character class or as a range end-point is restricted to one character");
15881                                     }
15882                                     else if (PASS2) {
15883                                         ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
15884                                     }
15885                                     break; /* <value> contains the first code
15886                                               point. Drop out of the switch to
15887                                               process it */
15888                                 }
15889                                 else {
15890                                     SV * multi_char_N = newSVpvn(backslash_N_beg,
15891                                                  RExC_parse - backslash_N_beg);
15892                                     multi_char_matches
15893                                         = add_multi_match(multi_char_matches,
15894                                                           multi_char_N,
15895                                                           cp_count);
15896                                 }
15897                             }
15898                         } /* End of cp_count != 1 */
15899 
15900                         /* This element should not be processed further in this
15901                          * class */
15902                         element_count--;
15903                         value = save_value;
15904                         prevvalue = save_prevvalue;
15905                         continue;   /* Back to top of loop to get next char */
15906                     }
15907 
15908                     /* Here, is a single code point, and <value> contains it */
15909                     unicode_range = TRUE;   /* \N{} are Unicode */
15910                 }
15911                 break;
15912 	    case 'p':
15913 	    case 'P':
15914 		{
15915 		char *e;
15916 
15917                 /* We will handle any undefined properties ourselves */
15918                 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
15919                                        /* And we actually would prefer to get
15920                                         * the straight inversion list of the
15921                                         * swash, since we will be accessing it
15922                                         * anyway, to save a little time */
15923                                       |_CORE_SWASH_INIT_ACCEPT_INVLIST;
15924 
15925 		if (RExC_parse >= RExC_end)
15926 		    vFAIL2("Empty \\%c", (U8)value);
15927 		if (*RExC_parse == '{') {
15928 		    const U8 c = (U8)value;
15929 		    e = strchr(RExC_parse, '}');
15930                     if (!e) {
15931                         RExC_parse++;
15932                         vFAIL2("Missing right brace on \\%c{}", c);
15933                     }
15934 
15935                     RExC_parse++;
15936                     while (isSPACE(*RExC_parse)) {
15937                          RExC_parse++;
15938 		    }
15939 
15940 		    if (UCHARAT(RExC_parse) == '^') {
15941 
15942                         /* toggle.  (The rhs xor gets the single bit that
15943                          * differs between P and p; the other xor inverts just
15944                          * that bit) */
15945                         value ^= 'P' ^ 'p';
15946 
15947                         RExC_parse++;
15948                         while (isSPACE(*RExC_parse)) {
15949                             RExC_parse++;
15950                         }
15951                     }
15952 
15953                     if (e == RExC_parse)
15954                         vFAIL2("Empty \\%c{}", c);
15955 
15956 		    n = e - RExC_parse;
15957 		    while (isSPACE(*(RExC_parse + n - 1)))
15958 		        n--;
15959 		}   /* The \p isn't immediately followed by a '{' */
15960 		else if (! isALPHA(*RExC_parse)) {
15961                     RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
15962                     vFAIL2("Character following \\%c must be '{' or a "
15963                            "single-character Unicode property name",
15964                            (U8) value);
15965                 }
15966                 else {
15967 		    e = RExC_parse;
15968 		    n = 1;
15969 		}
15970 		if (!SIZE_ONLY) {
15971                     SV* invlist;
15972                     char* name;
15973                     char* base_name;    /* name after any packages are stripped */
15974                     char* lookup_name = NULL;
15975                     const char * const colon_colon = "::";
15976 
15977                     /* Try to get the definition of the property into
15978                      * <invlist>.  If /i is in effect, the effective property
15979                      * will have its name be <__NAME_i>.  The design is
15980                      * discussed in commit
15981                      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
15982                     name = savepv(Perl_form(aTHX_ "%.*s", (int)n, RExC_parse));
15983                     SAVEFREEPV(name);
15984                     if (FOLD) {
15985                         lookup_name = savepv(Perl_form(aTHX_ "__%s_i", name));
15986 
15987                         /* The function call just below that uses this can fail
15988                          * to return, leaking memory if we don't do this */
15989                         SAVEFREEPV(lookup_name);
15990                     }
15991 
15992                     /* Look up the property name, and get its swash and
15993                      * inversion list, if the property is found  */
15994                     SvREFCNT_dec(swash); /* Free any left-overs */
15995                     swash = _core_swash_init("utf8",
15996                                              (lookup_name)
15997                                               ? lookup_name
15998                                               : name,
15999                                              &PL_sv_undef,
16000                                              1, /* binary */
16001                                              0, /* not tr/// */
16002                                              NULL, /* No inversion list */
16003                                              &swash_init_flags
16004                                             );
16005                     if (! swash || ! (invlist = _get_swash_invlist(swash))) {
16006                         HV* curpkg = (IN_PERL_COMPILETIME)
16007                                       ? PL_curstash
16008                                       : CopSTASH(PL_curcop);
16009                         UV final_n = n;
16010                         bool has_pkg;
16011 
16012                         if (swash) {    /* Got a swash but no inversion list.
16013                                            Something is likely wrong that will
16014                                            be sorted-out later */
16015                             SvREFCNT_dec_NN(swash);
16016                             swash = NULL;
16017                         }
16018 
16019                         /* Here didn't find it.  It could be a an error (like a
16020                          * typo) in specifying a Unicode property, or it could
16021                          * be a user-defined property that will be available at
16022                          * run-time.  The names of these must begin with 'In'
16023                          * or 'Is' (after any packages are stripped off).  So
16024                          * if not one of those, or if we accept only
16025                          * compile-time properties, is an error; otherwise add
16026                          * it to the list for run-time look up. */
16027                         if ((base_name = rninstr(name, name + n,
16028                                                  colon_colon, colon_colon + 2)))
16029                         { /* Has ::.  We know this must be a user-defined
16030                              property */
16031                             base_name += 2;
16032                             final_n -= base_name - name;
16033                             has_pkg = TRUE;
16034                         }
16035                         else {
16036                             base_name = name;
16037                             has_pkg = FALSE;
16038                         }
16039 
16040                         if (   final_n < 3
16041                             || base_name[0] != 'I'
16042                             || (base_name[1] != 's' && base_name[1] != 'n')
16043                             || ret_invlist)
16044                         {
16045                             const char * const msg
16046                                 = (has_pkg)
16047                                   ? "Illegal user-defined property name"
16048                                   : "Can't find Unicode property definition";
16049                             RExC_parse = e + 1;
16050 
16051                             /* diag_listed_as: Can't find Unicode property definition "%s" */
16052                             vFAIL3utf8f("%s \"%"UTF8f"\"",
16053                                 msg, UTF8fARG(UTF, n, name));
16054                         }
16055 
16056                         /* If the property name doesn't already have a package
16057                          * name, add the current one to it so that it can be
16058                          * referred to outside it. [perl #121777] */
16059                         if (! has_pkg && curpkg) {
16060                             char* pkgname = HvNAME(curpkg);
16061                             if (strNE(pkgname, "main")) {
16062                                 char* full_name = Perl_form(aTHX_
16063                                                             "%s::%s",
16064                                                             pkgname,
16065                                                             name);
16066                                 n = strlen(full_name);
16067                                 name = savepvn(full_name, n);
16068                                 SAVEFREEPV(name);
16069                             }
16070                         }
16071                         Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s%"UTF8f"%s\n",
16072                                         (value == 'p' ? '+' : '!'),
16073                                         (FOLD) ? "__" : "",
16074                                         UTF8fARG(UTF, n, name),
16075                                         (FOLD) ? "_i" : "");
16076                         has_user_defined_property = TRUE;
16077                         optimizable = FALSE;    /* Will have to leave this an
16078                                                    ANYOF node */
16079 
16080                         /* We don't know yet what this matches, so have to flag
16081                          * it */
16082                         ANYOF_FLAGS(ret) |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
16083                     }
16084                     else {
16085 
16086                         /* Here, did get the swash and its inversion list.  If
16087                          * the swash is from a user-defined property, then this
16088                          * whole character class should be regarded as such */
16089                         if (swash_init_flags
16090                             & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)
16091                         {
16092                             has_user_defined_property = TRUE;
16093                         }
16094                         else if
16095                             /* We warn on matching an above-Unicode code point
16096                              * if the match would return true, except don't
16097                              * warn for \p{All}, which has exactly one element
16098                              * = 0 */
16099                             (_invlist_contains_cp(invlist, 0x110000)
16100                                 && (! (_invlist_len(invlist) == 1
16101                                        && *invlist_array(invlist) == 0)))
16102                         {
16103                             warn_super = TRUE;
16104                         }
16105 
16106 
16107                         /* Invert if asking for the complement */
16108                         if (value == 'P') {
16109 			    _invlist_union_complement_2nd(properties,
16110                                                           invlist,
16111                                                           &properties);
16112 
16113                             /* The swash can't be used as-is, because we've
16114 			     * inverted things; delay removing it to here after
16115 			     * have copied its invlist above */
16116                             SvREFCNT_dec_NN(swash);
16117                             swash = NULL;
16118                         }
16119                         else {
16120                             _invlist_union(properties, invlist, &properties);
16121 			}
16122 		    }
16123 		}
16124 		RExC_parse = e + 1;
16125                 namedclass = ANYOF_UNIPROP;  /* no official name, but it's
16126                                                 named */
16127 
16128 		/* \p means they want Unicode semantics */
16129 		REQUIRE_UNI_RULES(flagp, NULL);
16130 		}
16131 		break;
16132 	    case 'n':	value = '\n';			break;
16133 	    case 'r':	value = '\r';			break;
16134 	    case 't':	value = '\t';			break;
16135 	    case 'f':	value = '\f';			break;
16136 	    case 'b':	value = '\b';			break;
16137 	    case 'e':	value = ESC_NATIVE;             break;
16138 	    case 'a':	value = '\a';                   break;
16139 	    case 'o':
16140 		RExC_parse--;	/* function expects to be pointed at the 'o' */
16141 		{
16142 		    const char* error_msg;
16143 		    bool valid = grok_bslash_o(&RExC_parse,
16144 					       &value,
16145 					       &error_msg,
16146                                                PASS2,   /* warnings only in
16147                                                            pass 2 */
16148                                                strict,
16149                                                silence_non_portable,
16150                                                UTF);
16151 		    if (! valid) {
16152 			vFAIL(error_msg);
16153 		    }
16154 		}
16155                 non_portable_endpoint++;
16156 		if (IN_ENCODING && value < 0x100) {
16157 		    goto recode_encoding;
16158 		}
16159 		break;
16160 	    case 'x':
16161 		RExC_parse--;	/* function expects to be pointed at the 'x' */
16162 		{
16163 		    const char* error_msg;
16164 		    bool valid = grok_bslash_x(&RExC_parse,
16165 					       &value,
16166 					       &error_msg,
16167 					       PASS2, /* Output warnings */
16168                                                strict,
16169                                                silence_non_portable,
16170                                                UTF);
16171                     if (! valid) {
16172 			vFAIL(error_msg);
16173 		    }
16174 		}
16175                 non_portable_endpoint++;
16176 		if (IN_ENCODING && value < 0x100)
16177 		    goto recode_encoding;
16178 		break;
16179 	    case 'c':
16180 		value = grok_bslash_c(*RExC_parse++, PASS2);
16181                 non_portable_endpoint++;
16182 		break;
16183 	    case '0': case '1': case '2': case '3': case '4':
16184 	    case '5': case '6': case '7':
16185 		{
16186 		    /* Take 1-3 octal digits */
16187 		    I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
16188                     numlen = (strict) ? 4 : 3;
16189                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
16190 		    RExC_parse += numlen;
16191                     if (numlen != 3) {
16192                         if (strict) {
16193                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
16194                             vFAIL("Need exactly 3 octal digits");
16195                         }
16196                         else if (! SIZE_ONLY /* like \08, \178 */
16197                                  && numlen < 3
16198                                  && RExC_parse < RExC_end
16199                                  && isDIGIT(*RExC_parse)
16200                                  && ckWARN(WARN_REGEXP))
16201                         {
16202                             SAVEFREESV(RExC_rx_sv);
16203                             reg_warn_non_literal_string(
16204                                  RExC_parse + 1,
16205                                  form_short_octal_warning(RExC_parse, numlen));
16206                             (void)ReREFCNT_inc(RExC_rx_sv);
16207                         }
16208                     }
16209                     non_portable_endpoint++;
16210 		    if (IN_ENCODING && value < 0x100)
16211 			goto recode_encoding;
16212 		    break;
16213 		}
16214 	      recode_encoding:
16215 		if (! RExC_override_recoding) {
16216 		    SV* enc = _get_encoding();
16217 		    value = reg_recode((U8)value, &enc);
16218 		    if (!enc) {
16219                         if (strict) {
16220                             vFAIL("Invalid escape in the specified encoding");
16221                         }
16222                         else if (PASS2) {
16223                             ckWARNreg(RExC_parse,
16224 				  "Invalid escape in the specified encoding");
16225                         }
16226                     }
16227 		    break;
16228 		}
16229 	    default:
16230 		/* Allow \_ to not give an error */
16231 		if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
16232                     if (strict) {
16233                         vFAIL2("Unrecognized escape \\%c in character class",
16234                                (int)value);
16235                     }
16236                     else {
16237                         SAVEFREESV(RExC_rx_sv);
16238                         ckWARN2reg(RExC_parse,
16239                             "Unrecognized escape \\%c in character class passed through",
16240                             (int)value);
16241                         (void)ReREFCNT_inc(RExC_rx_sv);
16242                     }
16243 		}
16244 		break;
16245 	    }   /* End of switch on char following backslash */
16246 	} /* end of handling backslash escape sequences */
16247 
16248         /* Here, we have the current token in 'value' */
16249 
16250 	if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
16251             U8 classnum;
16252 
16253 	    /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
16254 	     * literal, as is the character that began the false range, i.e.
16255 	     * the 'a' in the examples */
16256 	    if (range) {
16257 		if (!SIZE_ONLY) {
16258 		    const int w = (RExC_parse >= rangebegin)
16259                                   ? RExC_parse - rangebegin
16260                                   : 0;
16261                     if (strict) {
16262                         vFAIL2utf8f(
16263                             "False [] range \"%"UTF8f"\"",
16264                             UTF8fARG(UTF, w, rangebegin));
16265                     }
16266                     else {
16267                         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
16268                         ckWARN2reg(RExC_parse,
16269                             "False [] range \"%"UTF8f"\"",
16270                             UTF8fARG(UTF, w, rangebegin));
16271                         (void)ReREFCNT_inc(RExC_rx_sv);
16272                         cp_list = add_cp_to_invlist(cp_list, '-');
16273                         cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
16274                                                              prevvalue);
16275                     }
16276 		}
16277 
16278 		range = 0; /* this was not a true range */
16279                 element_count += 2; /* So counts for three values */
16280 	    }
16281 
16282             classnum = namedclass_to_classnum(namedclass);
16283 
16284 	    if (LOC && namedclass < ANYOF_POSIXL_MAX
16285 #ifndef HAS_ISASCII
16286                 && classnum != _CC_ASCII
16287 #endif
16288             ) {
16289                 /* What the Posix classes (like \w, [:space:]) match in locale
16290                  * isn't knowable under locale until actual match time.  Room
16291                  * must be reserved (one time per outer bracketed class) to
16292                  * store such classes.  The space will contain a bit for each
16293                  * named class that is to be matched against.  This isn't
16294                  * needed for \p{} and pseudo-classes, as they are not affected
16295                  * by locale, and hence are dealt with separately */
16296                 if (! need_class) {
16297                     need_class = 1;
16298                     if (SIZE_ONLY) {
16299                         RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
16300                     }
16301                     else {
16302                         RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
16303                     }
16304                     ANYOF_FLAGS(ret) |= ANYOF_MATCHES_POSIXL;
16305                     ANYOF_POSIXL_ZERO(ret);
16306 
16307                     /* We can't change this into some other type of node
16308                      * (unless this is the only element, in which case there
16309                      * are nodes that mean exactly this) as has runtime
16310                      * dependencies */
16311                     optimizable = FALSE;
16312                 }
16313 
16314                 /* Coverity thinks it is possible for this to be negative; both
16315                  * jhi and khw think it's not, but be safer */
16316                 assert(! (ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
16317                        || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0);
16318 
16319                 /* See if it already matches the complement of this POSIX
16320                  * class */
16321                 if ((ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
16322                     && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
16323                                                             ? -1
16324                                                             : 1)))
16325                 {
16326                     posixl_matches_all = TRUE;
16327                     break;  /* No need to continue.  Since it matches both
16328                                e.g., \w and \W, it matches everything, and the
16329                                bracketed class can be optimized into qr/./s */
16330                 }
16331 
16332                 /* Add this class to those that should be checked at runtime */
16333                 ANYOF_POSIXL_SET(ret, namedclass);
16334 
16335                 /* The above-Latin1 characters are not subject to locale rules.
16336                  * Just add them, in the second pass, to the
16337                  * unconditionally-matched list */
16338                 if (! SIZE_ONLY) {
16339                     SV* scratch_list = NULL;
16340 
16341                     /* Get the list of the above-Latin1 code points this
16342                      * matches */
16343                     _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
16344                                           PL_XPosix_ptrs[classnum],
16345 
16346                                           /* Odd numbers are complements, like
16347                                            * NDIGIT, NASCII, ... */
16348                                           namedclass % 2 != 0,
16349                                           &scratch_list);
16350                     /* Checking if 'cp_list' is NULL first saves an extra
16351                      * clone.  Its reference count will be decremented at the
16352                      * next union, etc, or if this is the only instance, at the
16353                      * end of the routine */
16354                     if (! cp_list) {
16355                         cp_list = scratch_list;
16356                     }
16357                     else {
16358                         _invlist_union(cp_list, scratch_list, &cp_list);
16359                         SvREFCNT_dec_NN(scratch_list);
16360                     }
16361                     continue;   /* Go get next character */
16362                 }
16363             }
16364             else if (! SIZE_ONLY) {
16365 
16366                 /* Here, not in pass1 (in that pass we skip calculating the
16367                  * contents of this class), and is /l, or is a POSIX class for
16368                  * which /l doesn't matter (or is a Unicode property, which is
16369                  * skipped here). */
16370                 if (namedclass >= ANYOF_POSIXL_MAX) {  /* If a special class */
16371                     if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
16372 
16373                         /* Here, should be \h, \H, \v, or \V.  None of /d, /i
16374                          * nor /l make a difference in what these match,
16375                          * therefore we just add what they match to cp_list. */
16376                         if (classnum != _CC_VERTSPACE) {
16377                             assert(   namedclass == ANYOF_HORIZWS
16378                                    || namedclass == ANYOF_NHORIZWS);
16379 
16380                             /* It turns out that \h is just a synonym for
16381                              * XPosixBlank */
16382                             classnum = _CC_BLANK;
16383                         }
16384 
16385                         _invlist_union_maybe_complement_2nd(
16386                                 cp_list,
16387                                 PL_XPosix_ptrs[classnum],
16388                                 namedclass % 2 != 0,    /* Complement if odd
16389                                                           (NHORIZWS, NVERTWS)
16390                                                         */
16391                                 &cp_list);
16392                     }
16393                 }
16394                 else if (UNI_SEMANTICS
16395                         || classnum == _CC_ASCII
16396                         || (DEPENDS_SEMANTICS && (classnum == _CC_DIGIT
16397                                                   || classnum == _CC_XDIGIT)))
16398                 {
16399                     /* We usually have to worry about /d and /a affecting what
16400                      * POSIX classes match, with special code needed for /d
16401                      * because we won't know until runtime what all matches.
16402                      * But there is no extra work needed under /u, and
16403                      * [:ascii:] is unaffected by /a and /d; and :digit: and
16404                      * :xdigit: don't have runtime differences under /d.  So we
16405                      * can special case these, and avoid some extra work below,
16406                      * and at runtime. */
16407                     _invlist_union_maybe_complement_2nd(
16408                                                      simple_posixes,
16409                                                      PL_XPosix_ptrs[classnum],
16410                                                      namedclass % 2 != 0,
16411                                                      &simple_posixes);
16412                 }
16413                 else {  /* Garden variety class.  If is NUPPER, NALPHA, ...
16414                            complement and use nposixes */
16415                     SV** posixes_ptr = namedclass % 2 == 0
16416                                        ? &posixes
16417                                        : &nposixes;
16418                     _invlist_union_maybe_complement_2nd(
16419                                                      *posixes_ptr,
16420                                                      PL_XPosix_ptrs[classnum],
16421                                                      namedclass % 2 != 0,
16422                                                      posixes_ptr);
16423                 }
16424 	    }
16425 	} /* end of namedclass \blah */
16426 
16427         SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
16428 
16429         /* If 'range' is set, 'value' is the ending of a range--check its
16430          * validity.  (If value isn't a single code point in the case of a
16431          * range, we should have figured that out above in the code that
16432          * catches false ranges).  Later, we will handle each individual code
16433          * point in the range.  If 'range' isn't set, this could be the
16434          * beginning of a range, so check for that by looking ahead to see if
16435          * the next real character to be processed is the range indicator--the
16436          * minus sign */
16437 
16438 	if (range) {
16439 #ifdef EBCDIC
16440             /* For unicode ranges, we have to test that the Unicode as opposed
16441              * to the native values are not decreasing.  (Above 255, there is
16442              * no difference between native and Unicode) */
16443 	    if (unicode_range && prevvalue < 255 && value < 255) {
16444                 if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) {
16445                     goto backwards_range;
16446                 }
16447             }
16448             else
16449 #endif
16450 	    if (prevvalue > value) /* b-a */ {
16451 		int w;
16452 #ifdef EBCDIC
16453               backwards_range:
16454 #endif
16455                 w = RExC_parse - rangebegin;
16456                 vFAIL2utf8f(
16457                     "Invalid [] range \"%"UTF8f"\"",
16458                     UTF8fARG(UTF, w, rangebegin));
16459                 NOT_REACHED; /* NOTREACHED */
16460 	    }
16461 	}
16462 	else {
16463             prevvalue = value; /* save the beginning of the potential range */
16464             if (! stop_at_1     /* Can't be a range if parsing just one thing */
16465                 && *RExC_parse == '-')
16466             {
16467                 char* next_char_ptr = RExC_parse + 1;
16468 
16469                 /* Get the next real char after the '-' */
16470                 SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr);
16471 
16472                 /* If the '-' is at the end of the class (just before the ']',
16473                  * it is a literal minus; otherwise it is a range */
16474                 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
16475                     RExC_parse = next_char_ptr;
16476 
16477                     /* a bad range like \w-, [:word:]- ? */
16478                     if (namedclass > OOB_NAMEDCLASS) {
16479                         if (strict || (PASS2 && ckWARN(WARN_REGEXP))) {
16480                             const int w = RExC_parse >= rangebegin
16481                                           ?  RExC_parse - rangebegin
16482                                           : 0;
16483                             if (strict) {
16484                                 vFAIL4("False [] range \"%*.*s\"",
16485                                     w, w, rangebegin);
16486                             }
16487                             else if (PASS2) {
16488                                 vWARN4(RExC_parse,
16489                                     "False [] range \"%*.*s\"",
16490                                     w, w, rangebegin);
16491                             }
16492                         }
16493                         if (!SIZE_ONLY) {
16494                             cp_list = add_cp_to_invlist(cp_list, '-');
16495                         }
16496                         element_count++;
16497                     } else
16498                         range = 1;	/* yeah, it's a range! */
16499                     continue;	/* but do it the next time */
16500                 }
16501 	    }
16502 	}
16503 
16504         if (namedclass > OOB_NAMEDCLASS) {
16505             continue;
16506         }
16507 
16508         /* Here, we have a single value this time through the loop, and
16509          * <prevvalue> is the beginning of the range, if any; or <value> if
16510          * not. */
16511 
16512 	/* non-Latin1 code point implies unicode semantics.  Must be set in
16513 	 * pass1 so is there for the whole of pass 2 */
16514 	if (value > 255) {
16515             REQUIRE_UNI_RULES(flagp, NULL);
16516 	}
16517 
16518         /* Ready to process either the single value, or the completed range.
16519          * For single-valued non-inverted ranges, we consider the possibility
16520          * of multi-char folds.  (We made a conscious decision to not do this
16521          * for the other cases because it can often lead to non-intuitive
16522          * results.  For example, you have the peculiar case that:
16523          *  "s s" =~ /^[^\xDF]+$/i => Y
16524          *  "ss"  =~ /^[^\xDF]+$/i => N
16525          *
16526          * See [perl #89750] */
16527         if (FOLD && allow_multi_folds && value == prevvalue) {
16528             if (value == LATIN_SMALL_LETTER_SHARP_S
16529                 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
16530                                                         value)))
16531             {
16532                 /* Here <value> is indeed a multi-char fold.  Get what it is */
16533 
16534                 U8 foldbuf[UTF8_MAXBYTES_CASE];
16535                 STRLEN foldlen;
16536 
16537                 UV folded = _to_uni_fold_flags(
16538                                 value,
16539                                 foldbuf,
16540                                 &foldlen,
16541                                 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
16542                                                    ? FOLD_FLAGS_NOMIX_ASCII
16543                                                    : 0)
16544                                 );
16545 
16546                 /* Here, <folded> should be the first character of the
16547                  * multi-char fold of <value>, with <foldbuf> containing the
16548                  * whole thing.  But, if this fold is not allowed (because of
16549                  * the flags), <fold> will be the same as <value>, and should
16550                  * be processed like any other character, so skip the special
16551                  * handling */
16552                 if (folded != value) {
16553 
16554                     /* Skip if we are recursed, currently parsing the class
16555                      * again.  Otherwise add this character to the list of
16556                      * multi-char folds. */
16557                     if (! RExC_in_multi_char_class) {
16558                         STRLEN cp_count = utf8_length(foldbuf,
16559                                                       foldbuf + foldlen);
16560                         SV* multi_fold = sv_2mortal(newSVpvs(""));
16561 
16562                         Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
16563 
16564                         multi_char_matches
16565                                         = add_multi_match(multi_char_matches,
16566                                                           multi_fold,
16567                                                           cp_count);
16568 
16569                     }
16570 
16571                     /* This element should not be processed further in this
16572                      * class */
16573                     element_count--;
16574                     value = save_value;
16575                     prevvalue = save_prevvalue;
16576                     continue;
16577                 }
16578             }
16579         }
16580 
16581         if (strict && PASS2 && ckWARN(WARN_REGEXP)) {
16582             if (range) {
16583 
16584                 /* If the range starts above 255, everything is portable and
16585                  * likely to be so for any forseeable character set, so don't
16586                  * warn. */
16587                 if (unicode_range && non_portable_endpoint && prevvalue < 256) {
16588                     vWARN(RExC_parse, "Both or neither range ends should be Unicode");
16589                 }
16590                 else if (prevvalue != value) {
16591 
16592                     /* Under strict, ranges that stop and/or end in an ASCII
16593                      * printable should have each end point be a portable value
16594                      * for it (preferably like 'A', but we don't warn if it is
16595                      * a (portable) Unicode name or code point), and the range
16596                      * must be be all digits or all letters of the same case.
16597                      * Otherwise, the range is non-portable and unclear as to
16598                      * what it contains */
16599                     if ((isPRINT_A(prevvalue) || isPRINT_A(value))
16600                         && (non_portable_endpoint
16601                             || ! ((isDIGIT_A(prevvalue) && isDIGIT_A(value))
16602                                    || (isLOWER_A(prevvalue) && isLOWER_A(value))
16603                                    || (isUPPER_A(prevvalue) && isUPPER_A(value)))))
16604                     {
16605                         vWARN(RExC_parse, "Ranges of ASCII printables should be some subset of \"0-9\", \"A-Z\", or \"a-z\"");
16606                     }
16607                     else if (prevvalue >= 0x660) { /* ARABIC_INDIC_DIGIT_ZERO */
16608 
16609                         /* But the nature of Unicode and languages mean we
16610                          * can't do the same checks for above-ASCII ranges,
16611                          * except in the case of digit ones.  These should
16612                          * contain only digits from the same group of 10.  The
16613                          * ASCII case is handled just above.  0x660 is the
16614                          * first digit character beyond ASCII.  Hence here, the
16615                          * range could be a range of digits.  Find out.  */
16616                         IV index_start = _invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
16617                                                          prevvalue);
16618                         IV index_final = _invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
16619                                                          value);
16620 
16621                         /* If the range start and final points are in the same
16622                          * inversion list element, it means that either both
16623                          * are not digits, or both are digits in a consecutive
16624                          * sequence of digits.  (So far, Unicode has kept all
16625                          * such sequences as distinct groups of 10, but assert
16626                          * to make sure).  If the end points are not in the
16627                          * same element, neither should be a digit. */
16628                         if (index_start == index_final) {
16629                             assert(! ELEMENT_RANGE_MATCHES_INVLIST(index_start)
16630                             || (invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start+1]
16631                                - invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start]
16632                                == 10)
16633                                /* But actually Unicode did have one group of 11
16634                                 * 'digits' in 5.2, so in case we are operating
16635                                 * on that version, let that pass */
16636                             || (invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start+1]
16637                                - invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start]
16638                                 == 11
16639                                && invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start]
16640                                 == 0x19D0)
16641                             );
16642                         }
16643                         else if ((index_start >= 0
16644                                   && ELEMENT_RANGE_MATCHES_INVLIST(index_start))
16645                                  || (index_final >= 0
16646                                      && ELEMENT_RANGE_MATCHES_INVLIST(index_final)))
16647                         {
16648                             vWARN(RExC_parse, "Ranges of digits should be from the same group of 10");
16649                         }
16650                     }
16651                 }
16652             }
16653             if ((! range || prevvalue == value) && non_portable_endpoint) {
16654                 if (isPRINT_A(value)) {
16655                     char literal[3];
16656                     unsigned d = 0;
16657                     if (isBACKSLASHED_PUNCT(value)) {
16658                         literal[d++] = '\\';
16659                     }
16660                     literal[d++] = (char) value;
16661                     literal[d++] = '\0';
16662 
16663                     vWARN4(RExC_parse,
16664                            "\"%.*s\" is more clearly written simply as \"%s\"",
16665                            (int) (RExC_parse - rangebegin),
16666                            rangebegin,
16667                            literal
16668                         );
16669                 }
16670                 else if isMNEMONIC_CNTRL(value) {
16671                     vWARN4(RExC_parse,
16672                            "\"%.*s\" is more clearly written simply as \"%s\"",
16673                            (int) (RExC_parse - rangebegin),
16674                            rangebegin,
16675                            cntrl_to_mnemonic((U8) value)
16676                         );
16677                 }
16678             }
16679         }
16680 
16681         /* Deal with this element of the class */
16682 	if (! SIZE_ONLY) {
16683 
16684 #ifndef EBCDIC
16685             cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
16686                                                      prevvalue, value);
16687 #else
16688             /* On non-ASCII platforms, for ranges that span all of 0..255, and
16689              * ones that don't require special handling, we can just add the
16690              * range like we do for ASCII platforms */
16691             if ((UNLIKELY(prevvalue == 0) && value >= 255)
16692                 || ! (prevvalue < 256
16693                       && (unicode_range
16694                           || (! non_portable_endpoint
16695                               && ((isLOWER_A(prevvalue) && isLOWER_A(value))
16696                                   || (isUPPER_A(prevvalue)
16697                                       && isUPPER_A(value)))))))
16698             {
16699                 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
16700                                                          prevvalue, value);
16701             }
16702             else {
16703                 /* Here, requires special handling.  This can be because it is
16704                  * a range whose code points are considered to be Unicode, and
16705                  * so must be individually translated into native, or because
16706                  * its a subrange of 'A-Z' or 'a-z' which each aren't
16707                  * contiguous in EBCDIC, but we have defined them to include
16708                  * only the "expected" upper or lower case ASCII alphabetics.
16709                  * Subranges above 255 are the same in native and Unicode, so
16710                  * can be added as a range */
16711                 U8 start = NATIVE_TO_LATIN1(prevvalue);
16712                 unsigned j;
16713                 U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255;
16714                 for (j = start; j <= end; j++) {
16715                     cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j));
16716                 }
16717                 if (value > 255) {
16718                     cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
16719                                                              256, value);
16720                 }
16721             }
16722 #endif
16723         }
16724 
16725 	range = 0; /* this range (if it was one) is done now */
16726     } /* End of loop through all the text within the brackets */
16727 
16728 
16729     if (   posix_warnings && av_tindex_nomg(posix_warnings) >= 0) {
16730         output_or_return_posix_warnings(pRExC_state, posix_warnings,
16731                                         return_posix_warnings);
16732     }
16733 
16734     /* If anything in the class expands to more than one character, we have to
16735      * deal with them by building up a substitute parse string, and recursively
16736      * calling reg() on it, instead of proceeding */
16737     if (multi_char_matches) {
16738 	SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
16739         I32 cp_count;
16740 	STRLEN len;
16741 	char *save_end = RExC_end;
16742 	char *save_parse = RExC_parse;
16743 	char *save_start = RExC_start;
16744         STRLEN prefix_end = 0;      /* We copy the character class after a
16745                                        prefix supplied here.  This is the size
16746                                        + 1 of that prefix */
16747         bool first_time = TRUE;     /* First multi-char occurrence doesn't get
16748                                        a "|" */
16749         I32 reg_flags;
16750 
16751         assert(! invert);
16752         assert(RExC_precomp_adj == 0); /* Only one level of recursion allowed */
16753 
16754 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
16755            because too confusing */
16756         if (invert) {
16757             sv_catpv(substitute_parse, "(?:");
16758         }
16759 #endif
16760 
16761         /* Look at the longest folds first */
16762         for (cp_count = av_tindex_nomg(multi_char_matches);
16763                         cp_count > 0;
16764                         cp_count--)
16765         {
16766 
16767             if (av_exists(multi_char_matches, cp_count)) {
16768                 AV** this_array_ptr;
16769                 SV* this_sequence;
16770 
16771                 this_array_ptr = (AV**) av_fetch(multi_char_matches,
16772                                                  cp_count, FALSE);
16773                 while ((this_sequence = av_pop(*this_array_ptr)) !=
16774                                                                 &PL_sv_undef)
16775                 {
16776                     if (! first_time) {
16777                         sv_catpv(substitute_parse, "|");
16778                     }
16779                     first_time = FALSE;
16780 
16781                     sv_catpv(substitute_parse, SvPVX(this_sequence));
16782                 }
16783             }
16784         }
16785 
16786         /* If the character class contains anything else besides these
16787          * multi-character folds, have to include it in recursive parsing */
16788         if (element_count) {
16789             sv_catpv(substitute_parse, "|[");
16790             prefix_end = SvCUR(substitute_parse);
16791             sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
16792 
16793             /* Put in a closing ']' only if not going off the end, as otherwise
16794              * we are adding something that really isn't there */
16795             if (RExC_parse < RExC_end) {
16796                 sv_catpv(substitute_parse, "]");
16797             }
16798         }
16799 
16800         sv_catpv(substitute_parse, ")");
16801 #if 0
16802         if (invert) {
16803             /* This is a way to get the parse to skip forward a whole named
16804              * sequence instead of matching the 2nd character when it fails the
16805              * first */
16806             sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
16807         }
16808 #endif
16809 
16810         /* Set up the data structure so that any errors will be properly
16811          * reported.  See the comments at the definition of
16812          * REPORT_LOCATION_ARGS for details */
16813         RExC_precomp_adj = orig_parse - RExC_precomp;
16814 	RExC_start =  RExC_parse = SvPV(substitute_parse, len);
16815         RExC_adjusted_start = RExC_start + prefix_end;
16816 	RExC_end = RExC_parse + len;
16817         RExC_in_multi_char_class = 1;
16818 	RExC_override_recoding = 1;
16819         RExC_emit = (regnode *)orig_emit;
16820 
16821 	ret = reg(pRExC_state, 1, &reg_flags, depth+1);
16822 
16823 	*flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_PASS1|NEED_UTF8);
16824 
16825         /* And restore so can parse the rest of the pattern */
16826         RExC_parse = save_parse;
16827 	RExC_start = RExC_adjusted_start = save_start;
16828         RExC_precomp_adj = 0;
16829 	RExC_end = save_end;
16830 	RExC_in_multi_char_class = 0;
16831 	RExC_override_recoding = 0;
16832         SvREFCNT_dec_NN(multi_char_matches);
16833         return ret;
16834     }
16835 
16836     /* Here, we've gone through the entire class and dealt with multi-char
16837      * folds.  We are now in a position that we can do some checks to see if we
16838      * can optimize this ANYOF node into a simpler one, even in Pass 1.
16839      * Currently we only do two checks:
16840      * 1) is in the unlikely event that the user has specified both, eg. \w and
16841      *    \W under /l, then the class matches everything.  (This optimization
16842      *    is done only to make the optimizer code run later work.)
16843      * 2) if the character class contains only a single element (including a
16844      *    single range), we see if there is an equivalent node for it.
16845      * Other checks are possible */
16846     if (   optimizable
16847         && ! ret_invlist   /* Can't optimize if returning the constructed
16848                               inversion list */
16849         && (UNLIKELY(posixl_matches_all) || element_count == 1))
16850     {
16851         U8 op = END;
16852         U8 arg = 0;
16853 
16854         if (UNLIKELY(posixl_matches_all)) {
16855             op = SANY;
16856         }
16857         else if (namedclass > OOB_NAMEDCLASS) { /* this is a single named
16858                                                    class, like \w or [:digit:]
16859                                                    or \p{foo} */
16860 
16861             /* All named classes are mapped into POSIXish nodes, with its FLAG
16862              * argument giving which class it is */
16863             switch ((I32)namedclass) {
16864                 case ANYOF_UNIPROP:
16865                     break;
16866 
16867                 /* These don't depend on the charset modifiers.  They always
16868                  * match under /u rules */
16869                 case ANYOF_NHORIZWS:
16870                 case ANYOF_HORIZWS:
16871                     namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
16872                     /* FALLTHROUGH */
16873 
16874                 case ANYOF_NVERTWS:
16875                 case ANYOF_VERTWS:
16876                     op = POSIXU;
16877                     goto join_posix;
16878 
16879                 /* The actual POSIXish node for all the rest depends on the
16880                  * charset modifier.  The ones in the first set depend only on
16881                  * ASCII or, if available on this platform, also locale */
16882                 case ANYOF_ASCII:
16883                 case ANYOF_NASCII:
16884 #ifdef HAS_ISASCII
16885                     op = (LOC) ? POSIXL : POSIXA;
16886 #else
16887                     op = POSIXA;
16888 #endif
16889                     goto join_posix;
16890 
16891                 /* The following don't have any matches in the upper Latin1
16892                  * range, hence /d is equivalent to /u for them.  Making it /u
16893                  * saves some branches at runtime */
16894                 case ANYOF_DIGIT:
16895                 case ANYOF_NDIGIT:
16896                 case ANYOF_XDIGIT:
16897                 case ANYOF_NXDIGIT:
16898                     if (! DEPENDS_SEMANTICS) {
16899                         goto treat_as_default;
16900                     }
16901 
16902                     op = POSIXU;
16903                     goto join_posix;
16904 
16905                 /* The following change to CASED under /i */
16906                 case ANYOF_LOWER:
16907                 case ANYOF_NLOWER:
16908                 case ANYOF_UPPER:
16909                 case ANYOF_NUPPER:
16910                     if (FOLD) {
16911                         namedclass = ANYOF_CASED + (namedclass % 2);
16912                     }
16913                     /* FALLTHROUGH */
16914 
16915                 /* The rest have more possibilities depending on the charset.
16916                  * We take advantage of the enum ordering of the charset
16917                  * modifiers to get the exact node type, */
16918                 default:
16919                   treat_as_default:
16920                     op = POSIXD + get_regex_charset(RExC_flags);
16921                     if (op > POSIXA) { /* /aa is same as /a */
16922                         op = POSIXA;
16923                     }
16924 
16925                   join_posix:
16926                     /* The odd numbered ones are the complements of the
16927                      * next-lower even number one */
16928                     if (namedclass % 2 == 1) {
16929                         invert = ! invert;
16930                         namedclass--;
16931                     }
16932                     arg = namedclass_to_classnum(namedclass);
16933                     break;
16934             }
16935         }
16936         else if (value == prevvalue) {
16937 
16938             /* Here, the class consists of just a single code point */
16939 
16940             if (invert) {
16941                 if (! LOC && value == '\n') {
16942                     op = REG_ANY; /* Optimize [^\n] */
16943                     *flagp |= HASWIDTH|SIMPLE;
16944                     MARK_NAUGHTY(1);
16945                 }
16946             }
16947             else if (value < 256 || UTF) {
16948 
16949                 /* Optimize a single value into an EXACTish node, but not if it
16950                  * would require converting the pattern to UTF-8. */
16951                 op = compute_EXACTish(pRExC_state);
16952             }
16953         } /* Otherwise is a range */
16954         else if (! LOC) {   /* locale could vary these */
16955             if (prevvalue == '0') {
16956                 if (value == '9') {
16957                     arg = _CC_DIGIT;
16958                     op = POSIXA;
16959                 }
16960             }
16961             else if (! FOLD || ASCII_FOLD_RESTRICTED) {
16962                 /* We can optimize A-Z or a-z, but not if they could match
16963                  * something like the KELVIN SIGN under /i. */
16964                 if (prevvalue == 'A') {
16965                     if (value == 'Z'
16966 #ifdef EBCDIC
16967                         && ! non_portable_endpoint
16968 #endif
16969                     ) {
16970                         arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
16971                         op = POSIXA;
16972                     }
16973                 }
16974                 else if (prevvalue == 'a') {
16975                     if (value == 'z'
16976 #ifdef EBCDIC
16977                         && ! non_portable_endpoint
16978 #endif
16979                     ) {
16980                         arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
16981                         op = POSIXA;
16982                     }
16983                 }
16984             }
16985         }
16986 
16987         /* Here, we have changed <op> away from its initial value iff we found
16988          * an optimization */
16989         if (op != END) {
16990 
16991             /* Throw away this ANYOF regnode, and emit the calculated one,
16992              * which should correspond to the beginning, not current, state of
16993              * the parse */
16994             const char * cur_parse = RExC_parse;
16995             RExC_parse = (char *)orig_parse;
16996             if ( SIZE_ONLY) {
16997                 if (! LOC) {
16998 
16999                     /* To get locale nodes to not use the full ANYOF size would
17000                      * require moving the code above that writes the portions
17001                      * of it that aren't in other nodes to after this point.
17002                      * e.g.  ANYOF_POSIXL_SET */
17003                     RExC_size = orig_size;
17004                 }
17005             }
17006             else {
17007                 RExC_emit = (regnode *)orig_emit;
17008                 if (PL_regkind[op] == POSIXD) {
17009                     if (op == POSIXL) {
17010                         RExC_contains_locale = 1;
17011                     }
17012                     if (invert) {
17013                         op += NPOSIXD - POSIXD;
17014                     }
17015                 }
17016             }
17017 
17018             ret = reg_node(pRExC_state, op);
17019 
17020             if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
17021                 if (! SIZE_ONLY) {
17022                     FLAGS(ret) = arg;
17023                 }
17024                 *flagp |= HASWIDTH|SIMPLE;
17025             }
17026             else if (PL_regkind[op] == EXACT) {
17027                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
17028                                            TRUE /* downgradable to EXACT */
17029                                            );
17030             }
17031 
17032             RExC_parse = (char *) cur_parse;
17033 
17034             SvREFCNT_dec(posixes);
17035             SvREFCNT_dec(nposixes);
17036             SvREFCNT_dec(simple_posixes);
17037             SvREFCNT_dec(cp_list);
17038             SvREFCNT_dec(cp_foldable_list);
17039             return ret;
17040         }
17041     }
17042 
17043     if (SIZE_ONLY)
17044         return ret;
17045     /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
17046 
17047     /* If folding, we calculate all characters that could fold to or from the
17048      * ones already on the list */
17049     if (cp_foldable_list) {
17050         if (FOLD) {
17051             UV start, end;	/* End points of code point ranges */
17052 
17053             SV* fold_intersection = NULL;
17054             SV** use_list;
17055 
17056             /* Our calculated list will be for Unicode rules.  For locale
17057              * matching, we have to keep a separate list that is consulted at
17058              * runtime only when the locale indicates Unicode rules.  For
17059              * non-locale, we just use the general list */
17060             if (LOC) {
17061                 use_list = &only_utf8_locale_list;
17062             }
17063             else {
17064                 use_list = &cp_list;
17065             }
17066 
17067             /* Only the characters in this class that participate in folds need
17068              * be checked.  Get the intersection of this class and all the
17069              * possible characters that are foldable.  This can quickly narrow
17070              * down a large class */
17071             _invlist_intersection(PL_utf8_foldable, cp_foldable_list,
17072                                   &fold_intersection);
17073 
17074             /* The folds for all the Latin1 characters are hard-coded into this
17075              * program, but we have to go out to disk to get the others. */
17076             if (invlist_highest(cp_foldable_list) >= 256) {
17077 
17078                 /* This is a hash that for a particular fold gives all
17079                  * characters that are involved in it */
17080                 if (! PL_utf8_foldclosures) {
17081                     _load_PL_utf8_foldclosures();
17082                 }
17083             }
17084 
17085             /* Now look at the foldable characters in this class individually */
17086             invlist_iterinit(fold_intersection);
17087             while (invlist_iternext(fold_intersection, &start, &end)) {
17088                 UV j;
17089 
17090                 /* Look at every character in the range */
17091                 for (j = start; j <= end; j++) {
17092                     U8 foldbuf[UTF8_MAXBYTES_CASE+1];
17093                     STRLEN foldlen;
17094                     SV** listp;
17095 
17096                     if (j < 256) {
17097 
17098                         if (IS_IN_SOME_FOLD_L1(j)) {
17099 
17100                             /* ASCII is always matched; non-ASCII is matched
17101                              * only under Unicode rules (which could happen
17102                              * under /l if the locale is a UTF-8 one */
17103                             if (isASCII(j) || ! DEPENDS_SEMANTICS) {
17104                                 *use_list = add_cp_to_invlist(*use_list,
17105                                                             PL_fold_latin1[j]);
17106                             }
17107                             else {
17108                                 has_upper_latin1_only_utf8_matches
17109                                     = add_cp_to_invlist(
17110                                             has_upper_latin1_only_utf8_matches,
17111                                             PL_fold_latin1[j]);
17112                             }
17113                         }
17114 
17115                         if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
17116                             && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
17117                         {
17118                             add_above_Latin1_folds(pRExC_state,
17119                                                    (U8) j,
17120                                                    use_list);
17121                         }
17122                         continue;
17123                     }
17124 
17125                     /* Here is an above Latin1 character.  We don't have the
17126                      * rules hard-coded for it.  First, get its fold.  This is
17127                      * the simple fold, as the multi-character folds have been
17128                      * handled earlier and separated out */
17129                     _to_uni_fold_flags(j, foldbuf, &foldlen,
17130                                                         (ASCII_FOLD_RESTRICTED)
17131                                                         ? FOLD_FLAGS_NOMIX_ASCII
17132                                                         : 0);
17133 
17134                     /* Single character fold of above Latin1.  Add everything in
17135                     * its fold closure to the list that this node should match.
17136                     * The fold closures data structure is a hash with the keys
17137                     * being the UTF-8 of every character that is folded to, like
17138                     * 'k', and the values each an array of all code points that
17139                     * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
17140                     * Multi-character folds are not included */
17141                     if ((listp = hv_fetch(PL_utf8_foldclosures,
17142                                         (char *) foldbuf, foldlen, FALSE)))
17143                     {
17144                         AV* list = (AV*) *listp;
17145                         IV k;
17146                         for (k = 0; k <= av_tindex_nomg(list); k++) {
17147                             SV** c_p = av_fetch(list, k, FALSE);
17148                             UV c;
17149                             assert(c_p);
17150 
17151                             c = SvUV(*c_p);
17152 
17153                             /* /aa doesn't allow folds between ASCII and non- */
17154                             if ((ASCII_FOLD_RESTRICTED
17155                                 && (isASCII(c) != isASCII(j))))
17156                             {
17157                                 continue;
17158                             }
17159 
17160                             /* Folds under /l which cross the 255/256 boundary
17161                              * are added to a separate list.  (These are valid
17162                              * only when the locale is UTF-8.) */
17163                             if (c < 256 && LOC) {
17164                                 *use_list = add_cp_to_invlist(*use_list, c);
17165                                 continue;
17166                             }
17167 
17168                             if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
17169                             {
17170                                 cp_list = add_cp_to_invlist(cp_list, c);
17171                             }
17172                             else {
17173                                 /* Similarly folds involving non-ascii Latin1
17174                                 * characters under /d are added to their list */
17175                                 has_upper_latin1_only_utf8_matches
17176                                         = add_cp_to_invlist(
17177                                            has_upper_latin1_only_utf8_matches,
17178                                            c);
17179                             }
17180                         }
17181                     }
17182                 }
17183             }
17184             SvREFCNT_dec_NN(fold_intersection);
17185         }
17186 
17187         /* Now that we have finished adding all the folds, there is no reason
17188          * to keep the foldable list separate */
17189         _invlist_union(cp_list, cp_foldable_list, &cp_list);
17190 	SvREFCNT_dec_NN(cp_foldable_list);
17191     }
17192 
17193     /* And combine the result (if any) with any inversion list from posix
17194      * classes.  The lists are kept separate up to now because we don't want to
17195      * fold the classes (folding of those is automatically handled by the swash
17196      * fetching code) */
17197     if (simple_posixes) {
17198         _invlist_union(cp_list, simple_posixes, &cp_list);
17199         SvREFCNT_dec_NN(simple_posixes);
17200     }
17201     if (posixes || nposixes) {
17202         if (posixes && AT_LEAST_ASCII_RESTRICTED) {
17203             /* Under /a and /aa, nothing above ASCII matches these */
17204             _invlist_intersection(posixes,
17205                                   PL_XPosix_ptrs[_CC_ASCII],
17206                                   &posixes);
17207         }
17208         if (nposixes) {
17209             if (DEPENDS_SEMANTICS) {
17210                 /* Under /d, everything in the upper half of the Latin1 range
17211                  * matches these complements */
17212                 ANYOF_FLAGS(ret) |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
17213             }
17214             else if (AT_LEAST_ASCII_RESTRICTED) {
17215                 /* Under /a and /aa, everything above ASCII matches these
17216                  * complements */
17217                 _invlist_union_complement_2nd(nposixes,
17218                                               PL_XPosix_ptrs[_CC_ASCII],
17219                                               &nposixes);
17220             }
17221             if (posixes) {
17222                 _invlist_union(posixes, nposixes, &posixes);
17223                 SvREFCNT_dec_NN(nposixes);
17224             }
17225             else {
17226                 posixes = nposixes;
17227             }
17228         }
17229         if (! DEPENDS_SEMANTICS) {
17230             if (cp_list) {
17231                 _invlist_union(cp_list, posixes, &cp_list);
17232                 SvREFCNT_dec_NN(posixes);
17233             }
17234             else {
17235                 cp_list = posixes;
17236             }
17237         }
17238         else {
17239             /* Under /d, we put into a separate list the Latin1 things that
17240              * match only when the target string is utf8 */
17241             SV* nonascii_but_latin1_properties = NULL;
17242             _invlist_intersection(posixes, PL_UpperLatin1,
17243                                   &nonascii_but_latin1_properties);
17244             _invlist_subtract(posixes, nonascii_but_latin1_properties,
17245                               &posixes);
17246             if (cp_list) {
17247                 _invlist_union(cp_list, posixes, &cp_list);
17248                 SvREFCNT_dec_NN(posixes);
17249             }
17250             else {
17251                 cp_list = posixes;
17252             }
17253 
17254             if (has_upper_latin1_only_utf8_matches) {
17255                 _invlist_union(has_upper_latin1_only_utf8_matches,
17256                                nonascii_but_latin1_properties,
17257                                &has_upper_latin1_only_utf8_matches);
17258                 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
17259             }
17260             else {
17261                 has_upper_latin1_only_utf8_matches
17262                                             = nonascii_but_latin1_properties;
17263             }
17264         }
17265     }
17266 
17267     /* And combine the result (if any) with any inversion list from properties.
17268      * The lists are kept separate up to now so that we can distinguish the two
17269      * in regards to matching above-Unicode.  A run-time warning is generated
17270      * if a Unicode property is matched against a non-Unicode code point. But,
17271      * we allow user-defined properties to match anything, without any warning,
17272      * and we also suppress the warning if there is a portion of the character
17273      * class that isn't a Unicode property, and which matches above Unicode, \W
17274      * or [\x{110000}] for example.
17275      * (Note that in this case, unlike the Posix one above, there is no
17276      * <has_upper_latin1_only_utf8_matches>, because having a Unicode property
17277      * forces Unicode semantics */
17278     if (properties) {
17279         if (cp_list) {
17280 
17281             /* If it matters to the final outcome, see if a non-property
17282              * component of the class matches above Unicode.  If so, the
17283              * warning gets suppressed.  This is true even if just a single
17284              * such code point is specified, as, though not strictly correct if
17285              * another such code point is matched against, the fact that they
17286              * are using above-Unicode code points indicates they should know
17287              * the issues involved */
17288             if (warn_super) {
17289                 warn_super = ! (invert
17290                                ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
17291             }
17292 
17293             _invlist_union(properties, cp_list, &cp_list);
17294             SvREFCNT_dec_NN(properties);
17295         }
17296         else {
17297             cp_list = properties;
17298         }
17299 
17300         if (warn_super) {
17301             ANYOF_FLAGS(ret)
17302              |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
17303 
17304             /* Because an ANYOF node is the only one that warns, this node
17305              * can't be optimized into something else */
17306             optimizable = FALSE;
17307         }
17308     }
17309 
17310     /* Here, we have calculated what code points should be in the character
17311      * class.
17312      *
17313      * Now we can see about various optimizations.  Fold calculation (which we
17314      * did above) needs to take place before inversion.  Otherwise /[^k]/i
17315      * would invert to include K, which under /i would match k, which it
17316      * shouldn't.  Therefore we can't invert folded locale now, as it won't be
17317      * folded until runtime */
17318 
17319     /* If we didn't do folding, it's because some information isn't available
17320      * until runtime; set the run-time fold flag for these.  (We don't have to
17321      * worry about properties folding, as that is taken care of by the swash
17322      * fetching).  We know to set the flag if we have a non-NULL list for UTF-8
17323      * locales, or the class matches at least one 0-255 range code point */
17324     if (LOC && FOLD) {
17325 
17326         /* Some things on the list might be unconditionally included because of
17327          * other components.  Remove them, and clean up the list if it goes to
17328          * 0 elements */
17329         if (only_utf8_locale_list && cp_list) {
17330             _invlist_subtract(only_utf8_locale_list, cp_list,
17331                               &only_utf8_locale_list);
17332 
17333             if (_invlist_len(only_utf8_locale_list) == 0) {
17334                 SvREFCNT_dec_NN(only_utf8_locale_list);
17335                 only_utf8_locale_list = NULL;
17336             }
17337         }
17338         if (only_utf8_locale_list) {
17339             ANYOF_FLAGS(ret)
17340                  |=  ANYOFL_FOLD
17341                     |ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
17342         }
17343         else if (cp_list) { /* Look to see if a 0-255 code point is in list */
17344             UV start, end;
17345             invlist_iterinit(cp_list);
17346             if (invlist_iternext(cp_list, &start, &end) && start < 256) {
17347                 ANYOF_FLAGS(ret) |= ANYOFL_FOLD;
17348             }
17349             invlist_iterfinish(cp_list);
17350         }
17351     }
17352 
17353 #define MATCHES_ALL_NON_UTF8_NON_ASCII(ret)                                 \
17354     (   DEPENDS_SEMANTICS                                                   \
17355      && (ANYOF_FLAGS(ret)                                                   \
17356         & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
17357 
17358     /* See if we can simplify things under /d */
17359     if (   has_upper_latin1_only_utf8_matches
17360         || MATCHES_ALL_NON_UTF8_NON_ASCII(ret))
17361     {
17362         /* But not if we are inverting, as that screws it up */
17363         if (! invert) {
17364             if (has_upper_latin1_only_utf8_matches) {
17365                 if (MATCHES_ALL_NON_UTF8_NON_ASCII(ret)) {
17366 
17367                     /* Here, we have both the flag and inversion list.  Any
17368                      * character in 'has_upper_latin1_only_utf8_matches'
17369                      * matches when UTF-8 is in effect, but it also matches
17370                      * when UTF-8 is not in effect because of
17371                      * MATCHES_ALL_NON_UTF8_NON_ASCII.  Therefore it matches
17372                      * unconditionally, so can be added to the regular list,
17373                      * and 'has_upper_latin1_only_utf8_matches' cleared */
17374                     _invlist_union(cp_list,
17375                                    has_upper_latin1_only_utf8_matches,
17376                                    &cp_list);
17377                     SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
17378                     has_upper_latin1_only_utf8_matches = NULL;
17379                 }
17380                 else if (cp_list) {
17381 
17382                     /* Here, 'cp_list' gives chars that always match, and
17383                      * 'has_upper_latin1_only_utf8_matches' gives chars that
17384                      * were specified to match only if the target string is in
17385                      * UTF-8.  It may be that these overlap, so we can subtract
17386                      * the unconditionally matching from the conditional ones,
17387                      * to make the conditional list as small as possible,
17388                      * perhaps even clearing it, in which case more
17389                      * optimizations are possible later */
17390                     _invlist_subtract(has_upper_latin1_only_utf8_matches,
17391                                       cp_list,
17392                                       &has_upper_latin1_only_utf8_matches);
17393                     if (_invlist_len(has_upper_latin1_only_utf8_matches) == 0) {
17394                         SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
17395                         has_upper_latin1_only_utf8_matches = NULL;
17396                     }
17397                 }
17398             }
17399 
17400             /* Similarly, if the unconditional matches include every upper
17401              * latin1 character, we can clear that flag to permit later
17402              * optimizations */
17403             if (cp_list && MATCHES_ALL_NON_UTF8_NON_ASCII(ret)) {
17404                 SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1);
17405                 _invlist_subtract(only_non_utf8_list, cp_list,
17406                                   &only_non_utf8_list);
17407                 if (_invlist_len(only_non_utf8_list) == 0) {
17408                     ANYOF_FLAGS(ret) &= ~ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
17409                 }
17410                 SvREFCNT_dec_NN(only_non_utf8_list);
17411                 only_non_utf8_list = NULL;;
17412             }
17413         }
17414 
17415         /* If we haven't gotten rid of all conditional matching, we change the
17416          * regnode type to indicate that */
17417         if (   has_upper_latin1_only_utf8_matches
17418             || MATCHES_ALL_NON_UTF8_NON_ASCII(ret))
17419         {
17420             OP(ret) = ANYOFD;
17421             optimizable = FALSE;
17422         }
17423     }
17424 #undef MATCHES_ALL_NON_UTF8_NON_ASCII
17425 
17426     /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
17427      * at compile time.  Besides not inverting folded locale now, we can't
17428      * invert if there are things such as \w, which aren't known until runtime
17429      * */
17430     if (cp_list
17431         && invert
17432         && OP(ret) != ANYOFD
17433         && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
17434 	&& ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
17435     {
17436         _invlist_invert(cp_list);
17437 
17438         /* Any swash can't be used as-is, because we've inverted things */
17439         if (swash) {
17440             SvREFCNT_dec_NN(swash);
17441             swash = NULL;
17442         }
17443 
17444 	/* Clear the invert flag since have just done it here */
17445 	invert = FALSE;
17446     }
17447 
17448     if (ret_invlist) {
17449         assert(cp_list);
17450 
17451         *ret_invlist = cp_list;
17452         SvREFCNT_dec(swash);
17453 
17454         /* Discard the generated node */
17455         if (SIZE_ONLY) {
17456             RExC_size = orig_size;
17457         }
17458         else {
17459             RExC_emit = orig_emit;
17460         }
17461         return orig_emit;
17462     }
17463 
17464     /* Some character classes are equivalent to other nodes.  Such nodes take
17465      * up less room and generally fewer operations to execute than ANYOF nodes.
17466      * Above, we checked for and optimized into some such equivalents for
17467      * certain common classes that are easy to test.  Getting to this point in
17468      * the code means that the class didn't get optimized there.  Since this
17469      * code is only executed in Pass 2, it is too late to save space--it has
17470      * been allocated in Pass 1, and currently isn't given back.  But turning
17471      * things into an EXACTish node can allow the optimizer to join it to any
17472      * adjacent such nodes.  And if the class is equivalent to things like /./,
17473      * expensive run-time swashes can be avoided.  Now that we have more
17474      * complete information, we can find things necessarily missed by the
17475      * earlier code.  Another possible "optimization" that isn't done is that
17476      * something like [Ee] could be changed into an EXACTFU.  khw tried this
17477      * and found that the ANYOF is faster, including for code points not in the
17478      * bitmap.  This still might make sense to do, provided it got joined with
17479      * an adjacent node(s) to create a longer EXACTFU one.  This could be
17480      * accomplished by creating a pseudo ANYOF_EXACTFU node type that the join
17481      * routine would know is joinable.  If that didn't happen, the node type
17482      * could then be made a straight ANYOF */
17483 
17484     if (optimizable && cp_list && ! invert) {
17485         UV start, end;
17486         U8 op = END;  /* The optimzation node-type */
17487         int posix_class = -1;   /* Illegal value */
17488         const char * cur_parse= RExC_parse;
17489 
17490         invlist_iterinit(cp_list);
17491         if (! invlist_iternext(cp_list, &start, &end)) {
17492 
17493             /* Here, the list is empty.  This happens, for example, when a
17494              * Unicode property that doesn't match anything is the only element
17495              * in the character class (perluniprops.pod notes such properties).
17496              * */
17497             op = OPFAIL;
17498             *flagp |= HASWIDTH|SIMPLE;
17499         }
17500         else if (start == end) {    /* The range is a single code point */
17501             if (! invlist_iternext(cp_list, &start, &end)
17502 
17503                     /* Don't do this optimization if it would require changing
17504                      * the pattern to UTF-8 */
17505                 && (start < 256 || UTF))
17506             {
17507                 /* Here, the list contains a single code point.  Can optimize
17508                  * into an EXACTish node */
17509 
17510                 value = start;
17511 
17512                 if (! FOLD) {
17513                     op = (LOC)
17514                          ? EXACTL
17515                          : EXACT;
17516                 }
17517                 else if (LOC) {
17518 
17519                     /* A locale node under folding with one code point can be
17520                      * an EXACTFL, as its fold won't be calculated until
17521                      * runtime */
17522                     op = EXACTFL;
17523                 }
17524                 else {
17525 
17526                     /* Here, we are generally folding, but there is only one
17527                      * code point to match.  If we have to, we use an EXACT
17528                      * node, but it would be better for joining with adjacent
17529                      * nodes in the optimization pass if we used the same
17530                      * EXACTFish node that any such are likely to be.  We can
17531                      * do this iff the code point doesn't participate in any
17532                      * folds.  For example, an EXACTF of a colon is the same as
17533                      * an EXACT one, since nothing folds to or from a colon. */
17534                     if (value < 256) {
17535                         if (IS_IN_SOME_FOLD_L1(value)) {
17536                             op = EXACT;
17537                         }
17538                     }
17539                     else {
17540                         if (_invlist_contains_cp(PL_utf8_foldable, value)) {
17541                             op = EXACT;
17542                         }
17543                     }
17544 
17545                     /* If we haven't found the node type, above, it means we
17546                      * can use the prevailing one */
17547                     if (op == END) {
17548                         op = compute_EXACTish(pRExC_state);
17549                     }
17550                 }
17551             }
17552         }   /* End of first range contains just a single code point */
17553         else if (start == 0) {
17554             if (end == UV_MAX) {
17555                 op = SANY;
17556                 *flagp |= HASWIDTH|SIMPLE;
17557                 MARK_NAUGHTY(1);
17558             }
17559             else if (end == '\n' - 1
17560                     && invlist_iternext(cp_list, &start, &end)
17561                     && start == '\n' + 1 && end == UV_MAX)
17562             {
17563                 op = REG_ANY;
17564                 *flagp |= HASWIDTH|SIMPLE;
17565                 MARK_NAUGHTY(1);
17566             }
17567         }
17568         invlist_iterfinish(cp_list);
17569 
17570         if (op == END) {
17571             const UV cp_list_len = _invlist_len(cp_list);
17572             const UV* cp_list_array = invlist_array(cp_list);
17573 
17574             /* Here, didn't find an optimization.  See if this matches any of
17575              * the POSIX classes.  These run slightly faster for above-Unicode
17576              * code points, so don't bother with POSIXA ones nor the 2 that
17577              * have no above-Unicode matches.  We can avoid these checks unless
17578              * the ANYOF matches at least as high as the lowest POSIX one
17579              * (which was manually found to be \v.  The actual code point may
17580              * increase in later Unicode releases, if a higher code point is
17581              * assigned to be \v, but this code will never break.  It would
17582              * just mean we could execute the checks for posix optimizations
17583              * unnecessarily) */
17584 
17585             if (cp_list_array[cp_list_len-1] > 0x2029) {
17586                 for (posix_class = 0;
17587                      posix_class <= _HIGHEST_REGCOMP_DOT_H_SYNC;
17588                      posix_class++)
17589                 {
17590                     int try_inverted;
17591                     if (posix_class == _CC_ASCII || posix_class == _CC_CNTRL) {
17592                         continue;
17593                     }
17594                     for (try_inverted = 0; try_inverted < 2; try_inverted++) {
17595 
17596                         /* Check if matches normal or inverted */
17597                         if (_invlistEQ(cp_list,
17598                                        PL_XPosix_ptrs[posix_class],
17599                                        try_inverted))
17600                         {
17601                             op = (try_inverted)
17602                                  ? NPOSIXU
17603                                  : POSIXU;
17604                             *flagp |= HASWIDTH|SIMPLE;
17605                             goto found_posix;
17606                         }
17607                     }
17608                 }
17609               found_posix: ;
17610             }
17611         }
17612 
17613         if (op != END) {
17614             RExC_parse = (char *)orig_parse;
17615             RExC_emit = (regnode *)orig_emit;
17616 
17617             if (regarglen[op]) {
17618                 ret = reganode(pRExC_state, op, 0);
17619             } else {
17620                 ret = reg_node(pRExC_state, op);
17621             }
17622 
17623             RExC_parse = (char *)cur_parse;
17624 
17625             if (PL_regkind[op] == EXACT) {
17626                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
17627                                            TRUE /* downgradable to EXACT */
17628                                           );
17629             }
17630             else if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
17631                 FLAGS(ret) = posix_class;
17632             }
17633 
17634             SvREFCNT_dec_NN(cp_list);
17635             return ret;
17636         }
17637     }
17638 
17639     /* Here, <cp_list> contains all the code points we can determine at
17640      * compile time that match under all conditions.  Go through it, and
17641      * for things that belong in the bitmap, put them there, and delete from
17642      * <cp_list>.  While we are at it, see if everything above 255 is in the
17643      * list, and if so, set a flag to speed up execution */
17644 
17645     populate_ANYOF_from_invlist(ret, &cp_list);
17646 
17647     if (invert) {
17648         ANYOF_FLAGS(ret) |= ANYOF_INVERT;
17649     }
17650 
17651     /* Here, the bitmap has been populated with all the Latin1 code points that
17652      * always match.  Can now add to the overall list those that match only
17653      * when the target string is UTF-8 (<has_upper_latin1_only_utf8_matches>).
17654      * */
17655     if (has_upper_latin1_only_utf8_matches) {
17656 	if (cp_list) {
17657 	    _invlist_union(cp_list,
17658                            has_upper_latin1_only_utf8_matches,
17659                            &cp_list);
17660 	    SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
17661 	}
17662 	else {
17663 	    cp_list = has_upper_latin1_only_utf8_matches;
17664 	}
17665         ANYOF_FLAGS(ret) |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
17666     }
17667 
17668     /* If there is a swash and more than one element, we can't use the swash in
17669      * the optimization below. */
17670     if (swash && element_count > 1) {
17671 	SvREFCNT_dec_NN(swash);
17672 	swash = NULL;
17673     }
17674 
17675     /* Note that the optimization of using 'swash' if it is the only thing in
17676      * the class doesn't have us change swash at all, so it can include things
17677      * that are also in the bitmap; otherwise we have purposely deleted that
17678      * duplicate information */
17679     set_ANYOF_arg(pRExC_state, ret, cp_list,
17680                   (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
17681                    ? listsv : NULL,
17682                   only_utf8_locale_list,
17683                   swash, has_user_defined_property);
17684 
17685     *flagp |= HASWIDTH|SIMPLE;
17686 
17687     if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) {
17688         RExC_contains_locale = 1;
17689     }
17690 
17691     return ret;
17692 }
17693 
17694 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
17695 
17696 STATIC void
17697 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
17698                 regnode* const node,
17699                 SV* const cp_list,
17700                 SV* const runtime_defns,
17701                 SV* const only_utf8_locale_list,
17702                 SV* const swash,
17703                 const bool has_user_defined_property)
17704 {
17705     /* Sets the arg field of an ANYOF-type node 'node', using information about
17706      * the node passed-in.  If there is nothing outside the node's bitmap, the
17707      * arg is set to ANYOF_ONLY_HAS_BITMAP.  Otherwise, it sets the argument to
17708      * the count returned by add_data(), having allocated and stored an array,
17709      * av, that that count references, as follows:
17710      *  av[0] stores the character class description in its textual form.
17711      *        This is used later (regexec.c:Perl_regclass_swash()) to
17712      *        initialize the appropriate swash, and is also useful for dumping
17713      *        the regnode.  This is set to &PL_sv_undef if the textual
17714      *        description is not needed at run-time (as happens if the other
17715      *        elements completely define the class)
17716      *  av[1] if &PL_sv_undef, is a placeholder to later contain the swash
17717      *        computed from av[0].  But if no further computation need be done,
17718      *        the swash is stored here now (and av[0] is &PL_sv_undef).
17719      *  av[2] stores the inversion list of code points that match only if the
17720      *        current locale is UTF-8
17721      *  av[3] stores the cp_list inversion list for use in addition or instead
17722      *        of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
17723      *        (Otherwise everything needed is already in av[0] and av[1])
17724      *  av[4] is set if any component of the class is from a user-defined
17725      *        property; used only if av[3] exists */
17726 
17727     UV n;
17728 
17729     PERL_ARGS_ASSERT_SET_ANYOF_ARG;
17730 
17731     if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
17732         assert(! (ANYOF_FLAGS(node)
17733                 & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP));
17734 	ARG_SET(node, ANYOF_ONLY_HAS_BITMAP);
17735     }
17736     else {
17737 	AV * const av = newAV();
17738 	SV *rv;
17739 
17740 	av_store(av, 0, (runtime_defns)
17741 			? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
17742 	if (swash) {
17743 	    assert(cp_list);
17744 	    av_store(av, 1, swash);
17745 	    SvREFCNT_dec_NN(cp_list);
17746 	}
17747 	else {
17748 	    av_store(av, 1, &PL_sv_undef);
17749 	    if (cp_list) {
17750 		av_store(av, 3, cp_list);
17751 		av_store(av, 4, newSVuv(has_user_defined_property));
17752 	    }
17753 	}
17754 
17755         if (only_utf8_locale_list) {
17756 	    av_store(av, 2, only_utf8_locale_list);
17757         }
17758         else {
17759 	    av_store(av, 2, &PL_sv_undef);
17760         }
17761 
17762 	rv = newRV_noinc(MUTABLE_SV(av));
17763 	n = add_data(pRExC_state, STR_WITH_LEN("s"));
17764 	RExC_rxi->data->data[n] = (void*)rv;
17765 	ARG_SET(node, n);
17766     }
17767 }
17768 
17769 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
17770 SV *
17771 Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
17772                                         const regnode* node,
17773                                         bool doinit,
17774                                         SV** listsvp,
17775                                         SV** only_utf8_locale_ptr,
17776                                         SV** output_invlist)
17777 
17778 {
17779     /* For internal core use only.
17780      * Returns the swash for the input 'node' in the regex 'prog'.
17781      * If <doinit> is 'true', will attempt to create the swash if not already
17782      *	  done.
17783      * If <listsvp> is non-null, will return the printable contents of the
17784      *    swash.  This can be used to get debugging information even before the
17785      *    swash exists, by calling this function with 'doinit' set to false, in
17786      *    which case the components that will be used to eventually create the
17787      *    swash are returned  (in a printable form).
17788      * If <only_utf8_locale_ptr> is not NULL, it is where this routine is to
17789      *    store an inversion list of code points that should match only if the
17790      *    execution-time locale is a UTF-8 one.
17791      * If <output_invlist> is not NULL, it is where this routine is to store an
17792      *    inversion list of the code points that would be instead returned in
17793      *    <listsvp> if this were NULL.  Thus, what gets output in <listsvp>
17794      *    when this parameter is used, is just the non-code point data that
17795      *    will go into creating the swash.  This currently should be just
17796      *    user-defined properties whose definitions were not known at compile
17797      *    time.  Using this parameter allows for easier manipulation of the
17798      *    swash's data by the caller.  It is illegal to call this function with
17799      *    this parameter set, but not <listsvp>
17800      *
17801      * Tied intimately to how S_set_ANYOF_arg sets up the data structure.  Note
17802      * that, in spite of this function's name, the swash it returns may include
17803      * the bitmap data as well */
17804 
17805     SV *sw  = NULL;
17806     SV *si  = NULL;         /* Input swash initialization string */
17807     SV* invlist = NULL;
17808 
17809     RXi_GET_DECL(prog,progi);
17810     const struct reg_data * const data = prog ? progi->data : NULL;
17811 
17812     PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
17813     assert(! output_invlist || listsvp);
17814 
17815     if (data && data->count) {
17816 	const U32 n = ARG(node);
17817 
17818 	if (data->what[n] == 's') {
17819 	    SV * const rv = MUTABLE_SV(data->data[n]);
17820 	    AV * const av = MUTABLE_AV(SvRV(rv));
17821 	    SV **const ary = AvARRAY(av);
17822 	    U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
17823 
17824 	    si = *ary;	/* ary[0] = the string to initialize the swash with */
17825 
17826             if (av_tindex_nomg(av) >= 2) {
17827                 if (only_utf8_locale_ptr
17828                     && ary[2]
17829                     && ary[2] != &PL_sv_undef)
17830                 {
17831                     *only_utf8_locale_ptr = ary[2];
17832                 }
17833                 else {
17834                     assert(only_utf8_locale_ptr);
17835                     *only_utf8_locale_ptr = NULL;
17836                 }
17837 
17838                 /* Elements 3 and 4 are either both present or both absent. [3]
17839                  * is any inversion list generated at compile time; [4]
17840                  * indicates if that inversion list has any user-defined
17841                  * properties in it. */
17842                 if (av_tindex_nomg(av) >= 3) {
17843                     invlist = ary[3];
17844                     if (SvUV(ary[4])) {
17845                         swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
17846                     }
17847                 }
17848                 else {
17849                     invlist = NULL;
17850                 }
17851 	    }
17852 
17853 	    /* Element [1] is reserved for the set-up swash.  If already there,
17854 	     * return it; if not, create it and store it there */
17855 	    if (ary[1] && SvROK(ary[1])) {
17856 		sw = ary[1];
17857 	    }
17858 	    else if (doinit && ((si && si != &PL_sv_undef)
17859                                  || (invlist && invlist != &PL_sv_undef))) {
17860 		assert(si);
17861 		sw = _core_swash_init("utf8", /* the utf8 package */
17862 				      "", /* nameless */
17863 				      si,
17864 				      1, /* binary */
17865 				      0, /* not from tr/// */
17866 				      invlist,
17867 				      &swash_init_flags);
17868 		(void)av_store(av, 1, sw);
17869 	    }
17870 	}
17871     }
17872 
17873     /* If requested, return a printable version of what this swash matches */
17874     if (listsvp) {
17875 	SV* matches_string = NULL;
17876 
17877         /* The swash should be used, if possible, to get the data, as it
17878          * contains the resolved data.  But this function can be called at
17879          * compile-time, before everything gets resolved, in which case we
17880          * return the currently best available information, which is the string
17881          * that will eventually be used to do that resolving, 'si' */
17882 	if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
17883             && (si && si != &PL_sv_undef))
17884         {
17885             /* Here, we only have 'si' (and possibly some passed-in data in
17886              * 'invlist', which is handled below)  If the caller only wants
17887              * 'si', use that.  */
17888             if (! output_invlist) {
17889                 matches_string = newSVsv(si);
17890             }
17891             else {
17892                 /* But if the caller wants an inversion list of the node, we
17893                  * need to parse 'si' and place as much as possible in the
17894                  * desired output inversion list, making 'matches_string' only
17895                  * contain the currently unresolvable things */
17896                 const char *si_string = SvPVX(si);
17897                 STRLEN remaining = SvCUR(si);
17898                 UV prev_cp = 0;
17899                 U8 count = 0;
17900 
17901                 /* Ignore everything before the first new-line */
17902                 while (*si_string != '\n' && remaining > 0) {
17903                     si_string++;
17904                     remaining--;
17905                 }
17906                 assert(remaining > 0);
17907 
17908                 si_string++;
17909                 remaining--;
17910 
17911                 while (remaining > 0) {
17912 
17913                     /* The data consists of just strings defining user-defined
17914                      * property names, but in prior incarnations, and perhaps
17915                      * somehow from pluggable regex engines, it could still
17916                      * hold hex code point definitions.  Each component of a
17917                      * range would be separated by a tab, and each range by a
17918                      * new-line.  If these are found, instead add them to the
17919                      * inversion list */
17920                     I32 grok_flags =  PERL_SCAN_SILENT_ILLDIGIT
17921                                      |PERL_SCAN_SILENT_NON_PORTABLE;
17922                     STRLEN len = remaining;
17923                     UV cp = grok_hex(si_string, &len, &grok_flags, NULL);
17924 
17925                     /* If the hex decode routine found something, it should go
17926                      * up to the next \n */
17927                     if (   *(si_string + len) == '\n') {
17928                         if (count) {    /* 2nd code point on line */
17929                             *output_invlist = _add_range_to_invlist(*output_invlist, prev_cp, cp);
17930                         }
17931                         else {
17932                             *output_invlist = add_cp_to_invlist(*output_invlist, cp);
17933                         }
17934                         count = 0;
17935                         goto prepare_for_next_iteration;
17936                     }
17937 
17938                     /* If the hex decode was instead for the lower range limit,
17939                      * save it, and go parse the upper range limit */
17940                     if (*(si_string + len) == '\t') {
17941                         assert(count == 0);
17942 
17943                         prev_cp = cp;
17944                         count = 1;
17945                       prepare_for_next_iteration:
17946                         si_string += len + 1;
17947                         remaining -= len + 1;
17948                         continue;
17949                     }
17950 
17951                     /* Here, didn't find a legal hex number.  Just add it from
17952                      * here to the next \n */
17953 
17954                     remaining -= len;
17955                     while (*(si_string + len) != '\n' && remaining > 0) {
17956                         remaining--;
17957                         len++;
17958                     }
17959                     if (*(si_string + len) == '\n') {
17960                         len++;
17961                         remaining--;
17962                     }
17963                     if (matches_string) {
17964                         sv_catpvn(matches_string, si_string, len - 1);
17965                     }
17966                     else {
17967                         matches_string = newSVpvn(si_string, len - 1);
17968                     }
17969                     si_string += len;
17970                     sv_catpvs(matches_string, " ");
17971                 } /* end of loop through the text */
17972 
17973                 assert(matches_string);
17974                 if (SvCUR(matches_string)) {  /* Get rid of trailing blank */
17975                     SvCUR_set(matches_string, SvCUR(matches_string) - 1);
17976                 }
17977             } /* end of has an 'si' but no swash */
17978 	}
17979 
17980         /* If we have a swash in place, its equivalent inversion list was above
17981          * placed into 'invlist'.  If not, this variable may contain a stored
17982          * inversion list which is information beyond what is in 'si' */
17983         if (invlist) {
17984 
17985             /* Again, if the caller doesn't want the output inversion list, put
17986              * everything in 'matches-string' */
17987             if (! output_invlist) {
17988                 if ( ! matches_string) {
17989                     matches_string = newSVpvs("\n");
17990                 }
17991                 sv_catsv(matches_string, invlist_contents(invlist,
17992                                                   TRUE /* traditional style */
17993                                                   ));
17994             }
17995             else if (! *output_invlist) {
17996                 *output_invlist = invlist_clone(invlist);
17997             }
17998             else {
17999                 _invlist_union(*output_invlist, invlist, output_invlist);
18000             }
18001         }
18002 
18003 	*listsvp = matches_string;
18004     }
18005 
18006     return sw;
18007 }
18008 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
18009 
18010 /* reg_skipcomment()
18011 
18012    Absorbs an /x style # comment from the input stream,
18013    returning a pointer to the first character beyond the comment, or if the
18014    comment terminates the pattern without anything following it, this returns
18015    one past the final character of the pattern (in other words, RExC_end) and
18016    sets the REG_RUN_ON_COMMENT_SEEN flag.
18017 
18018    Note it's the callers responsibility to ensure that we are
18019    actually in /x mode
18020 
18021 */
18022 
18023 PERL_STATIC_INLINE char*
18024 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
18025 {
18026     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
18027 
18028     assert(*p == '#');
18029 
18030     while (p < RExC_end) {
18031         if (*(++p) == '\n') {
18032             return p+1;
18033         }
18034     }
18035 
18036     /* we ran off the end of the pattern without ending the comment, so we have
18037      * to add an \n when wrapping */
18038     RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
18039     return p;
18040 }
18041 
18042 STATIC void
18043 S_skip_to_be_ignored_text(pTHX_ RExC_state_t *pRExC_state,
18044                                 char ** p,
18045                                 const bool force_to_xmod
18046                          )
18047 {
18048     /* If the text at the current parse position '*p' is a '(?#...)' comment,
18049      * or if we are under /x or 'force_to_xmod' is TRUE, and the text at '*p'
18050      * is /x whitespace, advance '*p' so that on exit it points to the first
18051      * byte past all such white space and comments */
18052 
18053     const bool use_xmod = force_to_xmod || (RExC_flags & RXf_PMf_EXTENDED);
18054 
18055     PERL_ARGS_ASSERT_SKIP_TO_BE_IGNORED_TEXT;
18056 
18057     assert( ! UTF || UTF8_IS_INVARIANT(**p) || UTF8_IS_START(**p));
18058 
18059     for (;;) {
18060 	if (RExC_end - (*p) >= 3
18061 	    && *(*p)     == '('
18062 	    && *(*p + 1) == '?'
18063 	    && *(*p + 2) == '#')
18064 	{
18065 	    while (*(*p) != ')') {
18066 		if ((*p) == RExC_end)
18067 		    FAIL("Sequence (?#... not terminated");
18068 		(*p)++;
18069 	    }
18070 	    (*p)++;
18071 	    continue;
18072 	}
18073 
18074 	if (use_xmod) {
18075             const char * save_p = *p;
18076             while ((*p) < RExC_end) {
18077                 STRLEN len;
18078                 if ((len = is_PATWS_safe((*p), RExC_end, UTF))) {
18079                     (*p) += len;
18080                 }
18081                 else if (*(*p) == '#') {
18082                     (*p) = reg_skipcomment(pRExC_state, (*p));
18083                 }
18084                 else {
18085                     break;
18086                 }
18087             }
18088             if (*p != save_p) {
18089                 continue;
18090             }
18091 	}
18092 
18093         break;
18094     }
18095 
18096     return;
18097 }
18098 
18099 /* nextchar()
18100 
18101    Advances the parse position by one byte, unless that byte is the beginning
18102    of a '(?#...)' style comment, or is /x whitespace and /x is in effect.  In
18103    those two cases, the parse position is advanced beyond all such comments and
18104    white space.
18105 
18106    This is the UTF, (?#...), and /x friendly way of saying RExC_parse++.
18107 */
18108 
18109 STATIC void
18110 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
18111 {
18112     PERL_ARGS_ASSERT_NEXTCHAR;
18113 
18114     if (RExC_parse < RExC_end) {
18115         assert(   ! UTF
18116                || UTF8_IS_INVARIANT(*RExC_parse)
18117                || UTF8_IS_START(*RExC_parse));
18118 
18119         RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
18120 
18121         skip_to_be_ignored_text(pRExC_state, &RExC_parse,
18122                                 FALSE /* Don't assume /x */ );
18123     }
18124 }
18125 
18126 STATIC regnode *
18127 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
18128 {
18129     /* Allocate a regnode for 'op' and returns it, with 'extra_size' extra
18130      * space.  In pass1, it aligns and increments RExC_size; in pass2,
18131      * RExC_emit */
18132 
18133     regnode * const ret = RExC_emit;
18134     GET_RE_DEBUG_FLAGS_DECL;
18135 
18136     PERL_ARGS_ASSERT_REGNODE_GUTS;
18137 
18138     assert(extra_size >= regarglen[op]);
18139 
18140     if (SIZE_ONLY) {
18141 	SIZE_ALIGN(RExC_size);
18142 	RExC_size += 1 + extra_size;
18143 	return(ret);
18144     }
18145     if (RExC_emit >= RExC_emit_bound)
18146         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
18147 		   op, (void*)RExC_emit, (void*)RExC_emit_bound);
18148 
18149     NODE_ALIGN_FILL(ret);
18150 #ifndef RE_TRACK_PATTERN_OFFSETS
18151     PERL_UNUSED_ARG(name);
18152 #else
18153     if (RExC_offsets) {         /* MJD */
18154 	MJD_OFFSET_DEBUG(
18155               ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
18156               name, __LINE__,
18157               PL_reg_name[op],
18158               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
18159 		? "Overwriting end of array!\n" : "OK",
18160               (UV)(RExC_emit - RExC_emit_start),
18161               (UV)(RExC_parse - RExC_start),
18162               (UV)RExC_offsets[0]));
18163 	Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
18164     }
18165 #endif
18166     return(ret);
18167 }
18168 
18169 /*
18170 - reg_node - emit a node
18171 */
18172 STATIC regnode *			/* Location. */
18173 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
18174 {
18175     regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg_node");
18176 
18177     PERL_ARGS_ASSERT_REG_NODE;
18178 
18179     assert(regarglen[op] == 0);
18180 
18181     if (PASS2) {
18182         regnode *ptr = ret;
18183         FILL_ADVANCE_NODE(ptr, op);
18184         RExC_emit = ptr;
18185     }
18186     return(ret);
18187 }
18188 
18189 /*
18190 - reganode - emit a node with an argument
18191 */
18192 STATIC regnode *			/* Location. */
18193 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
18194 {
18195     regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reganode");
18196 
18197     PERL_ARGS_ASSERT_REGANODE;
18198 
18199     assert(regarglen[op] == 1);
18200 
18201     if (PASS2) {
18202         regnode *ptr = ret;
18203         FILL_ADVANCE_NODE_ARG(ptr, op, arg);
18204         RExC_emit = ptr;
18205     }
18206     return(ret);
18207 }
18208 
18209 STATIC regnode *
18210 S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
18211 {
18212     /* emit a node with U32 and I32 arguments */
18213 
18214     regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg2Lanode");
18215 
18216     PERL_ARGS_ASSERT_REG2LANODE;
18217 
18218     assert(regarglen[op] == 2);
18219 
18220     if (PASS2) {
18221         regnode *ptr = ret;
18222         FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2);
18223         RExC_emit = ptr;
18224     }
18225     return(ret);
18226 }
18227 
18228 /*
18229 - reginsert - insert an operator in front of already-emitted operand
18230 *
18231 * Means relocating the operand.
18232 */
18233 STATIC void
18234 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
18235 {
18236     regnode *src;
18237     regnode *dst;
18238     regnode *place;
18239     const int offset = regarglen[(U8)op];
18240     const int size = NODE_STEP_REGNODE + offset;
18241     GET_RE_DEBUG_FLAGS_DECL;
18242 
18243     PERL_ARGS_ASSERT_REGINSERT;
18244     PERL_UNUSED_CONTEXT;
18245     PERL_UNUSED_ARG(depth);
18246 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
18247     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
18248     if (SIZE_ONLY) {
18249 	RExC_size += size;
18250 	return;
18251     }
18252     assert(!RExC_study_started); /* I believe we should never use reginsert once we have started
18253                                     studying. If this is wrong then we need to adjust RExC_recurse
18254                                     below like we do with RExC_open_parens/RExC_close_parens. */
18255     src = RExC_emit;
18256     RExC_emit += size;
18257     dst = RExC_emit;
18258     if (RExC_open_parens) {
18259         int paren;
18260         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
18261         /* remember that RExC_npar is rex->nparens + 1,
18262          * iow it is 1 more than the number of parens seen in
18263          * the pattern so far. */
18264         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
18265             /* note, RExC_open_parens[0] is the start of the
18266              * regex, it can't move. RExC_close_parens[0] is the end
18267              * of the regex, it *can* move. */
18268             if ( paren && RExC_open_parens[paren] >= opnd ) {
18269                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
18270                 RExC_open_parens[paren] += size;
18271             } else {
18272                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
18273             }
18274             if ( RExC_close_parens[paren] >= opnd ) {
18275                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
18276                 RExC_close_parens[paren] += size;
18277             } else {
18278                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
18279             }
18280         }
18281     }
18282     if (RExC_end_op)
18283         RExC_end_op += size;
18284 
18285     while (src > opnd) {
18286 	StructCopy(--src, --dst, regnode);
18287 #ifdef RE_TRACK_PATTERN_OFFSETS
18288         if (RExC_offsets) {     /* MJD 20010112 */
18289 	    MJD_OFFSET_DEBUG(
18290                  ("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
18291                   "reg_insert",
18292 		  __LINE__,
18293 		  PL_reg_name[op],
18294                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
18295 		    ? "Overwriting end of array!\n" : "OK",
18296                   (UV)(src - RExC_emit_start),
18297                   (UV)(dst - RExC_emit_start),
18298                   (UV)RExC_offsets[0]));
18299 	    Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
18300 	    Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
18301         }
18302 #endif
18303     }
18304 
18305 
18306     place = opnd;		/* Op node, where operand used to be. */
18307 #ifdef RE_TRACK_PATTERN_OFFSETS
18308     if (RExC_offsets) {         /* MJD */
18309 	MJD_OFFSET_DEBUG(
18310               ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
18311               "reginsert",
18312 	      __LINE__,
18313 	      PL_reg_name[op],
18314               (UV)(place - RExC_emit_start) > RExC_offsets[0]
18315               ? "Overwriting end of array!\n" : "OK",
18316               (UV)(place - RExC_emit_start),
18317               (UV)(RExC_parse - RExC_start),
18318               (UV)RExC_offsets[0]));
18319 	Set_Node_Offset(place, RExC_parse);
18320 	Set_Node_Length(place, 1);
18321     }
18322 #endif
18323     src = NEXTOPER(place);
18324     FILL_ADVANCE_NODE(place, op);
18325     Zero(src, offset, regnode);
18326 }
18327 
18328 /*
18329 - regtail - set the next-pointer at the end of a node chain of p to val.
18330 - SEE ALSO: regtail_study
18331 */
18332 STATIC void
18333 S_regtail(pTHX_ RExC_state_t * pRExC_state,
18334                 const regnode * const p,
18335                 const regnode * const val,
18336                 const U32 depth)
18337 {
18338     regnode *scan;
18339     GET_RE_DEBUG_FLAGS_DECL;
18340 
18341     PERL_ARGS_ASSERT_REGTAIL;
18342 #ifndef DEBUGGING
18343     PERL_UNUSED_ARG(depth);
18344 #endif
18345 
18346     if (SIZE_ONLY)
18347 	return;
18348 
18349     /* Find last node. */
18350     scan = (regnode *) p;
18351     for (;;) {
18352 	regnode * const temp = regnext(scan);
18353         DEBUG_PARSE_r({
18354             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
18355             regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
18356             Perl_re_printf( aTHX_  "~ %s (%d) %s %s\n",
18357                 SvPV_nolen_const(RExC_mysv), REG_NODE_NUM(scan),
18358                     (temp == NULL ? "->" : ""),
18359                     (temp == NULL ? PL_reg_name[OP(val)] : "")
18360             );
18361         });
18362         if (temp == NULL)
18363             break;
18364         scan = temp;
18365     }
18366 
18367     if (reg_off_by_arg[OP(scan)]) {
18368         ARG_SET(scan, val - scan);
18369     }
18370     else {
18371         NEXT_OFF(scan) = val - scan;
18372     }
18373 }
18374 
18375 #ifdef DEBUGGING
18376 /*
18377 - regtail_study - set the next-pointer at the end of a node chain of p to val.
18378 - Look for optimizable sequences at the same time.
18379 - currently only looks for EXACT chains.
18380 
18381 This is experimental code. The idea is to use this routine to perform
18382 in place optimizations on branches and groups as they are constructed,
18383 with the long term intention of removing optimization from study_chunk so
18384 that it is purely analytical.
18385 
18386 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
18387 to control which is which.
18388 
18389 */
18390 /* TODO: All four parms should be const */
18391 
18392 STATIC U8
18393 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p,
18394                       const regnode *val,U32 depth)
18395 {
18396     regnode *scan;
18397     U8 exact = PSEUDO;
18398 #ifdef EXPERIMENTAL_INPLACESCAN
18399     I32 min = 0;
18400 #endif
18401     GET_RE_DEBUG_FLAGS_DECL;
18402 
18403     PERL_ARGS_ASSERT_REGTAIL_STUDY;
18404 
18405 
18406     if (SIZE_ONLY)
18407         return exact;
18408 
18409     /* Find last node. */
18410 
18411     scan = p;
18412     for (;;) {
18413         regnode * const temp = regnext(scan);
18414 #ifdef EXPERIMENTAL_INPLACESCAN
18415         if (PL_regkind[OP(scan)] == EXACT) {
18416 	    bool unfolded_multi_char;	/* Unexamined in this routine */
18417             if (join_exact(pRExC_state, scan, &min,
18418                            &unfolded_multi_char, 1, val, depth+1))
18419                 return EXACT;
18420 	}
18421 #endif
18422         if ( exact ) {
18423             switch (OP(scan)) {
18424                 case EXACT:
18425                 case EXACTL:
18426                 case EXACTF:
18427                 case EXACTFA_NO_TRIE:
18428                 case EXACTFA:
18429                 case EXACTFU:
18430                 case EXACTFLU8:
18431                 case EXACTFU_SS:
18432                 case EXACTFL:
18433                         if( exact == PSEUDO )
18434                             exact= OP(scan);
18435                         else if ( exact != OP(scan) )
18436                             exact= 0;
18437                 case NOTHING:
18438                     break;
18439                 default:
18440                     exact= 0;
18441             }
18442         }
18443         DEBUG_PARSE_r({
18444             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
18445             regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
18446             Perl_re_printf( aTHX_  "~ %s (%d) -> %s\n",
18447                 SvPV_nolen_const(RExC_mysv),
18448                 REG_NODE_NUM(scan),
18449                 PL_reg_name[exact]);
18450         });
18451 	if (temp == NULL)
18452 	    break;
18453 	scan = temp;
18454     }
18455     DEBUG_PARSE_r({
18456         DEBUG_PARSE_MSG("");
18457         regprop(RExC_rx, RExC_mysv, val, NULL, pRExC_state);
18458         Perl_re_printf( aTHX_
18459                       "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
18460 		      SvPV_nolen_const(RExC_mysv),
18461 		      (IV)REG_NODE_NUM(val),
18462 		      (IV)(val - scan)
18463         );
18464     });
18465     if (reg_off_by_arg[OP(scan)]) {
18466 	ARG_SET(scan, val - scan);
18467     }
18468     else {
18469 	NEXT_OFF(scan) = val - scan;
18470     }
18471 
18472     return exact;
18473 }
18474 #endif
18475 
18476 /*
18477  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
18478  */
18479 #ifdef DEBUGGING
18480 
18481 static void
18482 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
18483 {
18484     int bit;
18485     int set=0;
18486 
18487     ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
18488 
18489     for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
18490         if (flags & (1<<bit)) {
18491             if (!set++ && lead)
18492                 Perl_re_printf( aTHX_  "%s",lead);
18493             Perl_re_printf( aTHX_  "%s ",PL_reg_intflags_name[bit]);
18494         }
18495     }
18496     if (lead)  {
18497         if (set)
18498             Perl_re_printf( aTHX_  "\n");
18499         else
18500             Perl_re_printf( aTHX_  "%s[none-set]\n",lead);
18501     }
18502 }
18503 
18504 static void
18505 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
18506 {
18507     int bit;
18508     int set=0;
18509     regex_charset cs;
18510 
18511     ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
18512 
18513     for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
18514         if (flags & (1<<bit)) {
18515 	    if ((1<<bit) & RXf_PMf_CHARSET) {	/* Output separately, below */
18516 		continue;
18517 	    }
18518             if (!set++ && lead)
18519                 Perl_re_printf( aTHX_  "%s",lead);
18520             Perl_re_printf( aTHX_  "%s ",PL_reg_extflags_name[bit]);
18521         }
18522     }
18523     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
18524             if (!set++ && lead) {
18525                 Perl_re_printf( aTHX_  "%s",lead);
18526             }
18527             switch (cs) {
18528                 case REGEX_UNICODE_CHARSET:
18529                     Perl_re_printf( aTHX_  "UNICODE");
18530                     break;
18531                 case REGEX_LOCALE_CHARSET:
18532                     Perl_re_printf( aTHX_  "LOCALE");
18533                     break;
18534                 case REGEX_ASCII_RESTRICTED_CHARSET:
18535                     Perl_re_printf( aTHX_  "ASCII-RESTRICTED");
18536                     break;
18537                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
18538                     Perl_re_printf( aTHX_  "ASCII-MORE_RESTRICTED");
18539                     break;
18540                 default:
18541                     Perl_re_printf( aTHX_  "UNKNOWN CHARACTER SET");
18542                     break;
18543             }
18544     }
18545     if (lead)  {
18546         if (set)
18547             Perl_re_printf( aTHX_  "\n");
18548         else
18549             Perl_re_printf( aTHX_  "%s[none-set]\n",lead);
18550     }
18551 }
18552 #endif
18553 
18554 void
18555 Perl_regdump(pTHX_ const regexp *r)
18556 {
18557 #ifdef DEBUGGING
18558     SV * const sv = sv_newmortal();
18559     SV *dsv= sv_newmortal();
18560     RXi_GET_DECL(r,ri);
18561     GET_RE_DEBUG_FLAGS_DECL;
18562 
18563     PERL_ARGS_ASSERT_REGDUMP;
18564 
18565     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
18566 
18567     /* Header fields of interest. */
18568     if (r->anchored_substr) {
18569 	RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
18570 	    RE_SV_DUMPLEN(r->anchored_substr), 30);
18571         Perl_re_printf( aTHX_
18572 		      "anchored %s%s at %"IVdf" ",
18573 		      s, RE_SV_TAIL(r->anchored_substr),
18574 		      (IV)r->anchored_offset);
18575     } else if (r->anchored_utf8) {
18576 	RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
18577 	    RE_SV_DUMPLEN(r->anchored_utf8), 30);
18578         Perl_re_printf( aTHX_
18579 		      "anchored utf8 %s%s at %"IVdf" ",
18580 		      s, RE_SV_TAIL(r->anchored_utf8),
18581 		      (IV)r->anchored_offset);
18582     }
18583     if (r->float_substr) {
18584 	RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
18585 	    RE_SV_DUMPLEN(r->float_substr), 30);
18586         Perl_re_printf( aTHX_
18587 		      "floating %s%s at %"IVdf"..%"UVuf" ",
18588 		      s, RE_SV_TAIL(r->float_substr),
18589 		      (IV)r->float_min_offset, (UV)r->float_max_offset);
18590     } else if (r->float_utf8) {
18591 	RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
18592 	    RE_SV_DUMPLEN(r->float_utf8), 30);
18593         Perl_re_printf( aTHX_
18594 		      "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
18595 		      s, RE_SV_TAIL(r->float_utf8),
18596 		      (IV)r->float_min_offset, (UV)r->float_max_offset);
18597     }
18598     if (r->check_substr || r->check_utf8)
18599         Perl_re_printf( aTHX_
18600 		      (const char *)
18601 		      (r->check_substr == r->float_substr
18602 		       && r->check_utf8 == r->float_utf8
18603 		       ? "(checking floating" : "(checking anchored"));
18604     if (r->intflags & PREGf_NOSCAN)
18605         Perl_re_printf( aTHX_  " noscan");
18606     if (r->extflags & RXf_CHECK_ALL)
18607         Perl_re_printf( aTHX_  " isall");
18608     if (r->check_substr || r->check_utf8)
18609         Perl_re_printf( aTHX_  ") ");
18610 
18611     if (ri->regstclass) {
18612         regprop(r, sv, ri->regstclass, NULL, NULL);
18613         Perl_re_printf( aTHX_  "stclass %s ", SvPVX_const(sv));
18614     }
18615     if (r->intflags & PREGf_ANCH) {
18616         Perl_re_printf( aTHX_  "anchored");
18617         if (r->intflags & PREGf_ANCH_MBOL)
18618             Perl_re_printf( aTHX_  "(MBOL)");
18619         if (r->intflags & PREGf_ANCH_SBOL)
18620             Perl_re_printf( aTHX_  "(SBOL)");
18621         if (r->intflags & PREGf_ANCH_GPOS)
18622             Perl_re_printf( aTHX_  "(GPOS)");
18623         Perl_re_printf( aTHX_ " ");
18624     }
18625     if (r->intflags & PREGf_GPOS_SEEN)
18626         Perl_re_printf( aTHX_  "GPOS:%"UVuf" ", (UV)r->gofs);
18627     if (r->intflags & PREGf_SKIP)
18628         Perl_re_printf( aTHX_  "plus ");
18629     if (r->intflags & PREGf_IMPLICIT)
18630         Perl_re_printf( aTHX_  "implicit ");
18631     Perl_re_printf( aTHX_  "minlen %"IVdf" ", (IV)r->minlen);
18632     if (r->extflags & RXf_EVAL_SEEN)
18633         Perl_re_printf( aTHX_  "with eval ");
18634     Perl_re_printf( aTHX_  "\n");
18635     DEBUG_FLAGS_r({
18636         regdump_extflags("r->extflags: ",r->extflags);
18637         regdump_intflags("r->intflags: ",r->intflags);
18638     });
18639 #else
18640     PERL_ARGS_ASSERT_REGDUMP;
18641     PERL_UNUSED_CONTEXT;
18642     PERL_UNUSED_ARG(r);
18643 #endif	/* DEBUGGING */
18644 }
18645 
18646 /* Should be synchronized with ANYOF_ #defines in regcomp.h */
18647 #ifdef DEBUGGING
18648 
18649 #  if   _CC_WORDCHAR != 0 || _CC_DIGIT != 1        || _CC_ALPHA != 2    \
18650      || _CC_LOWER != 3    || _CC_UPPER != 4        || _CC_PUNCT != 5    \
18651      || _CC_PRINT != 6    || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8    \
18652      || _CC_CASED != 9    || _CC_SPACE != 10       || _CC_BLANK != 11   \
18653      || _CC_XDIGIT != 12  || _CC_CNTRL != 13       || _CC_ASCII != 14   \
18654      || _CC_VERTSPACE != 15
18655 #   error Need to adjust order of anyofs[]
18656 #  endif
18657 static const char * const anyofs[] = {
18658     "\\w",
18659     "\\W",
18660     "\\d",
18661     "\\D",
18662     "[:alpha:]",
18663     "[:^alpha:]",
18664     "[:lower:]",
18665     "[:^lower:]",
18666     "[:upper:]",
18667     "[:^upper:]",
18668     "[:punct:]",
18669     "[:^punct:]",
18670     "[:print:]",
18671     "[:^print:]",
18672     "[:alnum:]",
18673     "[:^alnum:]",
18674     "[:graph:]",
18675     "[:^graph:]",
18676     "[:cased:]",
18677     "[:^cased:]",
18678     "\\s",
18679     "\\S",
18680     "[:blank:]",
18681     "[:^blank:]",
18682     "[:xdigit:]",
18683     "[:^xdigit:]",
18684     "[:cntrl:]",
18685     "[:^cntrl:]",
18686     "[:ascii:]",
18687     "[:^ascii:]",
18688     "\\v",
18689     "\\V"
18690 };
18691 #endif
18692 
18693 /*
18694 - regprop - printable representation of opcode, with run time support
18695 */
18696 
18697 void
18698 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state)
18699 {
18700 #ifdef DEBUGGING
18701     int k;
18702     RXi_GET_DECL(prog,progi);
18703     GET_RE_DEBUG_FLAGS_DECL;
18704 
18705     PERL_ARGS_ASSERT_REGPROP;
18706 
18707     sv_setpvn(sv, "", 0);
18708 
18709     if (OP(o) > REGNODE_MAX)		/* regnode.type is unsigned */
18710 	/* It would be nice to FAIL() here, but this may be called from
18711 	   regexec.c, and it would be hard to supply pRExC_state. */
18712 	Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
18713                                               (int)OP(o), (int)REGNODE_MAX);
18714     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
18715 
18716     k = PL_regkind[OP(o)];
18717 
18718     if (k == EXACT) {
18719 	sv_catpvs(sv, " ");
18720 	/* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
18721 	 * is a crude hack but it may be the best for now since
18722 	 * we have no flag "this EXACTish node was UTF-8"
18723 	 * --jhi */
18724 	pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
18725 		  PERL_PV_ESCAPE_UNI_DETECT |
18726 		  PERL_PV_ESCAPE_NONASCII   |
18727 		  PERL_PV_PRETTY_ELLIPSES   |
18728 		  PERL_PV_PRETTY_LTGT       |
18729 		  PERL_PV_PRETTY_NOCLEAR
18730 		  );
18731     } else if (k == TRIE) {
18732 	/* print the details of the trie in dumpuntil instead, as
18733 	 * progi->data isn't available here */
18734         const char op = OP(o);
18735         const U32 n = ARG(o);
18736         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
18737                (reg_ac_data *)progi->data->data[n] :
18738                NULL;
18739         const reg_trie_data * const trie
18740 	    = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
18741 
18742         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
18743         DEBUG_TRIE_COMPILE_r(
18744           Perl_sv_catpvf(aTHX_ sv,
18745             "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
18746             (UV)trie->startstate,
18747             (IV)trie->statecount-1, /* -1 because of the unused 0 element */
18748             (UV)trie->wordcount,
18749             (UV)trie->minlen,
18750             (UV)trie->maxlen,
18751             (UV)TRIE_CHARCOUNT(trie),
18752             (UV)trie->uniquecharcount
18753           );
18754         );
18755         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
18756             sv_catpvs(sv, "[");
18757             (void) put_charclass_bitmap_innards(sv,
18758                                                 ((IS_ANYOF_TRIE(op))
18759                                                  ? ANYOF_BITMAP(o)
18760                                                  : TRIE_BITMAP(trie)),
18761                                                 NULL,
18762                                                 NULL,
18763                                                 NULL,
18764                                                 FALSE
18765                                                );
18766             sv_catpvs(sv, "]");
18767         }
18768 
18769     } else if (k == CURLY) {
18770         U32 lo = ARG1(o), hi = ARG2(o);
18771 	if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
18772 	    Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
18773         Perl_sv_catpvf(aTHX_ sv, "{%u,", (unsigned) lo);
18774         if (hi == REG_INFTY)
18775             sv_catpvs(sv, "INFTY");
18776         else
18777             Perl_sv_catpvf(aTHX_ sv, "%u", (unsigned) hi);
18778         sv_catpvs(sv, "}");
18779     }
18780     else if (k == WHILEM && o->flags)			/* Ordinal/of */
18781 	Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
18782     else if (k == REF || k == OPEN || k == CLOSE
18783              || k == GROUPP || OP(o)==ACCEPT)
18784     {
18785         AV *name_list= NULL;
18786         U32 parno= OP(o) == ACCEPT ? (U32)ARG2L(o) : ARG(o);
18787         Perl_sv_catpvf(aTHX_ sv, "%"UVuf, (UV)parno);        /* Parenth number */
18788 	if ( RXp_PAREN_NAMES(prog) ) {
18789             name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
18790         } else if ( pRExC_state ) {
18791             name_list= RExC_paren_name_list;
18792         }
18793         if (name_list) {
18794             if ( k != REF || (OP(o) < NREF)) {
18795                 SV **name= av_fetch(name_list, parno, 0 );
18796 	        if (name)
18797 	            Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
18798             }
18799             else {
18800                 SV *sv_dat= MUTABLE_SV(progi->data->data[ parno ]);
18801                 I32 *nums=(I32*)SvPVX(sv_dat);
18802                 SV **name= av_fetch(name_list, nums[0], 0 );
18803                 I32 n;
18804                 if (name) {
18805                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
18806                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
18807 			   	    (n ? "," : ""), (IV)nums[n]);
18808                     }
18809                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
18810                 }
18811             }
18812         }
18813         if ( k == REF && reginfo) {
18814             U32 n = ARG(o);  /* which paren pair */
18815             I32 ln = prog->offs[n].start;
18816             if (prog->lastparen < n || ln == -1)
18817                 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
18818             else if (ln == prog->offs[n].end)
18819                 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
18820             else {
18821                 const char *s = reginfo->strbeg + ln;
18822                 Perl_sv_catpvf(aTHX_ sv, ": ");
18823                 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
18824                     PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
18825             }
18826         }
18827     } else if (k == GOSUB) {
18828         AV *name_list= NULL;
18829         if ( RXp_PAREN_NAMES(prog) ) {
18830             name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
18831         } else if ( pRExC_state ) {
18832             name_list= RExC_paren_name_list;
18833         }
18834 
18835         /* Paren and offset */
18836         Perl_sv_catpvf(aTHX_ sv, "%d[%+d:%d]", (int)ARG(o),(int)ARG2L(o),
18837                 (int)((o + (int)ARG2L(o)) - progi->program) );
18838         if (name_list) {
18839             SV **name= av_fetch(name_list, ARG(o), 0 );
18840             if (name)
18841                 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
18842         }
18843     }
18844     else if (k == LOGICAL)
18845         /* 2: embedded, otherwise 1 */
18846 	Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
18847     else if (k == ANYOF) {
18848 	const U8 flags = ANYOF_FLAGS(o);
18849         bool do_sep = FALSE;    /* Do we need to separate various components of
18850                                    the output? */
18851         /* Set if there is still an unresolved user-defined property */
18852         SV *unresolved                = NULL;
18853 
18854         /* Things that are ignored except when the runtime locale is UTF-8 */
18855         SV *only_utf8_locale_invlist = NULL;
18856 
18857         /* Code points that don't fit in the bitmap */
18858         SV *nonbitmap_invlist = NULL;
18859 
18860         /* And things that aren't in the bitmap, but are small enough to be */
18861         SV* bitmap_range_not_in_bitmap = NULL;
18862 
18863         const bool inverted = flags & ANYOF_INVERT;
18864 
18865 	if (OP(o) == ANYOFL) {
18866             if (ANYOFL_UTF8_LOCALE_REQD(flags)) {
18867                 sv_catpvs(sv, "{utf8-locale-reqd}");
18868             }
18869             if (flags & ANYOFL_FOLD) {
18870                 sv_catpvs(sv, "{i}");
18871             }
18872         }
18873 
18874         /* If there is stuff outside the bitmap, get it */
18875         if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) {
18876             (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
18877                                                 &unresolved,
18878                                                 &only_utf8_locale_invlist,
18879                                                 &nonbitmap_invlist);
18880             /* The non-bitmap data may contain stuff that could fit in the
18881              * bitmap.  This could come from a user-defined property being
18882              * finally resolved when this call was done; or much more likely
18883              * because there are matches that require UTF-8 to be valid, and so
18884              * aren't in the bitmap.  This is teased apart later */
18885             _invlist_intersection(nonbitmap_invlist,
18886                                   PL_InBitmap,
18887                                   &bitmap_range_not_in_bitmap);
18888             /* Leave just the things that don't fit into the bitmap */
18889             _invlist_subtract(nonbitmap_invlist,
18890                               PL_InBitmap,
18891                               &nonbitmap_invlist);
18892         }
18893 
18894         /* Obey this flag to add all above-the-bitmap code points */
18895         if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
18896             nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
18897                                                       NUM_ANYOF_CODE_POINTS,
18898                                                       UV_MAX);
18899         }
18900 
18901         /* Ready to start outputting.  First, the initial left bracket */
18902 	Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
18903 
18904         /* Then all the things that could fit in the bitmap */
18905         do_sep = put_charclass_bitmap_innards(sv,
18906                                               ANYOF_BITMAP(o),
18907                                               bitmap_range_not_in_bitmap,
18908                                               only_utf8_locale_invlist,
18909                                               o,
18910 
18911                                               /* Can't try inverting for a
18912                                                * better display if there are
18913                                                * things that haven't been
18914                                                * resolved */
18915                                               unresolved != NULL);
18916         SvREFCNT_dec(bitmap_range_not_in_bitmap);
18917 
18918         /* If there are user-defined properties which haven't been defined yet,
18919          * output them.  If the result is not to be inverted, it is clearest to
18920          * output them in a separate [] from the bitmap range stuff.  If the
18921          * result is to be complemented, we have to show everything in one [],
18922          * as the inversion applies to the whole thing.  Use {braces} to
18923          * separate them from anything in the bitmap and anything above the
18924          * bitmap. */
18925         if (unresolved) {
18926             if (inverted) {
18927                 if (! do_sep) { /* If didn't output anything in the bitmap */
18928                     sv_catpvs(sv, "^");
18929                 }
18930                 sv_catpvs(sv, "{");
18931             }
18932             else if (do_sep) {
18933                 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
18934             }
18935             sv_catsv(sv, unresolved);
18936             if (inverted) {
18937                 sv_catpvs(sv, "}");
18938             }
18939             do_sep = ! inverted;
18940         }
18941 
18942         /* And, finally, add the above-the-bitmap stuff */
18943         if (nonbitmap_invlist && _invlist_len(nonbitmap_invlist)) {
18944             SV* contents;
18945 
18946             /* See if truncation size is overridden */
18947             const STRLEN dump_len = (PL_dump_re_max_len)
18948                                     ? PL_dump_re_max_len
18949                                     : 256;
18950 
18951             /* This is output in a separate [] */
18952             if (do_sep) {
18953                 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
18954             }
18955 
18956             /* And, for easy of understanding, it is shown in the
18957              * uncomplemented form if possible.  The one exception being if
18958              * there are unresolved items, where the inversion has to be
18959              * delayed until runtime */
18960             if (inverted && ! unresolved) {
18961                 _invlist_invert(nonbitmap_invlist);
18962                 _invlist_subtract(nonbitmap_invlist, PL_InBitmap, &nonbitmap_invlist);
18963             }
18964 
18965             contents = invlist_contents(nonbitmap_invlist,
18966                                         FALSE /* output suitable for catsv */
18967                                        );
18968 
18969             /* If the output is shorter than the permissible maximum, just do it. */
18970             if (SvCUR(contents) <= dump_len) {
18971                 sv_catsv(sv, contents);
18972             }
18973             else {
18974                 const char * contents_string = SvPVX(contents);
18975                 STRLEN i = dump_len;
18976 
18977                 /* Otherwise, start at the permissible max and work back to the
18978                  * first break possibility */
18979                 while (i > 0 && contents_string[i] != ' ') {
18980                     i--;
18981                 }
18982                 if (i == 0) {       /* Fail-safe.  Use the max if we couldn't
18983                                        find a legal break */
18984                     i = dump_len;
18985                 }
18986 
18987                 sv_catpvn(sv, contents_string, i);
18988                 sv_catpvs(sv, "...");
18989             }
18990 
18991             SvREFCNT_dec_NN(contents);
18992             SvREFCNT_dec_NN(nonbitmap_invlist);
18993         }
18994 
18995         /* And finally the matching, closing ']' */
18996 	Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
18997 
18998         SvREFCNT_dec(unresolved);
18999     }
19000     else if (k == POSIXD || k == NPOSIXD) {
19001         U8 index = FLAGS(o) * 2;
19002         if (index < C_ARRAY_LENGTH(anyofs)) {
19003             if (*anyofs[index] != '[')  {
19004                 sv_catpv(sv, "[");
19005             }
19006             sv_catpv(sv, anyofs[index]);
19007             if (*anyofs[index] != '[')  {
19008                 sv_catpv(sv, "]");
19009             }
19010         }
19011         else {
19012             Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
19013         }
19014     }
19015     else if (k == BOUND || k == NBOUND) {
19016         /* Must be synced with order of 'bound_type' in regcomp.h */
19017         const char * const bounds[] = {
19018             "",      /* Traditional */
19019             "{gcb}",
19020             "{lb}",
19021             "{sb}",
19022             "{wb}"
19023         };
19024         assert(FLAGS(o) < C_ARRAY_LENGTH(bounds));
19025         sv_catpv(sv, bounds[FLAGS(o)]);
19026     }
19027     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
19028 	Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
19029     else if (OP(o) == SBOL)
19030         Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
19031 
19032     /* add on the verb argument if there is one */
19033     if ( ( k == VERB || OP(o) == ACCEPT || OP(o) == OPFAIL ) && o->flags) {
19034         Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
19035                        SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
19036     }
19037 #else
19038     PERL_UNUSED_CONTEXT;
19039     PERL_UNUSED_ARG(sv);
19040     PERL_UNUSED_ARG(o);
19041     PERL_UNUSED_ARG(prog);
19042     PERL_UNUSED_ARG(reginfo);
19043     PERL_UNUSED_ARG(pRExC_state);
19044 #endif	/* DEBUGGING */
19045 }
19046 
19047 
19048 
19049 SV *
19050 Perl_re_intuit_string(pTHX_ REGEXP * const r)
19051 {				/* Assume that RE_INTUIT is set */
19052     struct regexp *const prog = ReANY(r);
19053     GET_RE_DEBUG_FLAGS_DECL;
19054 
19055     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
19056     PERL_UNUSED_CONTEXT;
19057 
19058     DEBUG_COMPILE_r(
19059 	{
19060 	    const char * const s = SvPV_nolen_const(RX_UTF8(r)
19061 		      ? prog->check_utf8 : prog->check_substr);
19062 
19063 	    if (!PL_colorset) reginitcolors();
19064             Perl_re_printf( aTHX_
19065 		      "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
19066 		      PL_colors[4],
19067 		      RX_UTF8(r) ? "utf8 " : "",
19068 		      PL_colors[5],PL_colors[0],
19069 		      s,
19070 		      PL_colors[1],
19071 		      (strlen(s) > 60 ? "..." : ""));
19072 	} );
19073 
19074     /* use UTF8 check substring if regexp pattern itself is in UTF8 */
19075     return RX_UTF8(r) ? prog->check_utf8 : prog->check_substr;
19076 }
19077 
19078 /*
19079    pregfree()
19080 
19081    handles refcounting and freeing the perl core regexp structure. When
19082    it is necessary to actually free the structure the first thing it
19083    does is call the 'free' method of the regexp_engine associated to
19084    the regexp, allowing the handling of the void *pprivate; member
19085    first. (This routine is not overridable by extensions, which is why
19086    the extensions free is called first.)
19087 
19088    See regdupe and regdupe_internal if you change anything here.
19089 */
19090 #ifndef PERL_IN_XSUB_RE
19091 void
19092 Perl_pregfree(pTHX_ REGEXP *r)
19093 {
19094     SvREFCNT_dec(r);
19095 }
19096 
19097 void
19098 Perl_pregfree2(pTHX_ REGEXP *rx)
19099 {
19100     struct regexp *const r = ReANY(rx);
19101     GET_RE_DEBUG_FLAGS_DECL;
19102 
19103     PERL_ARGS_ASSERT_PREGFREE2;
19104 
19105     if (r->mother_re) {
19106         ReREFCNT_dec(r->mother_re);
19107     } else {
19108         CALLREGFREE_PVT(rx); /* free the private data */
19109         SvREFCNT_dec(RXp_PAREN_NAMES(r));
19110 	Safefree(r->xpv_len_u.xpvlenu_pv);
19111     }
19112     if (r->substrs) {
19113         SvREFCNT_dec(r->anchored_substr);
19114         SvREFCNT_dec(r->anchored_utf8);
19115         SvREFCNT_dec(r->float_substr);
19116         SvREFCNT_dec(r->float_utf8);
19117 	Safefree(r->substrs);
19118     }
19119     RX_MATCH_COPY_FREE(rx);
19120 #ifdef PERL_ANY_COW
19121     SvREFCNT_dec(r->saved_copy);
19122 #endif
19123     Safefree(r->offs);
19124     SvREFCNT_dec(r->qr_anoncv);
19125     if (r->recurse_locinput)
19126         Safefree(r->recurse_locinput);
19127     rx->sv_u.svu_rx = 0;
19128 }
19129 
19130 /*  reg_temp_copy()
19131 
19132     This is a hacky workaround to the structural issue of match results
19133     being stored in the regexp structure which is in turn stored in
19134     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
19135     could be PL_curpm in multiple contexts, and could require multiple
19136     result sets being associated with the pattern simultaneously, such
19137     as when doing a recursive match with (??{$qr})
19138 
19139     The solution is to make a lightweight copy of the regexp structure
19140     when a qr// is returned from the code executed by (??{$qr}) this
19141     lightweight copy doesn't actually own any of its data except for
19142     the starp/end and the actual regexp structure itself.
19143 
19144 */
19145 
19146 
19147 REGEXP *
19148 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
19149 {
19150     struct regexp *ret;
19151     struct regexp *const r = ReANY(rx);
19152     const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
19153 
19154     PERL_ARGS_ASSERT_REG_TEMP_COPY;
19155 
19156     if (!ret_x)
19157 	ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
19158     else {
19159 	SvOK_off((SV *)ret_x);
19160 	if (islv) {
19161 	    /* For PVLVs, SvANY points to the xpvlv body while sv_u points
19162 	       to the regexp.  (For SVt_REGEXPs, sv_upgrade has already
19163 	       made both spots point to the same regexp body.) */
19164 	    REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
19165 	    assert(!SvPVX(ret_x));
19166 	    ret_x->sv_u.svu_rx = temp->sv_any;
19167 	    temp->sv_any = NULL;
19168 	    SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
19169 	    SvREFCNT_dec_NN(temp);
19170 	    /* SvCUR still resides in the xpvlv struct, so the regexp copy-
19171 	       ing below will not set it. */
19172 	    SvCUR_set(ret_x, SvCUR(rx));
19173 	}
19174     }
19175     /* This ensures that SvTHINKFIRST(sv) is true, and hence that
19176        sv_force_normal(sv) is called.  */
19177     SvFAKE_on(ret_x);
19178     ret = ReANY(ret_x);
19179 
19180     SvFLAGS(ret_x) |= SvUTF8(rx);
19181     /* We share the same string buffer as the original regexp, on which we
19182        hold a reference count, incremented when mother_re is set below.
19183        The string pointer is copied here, being part of the regexp struct.
19184      */
19185     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
19186 	   sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
19187     if (r->offs) {
19188         const I32 npar = r->nparens+1;
19189         Newx(ret->offs, npar, regexp_paren_pair);
19190         Copy(r->offs, ret->offs, npar, regexp_paren_pair);
19191     }
19192     if (r->substrs) {
19193         Newx(ret->substrs, 1, struct reg_substr_data);
19194 	StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
19195 
19196 	SvREFCNT_inc_void(ret->anchored_substr);
19197 	SvREFCNT_inc_void(ret->anchored_utf8);
19198 	SvREFCNT_inc_void(ret->float_substr);
19199 	SvREFCNT_inc_void(ret->float_utf8);
19200 
19201 	/* check_substr and check_utf8, if non-NULL, point to either their
19202 	   anchored or float namesakes, and don't hold a second reference.  */
19203     }
19204     RX_MATCH_COPIED_off(ret_x);
19205 #ifdef PERL_ANY_COW
19206     ret->saved_copy = NULL;
19207 #endif
19208     ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
19209     SvREFCNT_inc_void(ret->qr_anoncv);
19210     if (r->recurse_locinput)
19211         Newxz(ret->recurse_locinput,r->nparens + 1,char *);
19212 
19213     return ret_x;
19214 }
19215 #endif
19216 
19217 /* regfree_internal()
19218 
19219    Free the private data in a regexp. This is overloadable by
19220    extensions. Perl takes care of the regexp structure in pregfree(),
19221    this covers the *pprivate pointer which technically perl doesn't
19222    know about, however of course we have to handle the
19223    regexp_internal structure when no extension is in use.
19224 
19225    Note this is called before freeing anything in the regexp
19226    structure.
19227  */
19228 
19229 void
19230 Perl_regfree_internal(pTHX_ REGEXP * const rx)
19231 {
19232     struct regexp *const r = ReANY(rx);
19233     RXi_GET_DECL(r,ri);
19234     GET_RE_DEBUG_FLAGS_DECL;
19235 
19236     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
19237 
19238     DEBUG_COMPILE_r({
19239 	if (!PL_colorset)
19240 	    reginitcolors();
19241 	{
19242 	    SV *dsv= sv_newmortal();
19243             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
19244                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
19245             Perl_re_printf( aTHX_ "%sFreeing REx:%s %s\n",
19246                 PL_colors[4],PL_colors[5],s);
19247         }
19248     });
19249 #ifdef RE_TRACK_PATTERN_OFFSETS
19250     if (ri->u.offsets)
19251         Safefree(ri->u.offsets);             /* 20010421 MJD */
19252 #endif
19253     if (ri->code_blocks) {
19254 	int n;
19255 	for (n = 0; n < ri->num_code_blocks; n++)
19256 	    SvREFCNT_dec(ri->code_blocks[n].src_regex);
19257 	Safefree(ri->code_blocks);
19258     }
19259 
19260     if (ri->data) {
19261 	int n = ri->data->count;
19262 
19263 	while (--n >= 0) {
19264           /* If you add a ->what type here, update the comment in regcomp.h */
19265 	    switch (ri->data->what[n]) {
19266 	    case 'a':
19267 	    case 'r':
19268 	    case 's':
19269 	    case 'S':
19270 	    case 'u':
19271 		SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
19272 		break;
19273 	    case 'f':
19274 		Safefree(ri->data->data[n]);
19275 		break;
19276 	    case 'l':
19277 	    case 'L':
19278 	        break;
19279             case 'T':
19280                 { /* Aho Corasick add-on structure for a trie node.
19281                      Used in stclass optimization only */
19282                     U32 refcount;
19283                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
19284 #ifdef USE_ITHREADS
19285                     dVAR;
19286 #endif
19287                     OP_REFCNT_LOCK;
19288                     refcount = --aho->refcount;
19289                     OP_REFCNT_UNLOCK;
19290                     if ( !refcount ) {
19291                         PerlMemShared_free(aho->states);
19292                         PerlMemShared_free(aho->fail);
19293 			 /* do this last!!!! */
19294                         PerlMemShared_free(ri->data->data[n]);
19295                         /* we should only ever get called once, so
19296                          * assert as much, and also guard the free
19297                          * which /might/ happen twice. At the least
19298                          * it will make code anlyzers happy and it
19299                          * doesn't cost much. - Yves */
19300                         assert(ri->regstclass);
19301                         if (ri->regstclass) {
19302                             PerlMemShared_free(ri->regstclass);
19303                             ri->regstclass = 0;
19304                         }
19305                     }
19306                 }
19307                 break;
19308 	    case 't':
19309 	        {
19310 	            /* trie structure. */
19311 	            U32 refcount;
19312 	            reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
19313 #ifdef USE_ITHREADS
19314                     dVAR;
19315 #endif
19316                     OP_REFCNT_LOCK;
19317                     refcount = --trie->refcount;
19318                     OP_REFCNT_UNLOCK;
19319                     if ( !refcount ) {
19320                         PerlMemShared_free(trie->charmap);
19321                         PerlMemShared_free(trie->states);
19322                         PerlMemShared_free(trie->trans);
19323                         if (trie->bitmap)
19324                             PerlMemShared_free(trie->bitmap);
19325                         if (trie->jump)
19326                             PerlMemShared_free(trie->jump);
19327 			PerlMemShared_free(trie->wordinfo);
19328                         /* do this last!!!! */
19329                         PerlMemShared_free(ri->data->data[n]);
19330 		    }
19331 		}
19332 		break;
19333 	    default:
19334 		Perl_croak(aTHX_ "panic: regfree data code '%c'",
19335                                                     ri->data->what[n]);
19336 	    }
19337 	}
19338 	Safefree(ri->data->what);
19339 	Safefree(ri->data);
19340     }
19341 
19342     Safefree(ri);
19343 }
19344 
19345 #define av_dup_inc(s,t)	MUTABLE_AV(sv_dup_inc((const SV *)s,t))
19346 #define hv_dup_inc(s,t)	MUTABLE_HV(sv_dup_inc((const SV *)s,t))
19347 #define SAVEPVN(p,n)	((p) ? savepvn(p,n) : NULL)
19348 
19349 /*
19350    re_dup_guts - duplicate a regexp.
19351 
19352    This routine is expected to clone a given regexp structure. It is only
19353    compiled under USE_ITHREADS.
19354 
19355    After all of the core data stored in struct regexp is duplicated
19356    the regexp_engine.dupe method is used to copy any private data
19357    stored in the *pprivate pointer. This allows extensions to handle
19358    any duplication it needs to do.
19359 
19360    See pregfree() and regfree_internal() if you change anything here.
19361 */
19362 #if defined(USE_ITHREADS)
19363 #ifndef PERL_IN_XSUB_RE
19364 void
19365 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
19366 {
19367     dVAR;
19368     I32 npar;
19369     const struct regexp *r = ReANY(sstr);
19370     struct regexp *ret = ReANY(dstr);
19371 
19372     PERL_ARGS_ASSERT_RE_DUP_GUTS;
19373 
19374     npar = r->nparens+1;
19375     Newx(ret->offs, npar, regexp_paren_pair);
19376     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
19377 
19378     if (ret->substrs) {
19379 	/* Do it this way to avoid reading from *r after the StructCopy().
19380 	   That way, if any of the sv_dup_inc()s dislodge *r from the L1
19381 	   cache, it doesn't matter.  */
19382 	const bool anchored = r->check_substr
19383 	    ? r->check_substr == r->anchored_substr
19384 	    : r->check_utf8 == r->anchored_utf8;
19385         Newx(ret->substrs, 1, struct reg_substr_data);
19386 	StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
19387 
19388 	ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
19389 	ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
19390 	ret->float_substr = sv_dup_inc(ret->float_substr, param);
19391 	ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
19392 
19393 	/* check_substr and check_utf8, if non-NULL, point to either their
19394 	   anchored or float namesakes, and don't hold a second reference.  */
19395 
19396 	if (ret->check_substr) {
19397 	    if (anchored) {
19398 		assert(r->check_utf8 == r->anchored_utf8);
19399 		ret->check_substr = ret->anchored_substr;
19400 		ret->check_utf8 = ret->anchored_utf8;
19401 	    } else {
19402 		assert(r->check_substr == r->float_substr);
19403 		assert(r->check_utf8 == r->float_utf8);
19404 		ret->check_substr = ret->float_substr;
19405 		ret->check_utf8 = ret->float_utf8;
19406 	    }
19407 	} else if (ret->check_utf8) {
19408 	    if (anchored) {
19409 		ret->check_utf8 = ret->anchored_utf8;
19410 	    } else {
19411 		ret->check_utf8 = ret->float_utf8;
19412 	    }
19413 	}
19414     }
19415 
19416     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
19417     ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
19418     if (r->recurse_locinput)
19419         Newxz(ret->recurse_locinput,r->nparens + 1,char *);
19420 
19421     if (ret->pprivate)
19422 	RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
19423 
19424     if (RX_MATCH_COPIED(dstr))
19425 	ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
19426     else
19427 	ret->subbeg = NULL;
19428 #ifdef PERL_ANY_COW
19429     ret->saved_copy = NULL;
19430 #endif
19431 
19432     /* Whether mother_re be set or no, we need to copy the string.  We
19433        cannot refrain from copying it when the storage points directly to
19434        our mother regexp, because that's
19435 	       1: a buffer in a different thread
19436 	       2: something we no longer hold a reference on
19437 	       so we need to copy it locally.  */
19438     RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
19439     ret->mother_re   = NULL;
19440 }
19441 #endif /* PERL_IN_XSUB_RE */
19442 
19443 /*
19444    regdupe_internal()
19445 
19446    This is the internal complement to regdupe() which is used to copy
19447    the structure pointed to by the *pprivate pointer in the regexp.
19448    This is the core version of the extension overridable cloning hook.
19449    The regexp structure being duplicated will be copied by perl prior
19450    to this and will be provided as the regexp *r argument, however
19451    with the /old/ structures pprivate pointer value. Thus this routine
19452    may override any copying normally done by perl.
19453 
19454    It returns a pointer to the new regexp_internal structure.
19455 */
19456 
19457 void *
19458 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
19459 {
19460     dVAR;
19461     struct regexp *const r = ReANY(rx);
19462     regexp_internal *reti;
19463     int len;
19464     RXi_GET_DECL(r,ri);
19465 
19466     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
19467 
19468     len = ProgLen(ri);
19469 
19470     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
19471           char, regexp_internal);
19472     Copy(ri->program, reti->program, len+1, regnode);
19473 
19474 
19475     reti->num_code_blocks = ri->num_code_blocks;
19476     if (ri->code_blocks) {
19477 	int n;
19478 	Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
19479 		struct reg_code_block);
19480 	Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
19481 		struct reg_code_block);
19482 	for (n = 0; n < ri->num_code_blocks; n++)
19483 	     reti->code_blocks[n].src_regex = (REGEXP*)
19484 		    sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
19485     }
19486     else
19487 	reti->code_blocks = NULL;
19488 
19489     reti->regstclass = NULL;
19490 
19491     if (ri->data) {
19492 	struct reg_data *d;
19493         const int count = ri->data->count;
19494 	int i;
19495 
19496 	Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
19497 		char, struct reg_data);
19498 	Newx(d->what, count, U8);
19499 
19500 	d->count = count;
19501 	for (i = 0; i < count; i++) {
19502 	    d->what[i] = ri->data->what[i];
19503 	    switch (d->what[i]) {
19504 	        /* see also regcomp.h and regfree_internal() */
19505 	    case 'a': /* actually an AV, but the dup function is identical.  */
19506 	    case 'r':
19507 	    case 's':
19508 	    case 'S':
19509 	    case 'u': /* actually an HV, but the dup function is identical.  */
19510 		d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
19511 		break;
19512 	    case 'f':
19513 		/* This is cheating. */
19514 		Newx(d->data[i], 1, regnode_ssc);
19515 		StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
19516 		reti->regstclass = (regnode*)d->data[i];
19517 		break;
19518 	    case 'T':
19519 		/* Trie stclasses are readonly and can thus be shared
19520 		 * without duplication. We free the stclass in pregfree
19521 		 * when the corresponding reg_ac_data struct is freed.
19522 		 */
19523 		reti->regstclass= ri->regstclass;
19524 		/* FALLTHROUGH */
19525 	    case 't':
19526 		OP_REFCNT_LOCK;
19527 		((reg_trie_data*)ri->data->data[i])->refcount++;
19528 		OP_REFCNT_UNLOCK;
19529 		/* FALLTHROUGH */
19530 	    case 'l':
19531 	    case 'L':
19532 		d->data[i] = ri->data->data[i];
19533 		break;
19534             default:
19535                 Perl_croak(aTHX_ "panic: re_dup_guts unknown data code '%c'",
19536                                                            ri->data->what[i]);
19537 	    }
19538 	}
19539 
19540 	reti->data = d;
19541     }
19542     else
19543 	reti->data = NULL;
19544 
19545     reti->name_list_idx = ri->name_list_idx;
19546 
19547 #ifdef RE_TRACK_PATTERN_OFFSETS
19548     if (ri->u.offsets) {
19549         Newx(reti->u.offsets, 2*len+1, U32);
19550         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
19551     }
19552 #else
19553     SetProgLen(reti,len);
19554 #endif
19555 
19556     return (void*)reti;
19557 }
19558 
19559 #endif    /* USE_ITHREADS */
19560 
19561 #ifndef PERL_IN_XSUB_RE
19562 
19563 /*
19564  - regnext - dig the "next" pointer out of a node
19565  */
19566 regnode *
19567 Perl_regnext(pTHX_ regnode *p)
19568 {
19569     I32 offset;
19570 
19571     if (!p)
19572 	return(NULL);
19573 
19574     if (OP(p) > REGNODE_MAX) {		/* regnode.type is unsigned */
19575 	Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
19576                                                 (int)OP(p), (int)REGNODE_MAX);
19577     }
19578 
19579     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
19580     if (offset == 0)
19581 	return(NULL);
19582 
19583     return(p+offset);
19584 }
19585 #endif
19586 
19587 STATIC void
19588 S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
19589 {
19590     va_list args;
19591     STRLEN l1 = strlen(pat1);
19592     STRLEN l2 = strlen(pat2);
19593     char buf[512];
19594     SV *msv;
19595     const char *message;
19596 
19597     PERL_ARGS_ASSERT_RE_CROAK2;
19598 
19599     if (l1 > 510)
19600 	l1 = 510;
19601     if (l1 + l2 > 510)
19602 	l2 = 510 - l1;
19603     Copy(pat1, buf, l1 , char);
19604     Copy(pat2, buf + l1, l2 , char);
19605     buf[l1 + l2] = '\n';
19606     buf[l1 + l2 + 1] = '\0';
19607     va_start(args, pat2);
19608     msv = vmess(buf, &args);
19609     va_end(args);
19610     message = SvPV_const(msv,l1);
19611     if (l1 > 512)
19612 	l1 = 512;
19613     Copy(message, buf, l1 , char);
19614     /* l1-1 to avoid \n */
19615     Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
19616 }
19617 
19618 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
19619 
19620 #ifndef PERL_IN_XSUB_RE
19621 void
19622 Perl_save_re_context(pTHX)
19623 {
19624     I32 nparens = -1;
19625     I32 i;
19626 
19627     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
19628 
19629     if (PL_curpm) {
19630 	const REGEXP * const rx = PM_GETRE(PL_curpm);
19631 	if (rx)
19632             nparens = RX_NPARENS(rx);
19633     }
19634 
19635     /* RT #124109. This is a complete hack; in the SWASHNEW case we know
19636      * that PL_curpm will be null, but that utf8.pm and the modules it
19637      * loads will only use $1..$3.
19638      * The t/porting/re_context.t test file checks this assumption.
19639      */
19640     if (nparens == -1)
19641         nparens = 3;
19642 
19643     for (i = 1; i <= nparens; i++) {
19644         char digits[TYPE_CHARS(long)];
19645         const STRLEN len = my_snprintf(digits, sizeof(digits),
19646                                        "%lu", (long)i);
19647         GV *const *const gvp
19648             = (GV**)hv_fetch(PL_defstash, digits, len, 0);
19649 
19650         if (gvp) {
19651             GV * const gv = *gvp;
19652             if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
19653                 save_scalar(gv);
19654         }
19655     }
19656 }
19657 #endif
19658 
19659 #ifdef DEBUGGING
19660 
19661 STATIC void
19662 S_put_code_point(pTHX_ SV *sv, UV c)
19663 {
19664     PERL_ARGS_ASSERT_PUT_CODE_POINT;
19665 
19666     if (c > 255) {
19667         Perl_sv_catpvf(aTHX_ sv, "\\x{%04"UVXf"}", c);
19668     }
19669     else if (isPRINT(c)) {
19670 	const char string = (char) c;
19671 
19672         /* We use {phrase} as metanotation in the class, so also escape literal
19673          * braces */
19674 	if (isBACKSLASHED_PUNCT(c) || c == '{' || c == '}')
19675 	    sv_catpvs(sv, "\\");
19676 	sv_catpvn(sv, &string, 1);
19677     }
19678     else if (isMNEMONIC_CNTRL(c)) {
19679         Perl_sv_catpvf(aTHX_ sv, "%s", cntrl_to_mnemonic((U8) c));
19680     }
19681     else {
19682         Perl_sv_catpvf(aTHX_ sv, "\\x%02X", (U8) c);
19683     }
19684 }
19685 
19686 #define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C
19687 
19688 STATIC void
19689 S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
19690 {
19691     /* Appends to 'sv' a displayable version of the range of code points from
19692      * 'start' to 'end'.  Mnemonics (like '\r') are used for the few controls
19693      * that have them, when they occur at the beginning or end of the range.
19694      * It uses hex to output the remaining code points, unless 'allow_literals'
19695      * is true, in which case the printable ASCII ones are output as-is (though
19696      * some of these will be escaped by put_code_point()).
19697      *
19698      * NOTE:  This is designed only for printing ranges of code points that fit
19699      *        inside an ANYOF bitmap.  Higher code points are simply suppressed
19700      */
19701 
19702     const unsigned int min_range_count = 3;
19703 
19704     assert(start <= end);
19705 
19706     PERL_ARGS_ASSERT_PUT_RANGE;
19707 
19708     while (start <= end) {
19709         UV this_end;
19710         const char * format;
19711 
19712         if (end - start < min_range_count) {
19713 
19714             /* Output chars individually when they occur in short ranges */
19715             for (; start <= end; start++) {
19716                 put_code_point(sv, start);
19717             }
19718             break;
19719         }
19720 
19721         /* If permitted by the input options, and there is a possibility that
19722          * this range contains a printable literal, look to see if there is
19723          * one. */
19724         if (allow_literals && start <= MAX_PRINT_A) {
19725 
19726             /* If the character at the beginning of the range isn't an ASCII
19727              * printable, effectively split the range into two parts:
19728              *  1) the portion before the first such printable,
19729              *  2) the rest
19730              * and output them separately. */
19731             if (! isPRINT_A(start)) {
19732                 UV temp_end = start + 1;
19733 
19734                 /* There is no point looking beyond the final possible
19735                  * printable, in MAX_PRINT_A */
19736                 UV max = MIN(end, MAX_PRINT_A);
19737 
19738                 while (temp_end <= max && ! isPRINT_A(temp_end)) {
19739                     temp_end++;
19740                 }
19741 
19742                 /* Here, temp_end points to one beyond the first printable if
19743                  * found, or to one beyond 'max' if not.  If none found, make
19744                  * sure that we use the entire range */
19745                 if (temp_end > MAX_PRINT_A) {
19746                     temp_end = end + 1;
19747                 }
19748 
19749                 /* Output the first part of the split range: the part that
19750                  * doesn't have printables, with the parameter set to not look
19751                  * for literals (otherwise we would infinitely recurse) */
19752                 put_range(sv, start, temp_end - 1, FALSE);
19753 
19754                 /* The 2nd part of the range (if any) starts here. */
19755                 start = temp_end;
19756 
19757                 /* We do a continue, instead of dropping down, because even if
19758                  * the 2nd part is non-empty, it could be so short that we want
19759                  * to output it as individual characters, as tested for at the
19760                  * top of this loop.  */
19761                 continue;
19762             }
19763 
19764             /* Here, 'start' is a printable ASCII.  If it is an alphanumeric,
19765              * output a sub-range of just the digits or letters, then process
19766              * the remaining portion as usual. */
19767             if (isALPHANUMERIC_A(start)) {
19768                 UV mask = (isDIGIT_A(start))
19769                            ? _CC_DIGIT
19770                              : isUPPER_A(start)
19771                                ? _CC_UPPER
19772                                : _CC_LOWER;
19773                 UV temp_end = start + 1;
19774 
19775                 /* Find the end of the sub-range that includes just the
19776                  * characters in the same class as the first character in it */
19777                 while (temp_end <= end && _generic_isCC_A(temp_end, mask)) {
19778                     temp_end++;
19779                 }
19780                 temp_end--;
19781 
19782                 /* For short ranges, don't duplicate the code above to output
19783                  * them; just call recursively */
19784                 if (temp_end - start < min_range_count) {
19785                     put_range(sv, start, temp_end, FALSE);
19786                 }
19787                 else {  /* Output as a range */
19788                     put_code_point(sv, start);
19789                     sv_catpvs(sv, "-");
19790                     put_code_point(sv, temp_end);
19791                 }
19792                 start = temp_end + 1;
19793                 continue;
19794             }
19795 
19796             /* We output any other printables as individual characters */
19797             if (isPUNCT_A(start) || isSPACE_A(start)) {
19798                 while (start <= end && (isPUNCT_A(start)
19799                                         || isSPACE_A(start)))
19800                 {
19801                     put_code_point(sv, start);
19802                     start++;
19803                 }
19804                 continue;
19805             }
19806         } /* End of looking for literals */
19807 
19808         /* Here is not to output as a literal.  Some control characters have
19809          * mnemonic names.  Split off any of those at the beginning and end of
19810          * the range to print mnemonically.  It isn't possible for many of
19811          * these to be in a row, so this won't overwhelm with output */
19812         while (isMNEMONIC_CNTRL(start) && start <= end) {
19813             put_code_point(sv, start);
19814             start++;
19815         }
19816         if (start < end && isMNEMONIC_CNTRL(end)) {
19817 
19818             /* Here, the final character in the range has a mnemonic name.
19819              * Work backwards from the end to find the final non-mnemonic */
19820             UV temp_end = end - 1;
19821             while (isMNEMONIC_CNTRL(temp_end)) {
19822                 temp_end--;
19823             }
19824 
19825             /* And separately output the interior range that doesn't start or
19826              * end with mnemonics */
19827             put_range(sv, start, temp_end, FALSE);
19828 
19829             /* Then output the mnemonic trailing controls */
19830             start = temp_end + 1;
19831             while (start <= end) {
19832                 put_code_point(sv, start);
19833                 start++;
19834             }
19835             break;
19836         }
19837 
19838         /* As a final resort, output the range or subrange as hex. */
19839 
19840         this_end = (end < NUM_ANYOF_CODE_POINTS)
19841                     ? end
19842                     : NUM_ANYOF_CODE_POINTS - 1;
19843 #if NUM_ANYOF_CODE_POINTS > 256
19844         format = (this_end < 256)
19845                  ? "\\x%02"UVXf"-\\x%02"UVXf""
19846                  : "\\x{%04"UVXf"}-\\x{%04"UVXf"}";
19847 #else
19848         format = "\\x%02"UVXf"-\\x%02"UVXf"";
19849 #endif
19850         GCC_DIAG_IGNORE(-Wformat-nonliteral);
19851         Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
19852         GCC_DIAG_RESTORE;
19853         break;
19854     }
19855 }
19856 
19857 STATIC void
19858 S_put_charclass_bitmap_innards_invlist(pTHX_ SV *sv, SV* invlist)
19859 {
19860     /* Concatenate onto the PV in 'sv' a displayable form of the inversion list
19861      * 'invlist' */
19862 
19863     UV start, end;
19864     bool allow_literals = TRUE;
19865 
19866     PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_INVLIST;
19867 
19868     /* Generally, it is more readable if printable characters are output as
19869      * literals, but if a range (nearly) spans all of them, it's best to output
19870      * it as a single range.  This code will use a single range if all but 2
19871      * ASCII printables are in it */
19872     invlist_iterinit(invlist);
19873     while (invlist_iternext(invlist, &start, &end)) {
19874 
19875         /* If the range starts beyond the final printable, it doesn't have any
19876          * in it */
19877         if (start > MAX_PRINT_A) {
19878             break;
19879         }
19880 
19881         /* In both ASCII and EBCDIC, a SPACE is the lowest printable.  To span
19882          * all but two, the range must start and end no later than 2 from
19883          * either end */
19884         if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
19885             if (end > MAX_PRINT_A) {
19886                 end = MAX_PRINT_A;
19887             }
19888             if (start < ' ') {
19889                 start = ' ';
19890             }
19891             if (end - start >= MAX_PRINT_A - ' ' - 2) {
19892                 allow_literals = FALSE;
19893             }
19894             break;
19895         }
19896     }
19897     invlist_iterfinish(invlist);
19898 
19899     /* Here we have figured things out.  Output each range */
19900     invlist_iterinit(invlist);
19901     while (invlist_iternext(invlist, &start, &end)) {
19902         if (start >= NUM_ANYOF_CODE_POINTS) {
19903             break;
19904         }
19905         put_range(sv, start, end, allow_literals);
19906     }
19907     invlist_iterfinish(invlist);
19908 
19909     return;
19910 }
19911 
19912 STATIC SV*
19913 S_put_charclass_bitmap_innards_common(pTHX_
19914         SV* invlist,            /* The bitmap */
19915         SV* posixes,            /* Under /l, things like [:word:], \S */
19916         SV* only_utf8,          /* Under /d, matches iff the target is UTF-8 */
19917         SV* not_utf8,           /* /d, matches iff the target isn't UTF-8 */
19918         SV* only_utf8_locale,   /* Under /l, matches if the locale is UTF-8 */
19919         const bool invert       /* Is the result to be inverted? */
19920 )
19921 {
19922     /* Create and return an SV containing a displayable version of the bitmap
19923      * and associated information determined by the input parameters.  If the
19924      * output would have been only the inversion indicator '^', NULL is instead
19925      * returned. */
19926 
19927     SV * output;
19928 
19929     PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_COMMON;
19930 
19931     if (invert) {
19932         output = newSVpvs("^");
19933     }
19934     else {
19935         output = newSVpvs("");
19936     }
19937 
19938     /* First, the code points in the bitmap that are unconditionally there */
19939     put_charclass_bitmap_innards_invlist(output, invlist);
19940 
19941     /* Traditionally, these have been placed after the main code points */
19942     if (posixes) {
19943         sv_catsv(output, posixes);
19944     }
19945 
19946     if (only_utf8 && _invlist_len(only_utf8)) {
19947         Perl_sv_catpvf(aTHX_ output, "%s{utf8}%s", PL_colors[1], PL_colors[0]);
19948         put_charclass_bitmap_innards_invlist(output, only_utf8);
19949     }
19950 
19951     if (not_utf8 && _invlist_len(not_utf8)) {
19952         Perl_sv_catpvf(aTHX_ output, "%s{not utf8}%s", PL_colors[1], PL_colors[0]);
19953         put_charclass_bitmap_innards_invlist(output, not_utf8);
19954     }
19955 
19956     if (only_utf8_locale && _invlist_len(only_utf8_locale)) {
19957         Perl_sv_catpvf(aTHX_ output, "%s{utf8 locale}%s", PL_colors[1], PL_colors[0]);
19958         put_charclass_bitmap_innards_invlist(output, only_utf8_locale);
19959 
19960         /* This is the only list in this routine that can legally contain code
19961          * points outside the bitmap range.  The call just above to
19962          * 'put_charclass_bitmap_innards_invlist' will simply suppress them, so
19963          * output them here.  There's about a half-dozen possible, and none in
19964          * contiguous ranges longer than 2 */
19965         if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
19966             UV start, end;
19967             SV* above_bitmap = NULL;
19968 
19969             _invlist_subtract(only_utf8_locale, PL_InBitmap, &above_bitmap);
19970 
19971             invlist_iterinit(above_bitmap);
19972             while (invlist_iternext(above_bitmap, &start, &end)) {
19973                 UV i;
19974 
19975                 for (i = start; i <= end; i++) {
19976                     put_code_point(output, i);
19977                 }
19978             }
19979             invlist_iterfinish(above_bitmap);
19980             SvREFCNT_dec_NN(above_bitmap);
19981         }
19982     }
19983 
19984     if (invert && SvCUR(output) == 1) {
19985         return NULL;
19986     }
19987 
19988     return output;
19989 }
19990 
19991 STATIC bool
19992 S_put_charclass_bitmap_innards(pTHX_ SV *sv,
19993                                      char *bitmap,
19994                                      SV *nonbitmap_invlist,
19995                                      SV *only_utf8_locale_invlist,
19996                                      const regnode * const node,
19997                                      const bool force_as_is_display)
19998 {
19999     /* Appends to 'sv' a displayable version of the innards of the bracketed
20000      * character class defined by the other arguments:
20001      *  'bitmap' points to the bitmap.
20002      *  'nonbitmap_invlist' is an inversion list of the code points that are in
20003      *      the bitmap range, but for some reason aren't in the bitmap; NULL if
20004      *      none.  The reasons for this could be that they require some
20005      *      condition such as the target string being or not being in UTF-8
20006      *      (under /d), or because they came from a user-defined property that
20007      *      was not resolved at the time of the regex compilation (under /u)
20008      *  'only_utf8_locale_invlist' is an inversion list of the code points that
20009      *      are valid only if the runtime locale is a UTF-8 one; NULL if none
20010      *  'node' is the regex pattern node.  It is needed only when the above two
20011      *      parameters are not null, and is passed so that this routine can
20012      *      tease apart the various reasons for them.
20013      *  'force_as_is_display' is TRUE if this routine should definitely NOT try
20014      *      to invert things to see if that leads to a cleaner display.  If
20015      *      FALSE, this routine is free to use its judgment about doing this.
20016      *
20017      * It returns TRUE if there was actually something output.  (It may be that
20018      * the bitmap, etc is empty.)
20019      *
20020      * When called for outputting the bitmap of a non-ANYOF node, just pass the
20021      * bitmap, with the succeeding parameters set to NULL, and the final one to
20022      * FALSE.
20023      */
20024 
20025     /* In general, it tries to display the 'cleanest' representation of the
20026      * innards, choosing whether to display them inverted or not, regardless of
20027      * whether the class itself is to be inverted.  However,  there are some
20028      * cases where it can't try inverting, as what actually matches isn't known
20029      * until runtime, and hence the inversion isn't either. */
20030     bool inverting_allowed = ! force_as_is_display;
20031 
20032     int i;
20033     STRLEN orig_sv_cur = SvCUR(sv);
20034 
20035     SV* invlist;            /* Inversion list we accumulate of code points that
20036                                are unconditionally matched */
20037     SV* only_utf8 = NULL;   /* Under /d, list of matches iff the target is
20038                                UTF-8 */
20039     SV* not_utf8 =  NULL;   /* /d, list of matches iff the target isn't UTF-8
20040                              */
20041     SV* posixes = NULL;     /* Under /l, string of things like [:word:], \D */
20042     SV* only_utf8_locale = NULL;    /* Under /l, list of matches if the locale
20043                                        is UTF-8 */
20044 
20045     SV* as_is_display;      /* The output string when we take the inputs
20046                               literally */
20047     SV* inverted_display;   /* The output string when we invert the inputs */
20048 
20049     U8 flags = (node) ? ANYOF_FLAGS(node) : 0;
20050 
20051     bool invert = cBOOL(flags & ANYOF_INVERT);  /* Is the input to be inverted
20052                                                    to match? */
20053     /* We are biased in favor of displaying things without them being inverted,
20054      * as that is generally easier to understand */
20055     const int bias = 5;
20056 
20057     PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
20058 
20059     /* Start off with whatever code points are passed in.  (We clone, so we
20060      * don't change the caller's list) */
20061     if (nonbitmap_invlist) {
20062         assert(invlist_highest(nonbitmap_invlist) < NUM_ANYOF_CODE_POINTS);
20063         invlist = invlist_clone(nonbitmap_invlist);
20064     }
20065     else {  /* Worst case size is every other code point is matched */
20066         invlist = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
20067     }
20068 
20069     if (flags) {
20070         if (OP(node) == ANYOFD) {
20071 
20072             /* This flag indicates that the code points below 0x100 in the
20073              * nonbitmap list are precisely the ones that match only when the
20074              * target is UTF-8 (they should all be non-ASCII). */
20075             if (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
20076             {
20077                 _invlist_intersection(invlist, PL_UpperLatin1, &only_utf8);
20078                 _invlist_subtract(invlist, only_utf8, &invlist);
20079             }
20080 
20081             /* And this flag for matching all non-ASCII 0xFF and below */
20082             if (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
20083             {
20084                 not_utf8 = invlist_clone(PL_UpperLatin1);
20085             }
20086         }
20087         else if (OP(node) == ANYOFL) {
20088 
20089             /* If either of these flags are set, what matches isn't
20090              * determinable except during execution, so don't know enough here
20091              * to invert */
20092             if (flags & (ANYOFL_FOLD|ANYOF_MATCHES_POSIXL)) {
20093                 inverting_allowed = FALSE;
20094             }
20095 
20096             /* What the posix classes match also varies at runtime, so these
20097              * will be output symbolically. */
20098             if (ANYOF_POSIXL_TEST_ANY_SET(node)) {
20099                 int i;
20100 
20101                 posixes = newSVpvs("");
20102                 for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
20103                     if (ANYOF_POSIXL_TEST(node,i)) {
20104                         sv_catpv(posixes, anyofs[i]);
20105                     }
20106                 }
20107             }
20108         }
20109     }
20110 
20111     /* Accumulate the bit map into the unconditional match list */
20112     for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
20113         if (BITMAP_TEST(bitmap, i)) {
20114             int start = i++;
20115             for (; i < NUM_ANYOF_CODE_POINTS && BITMAP_TEST(bitmap, i); i++) {
20116                 /* empty */
20117             }
20118             invlist = _add_range_to_invlist(invlist, start, i-1);
20119         }
20120     }
20121 
20122     /* Make sure that the conditional match lists don't have anything in them
20123      * that match unconditionally; otherwise the output is quite confusing.
20124      * This could happen if the code that populates these misses some
20125      * duplication. */
20126     if (only_utf8) {
20127         _invlist_subtract(only_utf8, invlist, &only_utf8);
20128     }
20129     if (not_utf8) {
20130         _invlist_subtract(not_utf8, invlist, &not_utf8);
20131     }
20132 
20133     if (only_utf8_locale_invlist) {
20134 
20135         /* Since this list is passed in, we have to make a copy before
20136          * modifying it */
20137         only_utf8_locale = invlist_clone(only_utf8_locale_invlist);
20138 
20139         _invlist_subtract(only_utf8_locale, invlist, &only_utf8_locale);
20140 
20141         /* And, it can get really weird for us to try outputting an inverted
20142          * form of this list when it has things above the bitmap, so don't even
20143          * try */
20144         if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
20145             inverting_allowed = FALSE;
20146         }
20147     }
20148 
20149     /* Calculate what the output would be if we take the input as-is */
20150     as_is_display = put_charclass_bitmap_innards_common(invlist,
20151                                                     posixes,
20152                                                     only_utf8,
20153                                                     not_utf8,
20154                                                     only_utf8_locale,
20155                                                     invert);
20156 
20157     /* If have to take the output as-is, just do that */
20158     if (! inverting_allowed) {
20159         if (as_is_display) {
20160             sv_catsv(sv, as_is_display);
20161             SvREFCNT_dec_NN(as_is_display);
20162         }
20163     }
20164     else { /* But otherwise, create the output again on the inverted input, and
20165               use whichever version is shorter */
20166 
20167         int inverted_bias, as_is_bias;
20168 
20169         /* We will apply our bias to whichever of the the results doesn't have
20170          * the '^' */
20171         if (invert) {
20172             invert = FALSE;
20173             as_is_bias = bias;
20174             inverted_bias = 0;
20175         }
20176         else {
20177             invert = TRUE;
20178             as_is_bias = 0;
20179             inverted_bias = bias;
20180         }
20181 
20182         /* Now invert each of the lists that contribute to the output,
20183          * excluding from the result things outside the possible range */
20184 
20185         /* For the unconditional inversion list, we have to add in all the
20186          * conditional code points, so that when inverted, they will be gone
20187          * from it */
20188         _invlist_union(only_utf8, invlist, &invlist);
20189         _invlist_union(not_utf8, invlist, &invlist);
20190         _invlist_union(only_utf8_locale, invlist, &invlist);
20191         _invlist_invert(invlist);
20192         _invlist_intersection(invlist, PL_InBitmap, &invlist);
20193 
20194         if (only_utf8) {
20195             _invlist_invert(only_utf8);
20196             _invlist_intersection(only_utf8, PL_UpperLatin1, &only_utf8);
20197         }
20198 
20199         if (not_utf8) {
20200             _invlist_invert(not_utf8);
20201             _invlist_intersection(not_utf8, PL_UpperLatin1, &not_utf8);
20202         }
20203 
20204         if (only_utf8_locale) {
20205             _invlist_invert(only_utf8_locale);
20206             _invlist_intersection(only_utf8_locale,
20207                                   PL_InBitmap,
20208                                   &only_utf8_locale);
20209         }
20210 
20211         inverted_display = put_charclass_bitmap_innards_common(
20212                                             invlist,
20213                                             posixes,
20214                                             only_utf8,
20215                                             not_utf8,
20216                                             only_utf8_locale, invert);
20217 
20218         /* Use the shortest representation, taking into account our bias
20219          * against showing it inverted */
20220         if (   inverted_display
20221             && (   ! as_is_display
20222                 || (  SvCUR(inverted_display) + inverted_bias
20223                     < SvCUR(as_is_display)    + as_is_bias)))
20224         {
20225 	    sv_catsv(sv, inverted_display);
20226         }
20227         else if (as_is_display) {
20228 	    sv_catsv(sv, as_is_display);
20229         }
20230 
20231         SvREFCNT_dec(as_is_display);
20232         SvREFCNT_dec(inverted_display);
20233     }
20234 
20235     SvREFCNT_dec_NN(invlist);
20236     SvREFCNT_dec(only_utf8);
20237     SvREFCNT_dec(not_utf8);
20238     SvREFCNT_dec(posixes);
20239     SvREFCNT_dec(only_utf8_locale);
20240 
20241     return SvCUR(sv) > orig_sv_cur;
20242 }
20243 
20244 #define CLEAR_OPTSTART                                                       \
20245     if (optstart) STMT_START {                                               \
20246         DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_                                           \
20247                               " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
20248         optstart=NULL;                                                       \
20249     } STMT_END
20250 
20251 #define DUMPUNTIL(b,e)                                                       \
20252                     CLEAR_OPTSTART;                                          \
20253                     node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
20254 
20255 STATIC const regnode *
20256 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
20257 	    const regnode *last, const regnode *plast,
20258 	    SV* sv, I32 indent, U32 depth)
20259 {
20260     U8 op = PSEUDO;	/* Arbitrary non-END op. */
20261     const regnode *next;
20262     const regnode *optstart= NULL;
20263 
20264     RXi_GET_DECL(r,ri);
20265     GET_RE_DEBUG_FLAGS_DECL;
20266 
20267     PERL_ARGS_ASSERT_DUMPUNTIL;
20268 
20269 #ifdef DEBUG_DUMPUNTIL
20270     Perl_re_printf( aTHX_  "--- %d : %d - %d - %d\n",indent,node-start,
20271         last ? last-start : 0,plast ? plast-start : 0);
20272 #endif
20273 
20274     if (plast && plast < last)
20275         last= plast;
20276 
20277     while (PL_regkind[op] != END && (!last || node < last)) {
20278         assert(node);
20279 	/* While that wasn't END last time... */
20280 	NODE_ALIGN(node);
20281 	op = OP(node);
20282 	if (op == CLOSE || op == WHILEM)
20283 	    indent--;
20284 	next = regnext((regnode *)node);
20285 
20286 	/* Where, what. */
20287 	if (OP(node) == OPTIMIZED) {
20288 	    if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
20289 	        optstart = node;
20290 	    else
20291 		goto after_print;
20292 	} else
20293 	    CLEAR_OPTSTART;
20294 
20295         regprop(r, sv, node, NULL, NULL);
20296         Perl_re_printf( aTHX_  "%4"IVdf":%*s%s", (IV)(node - start),
20297 		      (int)(2*indent + 1), "", SvPVX_const(sv));
20298 
20299         if (OP(node) != OPTIMIZED) {
20300             if (next == NULL)		/* Next ptr. */
20301                 Perl_re_printf( aTHX_  " (0)");
20302             else if (PL_regkind[(U8)op] == BRANCH
20303                      && PL_regkind[OP(next)] != BRANCH )
20304                 Perl_re_printf( aTHX_  " (FAIL)");
20305             else
20306                 Perl_re_printf( aTHX_  " (%"IVdf")", (IV)(next - start));
20307             Perl_re_printf( aTHX_ "\n");
20308         }
20309 
20310       after_print:
20311 	if (PL_regkind[(U8)op] == BRANCHJ) {
20312 	    assert(next);
20313 	    {
20314                 const regnode *nnode = (OP(next) == LONGJMP
20315                                        ? regnext((regnode *)next)
20316                                        : next);
20317                 if (last && nnode > last)
20318                     nnode = last;
20319                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
20320 	    }
20321 	}
20322 	else if (PL_regkind[(U8)op] == BRANCH) {
20323 	    assert(next);
20324 	    DUMPUNTIL(NEXTOPER(node), next);
20325 	}
20326 	else if ( PL_regkind[(U8)op]  == TRIE ) {
20327 	    const regnode *this_trie = node;
20328 	    const char op = OP(node);
20329             const U32 n = ARG(node);
20330 	    const reg_ac_data * const ac = op>=AHOCORASICK ?
20331                (reg_ac_data *)ri->data->data[n] :
20332                NULL;
20333 	    const reg_trie_data * const trie =
20334 	        (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
20335 #ifdef DEBUGGING
20336 	    AV *const trie_words
20337                            = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
20338 #endif
20339 	    const regnode *nextbranch= NULL;
20340 	    I32 word_idx;
20341             sv_setpvs(sv, "");
20342 	    for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
20343 		SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
20344 
20345                 Perl_re_indentf( aTHX_  "%s ",
20346                     indent+3,
20347                     elem_ptr
20348                     ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
20349                                 SvCUR(*elem_ptr), 60,
20350                                 PL_colors[0], PL_colors[1],
20351                                 (SvUTF8(*elem_ptr)
20352                                  ? PERL_PV_ESCAPE_UNI
20353                                  : 0)
20354                                 | PERL_PV_PRETTY_ELLIPSES
20355                                 | PERL_PV_PRETTY_LTGT
20356                             )
20357                     : "???"
20358                 );
20359                 if (trie->jump) {
20360                     U16 dist= trie->jump[word_idx+1];
20361                     Perl_re_printf( aTHX_  "(%"UVuf")\n",
20362                                (UV)((dist ? this_trie + dist : next) - start));
20363                     if (dist) {
20364                         if (!nextbranch)
20365                             nextbranch= this_trie + trie->jump[0];
20366 			DUMPUNTIL(this_trie + dist, nextbranch);
20367                     }
20368                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
20369                         nextbranch= regnext((regnode *)nextbranch);
20370                 } else {
20371                     Perl_re_printf( aTHX_  "\n");
20372 		}
20373 	    }
20374 	    if (last && next > last)
20375 	        node= last;
20376 	    else
20377 	        node= next;
20378 	}
20379 	else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
20380 	    DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
20381                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
20382 	}
20383 	else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
20384 	    assert(next);
20385 	    DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
20386 	}
20387 	else if ( op == PLUS || op == STAR) {
20388 	    DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
20389 	}
20390 	else if (PL_regkind[(U8)op] == ANYOF) {
20391 	    /* arglen 1 + class block */
20392 	    node += 1 + ((ANYOF_FLAGS(node) & ANYOF_MATCHES_POSIXL)
20393                           ? ANYOF_POSIXL_SKIP
20394                           : ANYOF_SKIP);
20395 	    node = NEXTOPER(node);
20396 	}
20397 	else if (PL_regkind[(U8)op] == EXACT) {
20398             /* Literal string, where present. */
20399 	    node += NODE_SZ_STR(node) - 1;
20400 	    node = NEXTOPER(node);
20401 	}
20402 	else {
20403 	    node = NEXTOPER(node);
20404 	    node += regarglen[(U8)op];
20405 	}
20406 	if (op == CURLYX || op == OPEN)
20407 	    indent++;
20408     }
20409     CLEAR_OPTSTART;
20410 #ifdef DEBUG_DUMPUNTIL
20411     Perl_re_printf( aTHX_  "--- %d\n", (int)indent);
20412 #endif
20413     return node;
20414 }
20415 
20416 #endif	/* DEBUGGING */
20417 
20418 /*
20419  * ex: set ts=8 sts=4 sw=4 et:
20420  */
20421