xref: /openbsd-src/gnu/usr.bin/perl/regcomp.c (revision d13be5d47e4149db2549a9828e244d59dbc43f15)
1 /*    regcomp.c
2  */
3 
4 /*
5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
6  *
7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
8  */
9 
10 /* This file contains functions for compiling a regular expression.  See
11  * also regexec.c which funnily enough, contains functions for executing
12  * a regular expression.
13  *
14  * This file is also copied at build time to ext/re/re_comp.c, where
15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16  * This causes the main functions to be compiled under new names and with
17  * debugging support added, which makes "use re 'debug'" work.
18  */
19 
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21  * confused with the original package (see point 3 below).  Thanks, Henry!
22  */
23 
24 /* Additional note: this code is very heavily munged from Henry's version
25  * in places.  In some spots I've traded clarity for efficiency, so don't
26  * blame Henry for some of the lack of readability.
27  */
28 
29 /* The names of the functions have been changed from regcomp and
30  * regexec to  pregcomp and pregexec in order to avoid conflicts
31  * with the POSIX routines of the same names.
32 */
33 
34 #ifdef PERL_EXT_RE_BUILD
35 #include "re_top.h"
36 #endif
37 
38 /*
39  * pregcomp and pregexec -- regsub and regerror are not used in perl
40  *
41  *	Copyright (c) 1986 by University of Toronto.
42  *	Written by Henry Spencer.  Not derived from licensed software.
43  *
44  *	Permission is granted to anyone to use this software for any
45  *	purpose on any computer system, and to redistribute it freely,
46  *	subject to the following restrictions:
47  *
48  *	1. The author is not responsible for the consequences of use of
49  *		this software, no matter how awful, even if they arise
50  *		from defects in it.
51  *
52  *	2. The origin of this software must not be misrepresented, either
53  *		by explicit claim or by omission.
54  *
55  *	3. Altered versions must be plainly marked as such, and must not
56  *		be misrepresented as being the original software.
57  *
58  *
59  ****    Alterations to Henry's code are...
60  ****
61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63  ****    by Larry Wall and others
64  ****
65  ****    You may distribute under the terms of either the GNU General Public
66  ****    License or the Artistic License, as specified in the README file.
67 
68  *
69  * Beware that some of this code is subtly aware of the way operator
70  * precedence is structured in regular expressions.  Serious changes in
71  * regular-expression syntax might require a total rethink.
72  */
73 #include "EXTERN.h"
74 #define PERL_IN_REGCOMP_C
75 #include "perl.h"
76 
77 #ifndef PERL_IN_XSUB_RE
78 #  include "INTERN.h"
79 #endif
80 
81 #define REG_COMP_C
82 #ifdef PERL_IN_XSUB_RE
83 #  include "re_comp.h"
84 #else
85 #  include "regcomp.h"
86 #endif
87 
88 #ifdef op
89 #undef op
90 #endif /* op */
91 
92 #ifdef MSDOS
93 #  if defined(BUGGY_MSC6)
94  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
95 #    pragma optimize("a",off)
96  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
97 #    pragma optimize("w",on )
98 #  endif /* BUGGY_MSC6 */
99 #endif /* MSDOS */
100 
101 #ifndef STATIC
102 #define	STATIC	static
103 #endif
104 
105 typedef struct RExC_state_t {
106     U32		flags;			/* are we folding, multilining? */
107     char	*precomp;		/* uncompiled string. */
108     REGEXP	*rx_sv;			/* The SV that is the regexp. */
109     regexp	*rx;                    /* perl core regexp structure */
110     regexp_internal	*rxi;           /* internal data for regexp object pprivate field */
111     char	*start;			/* Start of input for compile */
112     char	*end;			/* End of input for compile */
113     char	*parse;			/* Input-scan pointer. */
114     I32		whilem_seen;		/* number of WHILEM in this expr */
115     regnode	*emit_start;		/* Start of emitted-code area */
116     regnode	*emit_bound;		/* First regnode outside of the allocated space */
117     regnode	*emit;			/* Code-emit pointer; &regdummy = don't = compiling */
118     I32		naughty;		/* How bad is this pattern? */
119     I32		sawback;		/* Did we see \1, ...? */
120     U32		seen;
121     I32		size;			/* Code size. */
122     I32		npar;			/* Capture buffer count, (OPEN). */
123     I32		cpar;			/* Capture buffer count, (CLOSE). */
124     I32		nestroot;		/* root parens we are in - used by accept */
125     I32		extralen;
126     I32		seen_zerolen;
127     I32		seen_evals;
128     regnode	**open_parens;		/* pointers to open parens */
129     regnode	**close_parens;		/* pointers to close parens */
130     regnode	*opend;			/* END node in program */
131     I32		utf8;		/* whether the pattern is utf8 or not */
132     I32		orig_utf8;	/* whether the pattern was originally in utf8 */
133 				/* XXX use this for future optimisation of case
134 				 * where pattern must be upgraded to utf8. */
135     HV		*paren_names;		/* Paren names */
136 
137     regnode	**recurse;		/* Recurse regops */
138     I32		recurse_count;		/* Number of recurse regops */
139 #if ADD_TO_REGEXEC
140     char 	*starttry;		/* -Dr: where regtry was called. */
141 #define RExC_starttry	(pRExC_state->starttry)
142 #endif
143 #ifdef DEBUGGING
144     const char  *lastparse;
145     I32         lastnum;
146     AV          *paren_name_list;       /* idx -> name */
147 #define RExC_lastparse	(pRExC_state->lastparse)
148 #define RExC_lastnum	(pRExC_state->lastnum)
149 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
150 #endif
151 } RExC_state_t;
152 
153 #define RExC_flags	(pRExC_state->flags)
154 #define RExC_precomp	(pRExC_state->precomp)
155 #define RExC_rx_sv	(pRExC_state->rx_sv)
156 #define RExC_rx		(pRExC_state->rx)
157 #define RExC_rxi	(pRExC_state->rxi)
158 #define RExC_start	(pRExC_state->start)
159 #define RExC_end	(pRExC_state->end)
160 #define RExC_parse	(pRExC_state->parse)
161 #define RExC_whilem_seen	(pRExC_state->whilem_seen)
162 #ifdef RE_TRACK_PATTERN_OFFSETS
163 #define RExC_offsets	(pRExC_state->rxi->u.offsets) /* I am not like the others */
164 #endif
165 #define RExC_emit	(pRExC_state->emit)
166 #define RExC_emit_start	(pRExC_state->emit_start)
167 #define RExC_emit_bound	(pRExC_state->emit_bound)
168 #define RExC_naughty	(pRExC_state->naughty)
169 #define RExC_sawback	(pRExC_state->sawback)
170 #define RExC_seen	(pRExC_state->seen)
171 #define RExC_size	(pRExC_state->size)
172 #define RExC_npar	(pRExC_state->npar)
173 #define RExC_nestroot   (pRExC_state->nestroot)
174 #define RExC_extralen	(pRExC_state->extralen)
175 #define RExC_seen_zerolen	(pRExC_state->seen_zerolen)
176 #define RExC_seen_evals	(pRExC_state->seen_evals)
177 #define RExC_utf8	(pRExC_state->utf8)
178 #define RExC_orig_utf8	(pRExC_state->orig_utf8)
179 #define RExC_open_parens	(pRExC_state->open_parens)
180 #define RExC_close_parens	(pRExC_state->close_parens)
181 #define RExC_opend	(pRExC_state->opend)
182 #define RExC_paren_names	(pRExC_state->paren_names)
183 #define RExC_recurse	(pRExC_state->recurse)
184 #define RExC_recurse_count	(pRExC_state->recurse_count)
185 
186 
187 #define	ISMULT1(c)	((c) == '*' || (c) == '+' || (c) == '?')
188 #define	ISMULT2(s)	((*s) == '*' || (*s) == '+' || (*s) == '?' || \
189 	((*s) == '{' && regcurly(s)))
190 
191 #ifdef SPSTART
192 #undef SPSTART		/* dratted cpp namespace... */
193 #endif
194 /*
195  * Flags to be passed up and down.
196  */
197 #define	WORST		0	/* Worst case. */
198 #define	HASWIDTH	0x01	/* Known to match non-null strings. */
199 #define	SIMPLE		0x02	/* Simple enough to be STAR/PLUS operand. */
200 #define	SPSTART		0x04	/* Starts with * or +. */
201 #define TRYAGAIN	0x08	/* Weeded out a declaration. */
202 #define POSTPONED	0x10    /* (?1),(?&name), (??{...}) or similar */
203 
204 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
205 
206 /* whether trie related optimizations are enabled */
207 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
208 #define TRIE_STUDY_OPT
209 #define FULL_TRIE_STUDY
210 #define TRIE_STCLASS
211 #endif
212 
213 
214 
215 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
216 #define PBITVAL(paren) (1 << ((paren) & 7))
217 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
218 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
219 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
220 
221 
222 /* About scan_data_t.
223 
224   During optimisation we recurse through the regexp program performing
225   various inplace (keyhole style) optimisations. In addition study_chunk
226   and scan_commit populate this data structure with information about
227   what strings MUST appear in the pattern. We look for the longest
228   string that must appear for at a fixed location, and we look for the
229   longest string that may appear at a floating location. So for instance
230   in the pattern:
231 
232     /FOO[xX]A.*B[xX]BAR/
233 
234   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
235   strings (because they follow a .* construct). study_chunk will identify
236   both FOO and BAR as being the longest fixed and floating strings respectively.
237 
238   The strings can be composites, for instance
239 
240      /(f)(o)(o)/
241 
242   will result in a composite fixed substring 'foo'.
243 
244   For each string some basic information is maintained:
245 
246   - offset or min_offset
247     This is the position the string must appear at, or not before.
248     It also implicitly (when combined with minlenp) tells us how many
249     character must match before the string we are searching.
250     Likewise when combined with minlenp and the length of the string
251     tells us how many characters must appear after the string we have
252     found.
253 
254   - max_offset
255     Only used for floating strings. This is the rightmost point that
256     the string can appear at. Ifset to I32 max it indicates that the
257     string can occur infinitely far to the right.
258 
259   - minlenp
260     A pointer to the minimum length of the pattern that the string
261     was found inside. This is important as in the case of positive
262     lookahead or positive lookbehind we can have multiple patterns
263     involved. Consider
264 
265     /(?=FOO).*F/
266 
267     The minimum length of the pattern overall is 3, the minimum length
268     of the lookahead part is 3, but the minimum length of the part that
269     will actually match is 1. So 'FOO's minimum length is 3, but the
270     minimum length for the F is 1. This is important as the minimum length
271     is used to determine offsets in front of and behind the string being
272     looked for.  Since strings can be composites this is the length of the
273     pattern at the time it was commited with a scan_commit. Note that
274     the length is calculated by study_chunk, so that the minimum lengths
275     are not known until the full pattern has been compiled, thus the
276     pointer to the value.
277 
278   - lookbehind
279 
280     In the case of lookbehind the string being searched for can be
281     offset past the start point of the final matching string.
282     If this value was just blithely removed from the min_offset it would
283     invalidate some of the calculations for how many chars must match
284     before or after (as they are derived from min_offset and minlen and
285     the length of the string being searched for).
286     When the final pattern is compiled and the data is moved from the
287     scan_data_t structure into the regexp structure the information
288     about lookbehind is factored in, with the information that would
289     have been lost precalculated in the end_shift field for the
290     associated string.
291 
292   The fields pos_min and pos_delta are used to store the minimum offset
293   and the delta to the maximum offset at the current point in the pattern.
294 
295 */
296 
297 typedef struct scan_data_t {
298     /*I32 len_min;      unused */
299     /*I32 len_delta;    unused */
300     I32 pos_min;
301     I32 pos_delta;
302     SV *last_found;
303     I32 last_end;	    /* min value, <0 unless valid. */
304     I32 last_start_min;
305     I32 last_start_max;
306     SV **longest;	    /* Either &l_fixed, or &l_float. */
307     SV *longest_fixed;      /* longest fixed string found in pattern */
308     I32 offset_fixed;       /* offset where it starts */
309     I32 *minlen_fixed;      /* pointer to the minlen relevent to the string */
310     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
311     SV *longest_float;      /* longest floating string found in pattern */
312     I32 offset_float_min;   /* earliest point in string it can appear */
313     I32 offset_float_max;   /* latest point in string it can appear */
314     I32 *minlen_float;      /* pointer to the minlen relevent to the string */
315     I32 lookbehind_float;   /* is the position of the string modified by LB */
316     I32 flags;
317     I32 whilem_c;
318     I32 *last_closep;
319     struct regnode_charclass_class *start_class;
320 } scan_data_t;
321 
322 /*
323  * Forward declarations for pregcomp()'s friends.
324  */
325 
326 static const scan_data_t zero_scan_data =
327   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
328 
329 #define SF_BEFORE_EOL		(SF_BEFORE_SEOL|SF_BEFORE_MEOL)
330 #define SF_BEFORE_SEOL		0x0001
331 #define SF_BEFORE_MEOL		0x0002
332 #define SF_FIX_BEFORE_EOL	(SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
333 #define SF_FL_BEFORE_EOL	(SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
334 
335 #ifdef NO_UNARY_PLUS
336 #  define SF_FIX_SHIFT_EOL	(0+2)
337 #  define SF_FL_SHIFT_EOL		(0+4)
338 #else
339 #  define SF_FIX_SHIFT_EOL	(+2)
340 #  define SF_FL_SHIFT_EOL		(+4)
341 #endif
342 
343 #define SF_FIX_BEFORE_SEOL	(SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
344 #define SF_FIX_BEFORE_MEOL	(SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
345 
346 #define SF_FL_BEFORE_SEOL	(SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
347 #define SF_FL_BEFORE_MEOL	(SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
348 #define SF_IS_INF		0x0040
349 #define SF_HAS_PAR		0x0080
350 #define SF_IN_PAR		0x0100
351 #define SF_HAS_EVAL		0x0200
352 #define SCF_DO_SUBSTR		0x0400
353 #define SCF_DO_STCLASS_AND	0x0800
354 #define SCF_DO_STCLASS_OR	0x1000
355 #define SCF_DO_STCLASS		(SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
356 #define SCF_WHILEM_VISITED_POS	0x2000
357 
358 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
359 #define SCF_SEEN_ACCEPT         0x8000
360 
361 #define UTF (RExC_utf8 != 0)
362 #define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
363 #define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
364 
365 #define OOB_UNICODE		12345678
366 #define OOB_NAMEDCLASS		-1
367 
368 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
369 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
370 
371 
372 /* length of regex to show in messages that don't mark a position within */
373 #define RegexLengthToShowInErrorMessages 127
374 
375 /*
376  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
377  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
378  * op/pragma/warn/regcomp.
379  */
380 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
381 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
382 
383 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
384 
385 /*
386  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
387  * arg. Show regex, up to a maximum length. If it's too long, chop and add
388  * "...".
389  */
390 #define _FAIL(code) STMT_START {					\
391     const char *ellipses = "";						\
392     IV len = RExC_end - RExC_precomp;					\
393 									\
394     if (!SIZE_ONLY)							\
395 	SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);			\
396     if (len > RegexLengthToShowInErrorMessages) {			\
397 	/* chop 10 shorter than the max, to ensure meaning of "..." */	\
398 	len = RegexLengthToShowInErrorMessages - 10;			\
399 	ellipses = "...";						\
400     }									\
401     code;                                                               \
402 } STMT_END
403 
404 #define	FAIL(msg) _FAIL(			    \
405     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",	    \
406 	    msg, (int)len, RExC_precomp, ellipses))
407 
408 #define	FAIL2(msg,arg) _FAIL(			    \
409     Perl_croak(aTHX_ msg " in regex m/%.*s%s/",	    \
410 	    arg, (int)len, RExC_precomp, ellipses))
411 
412 /*
413  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
414  */
415 #define	Simple_vFAIL(m) STMT_START {					\
416     const IV offset = RExC_parse - RExC_precomp;			\
417     Perl_croak(aTHX_ "%s" REPORT_LOCATION,				\
418 	    m, (int)offset, RExC_precomp, RExC_precomp + offset);	\
419 } STMT_END
420 
421 /*
422  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
423  */
424 #define	vFAIL(m) STMT_START {				\
425     if (!SIZE_ONLY)					\
426 	SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);	\
427     Simple_vFAIL(m);					\
428 } STMT_END
429 
430 /*
431  * Like Simple_vFAIL(), but accepts two arguments.
432  */
433 #define	Simple_vFAIL2(m,a1) STMT_START {			\
434     const IV offset = RExC_parse - RExC_precomp;			\
435     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,			\
436 	    (int)offset, RExC_precomp, RExC_precomp + offset);	\
437 } STMT_END
438 
439 /*
440  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
441  */
442 #define	vFAIL2(m,a1) STMT_START {			\
443     if (!SIZE_ONLY)					\
444 	SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);	\
445     Simple_vFAIL2(m, a1);				\
446 } STMT_END
447 
448 
449 /*
450  * Like Simple_vFAIL(), but accepts three arguments.
451  */
452 #define	Simple_vFAIL3(m, a1, a2) STMT_START {			\
453     const IV offset = RExC_parse - RExC_precomp;		\
454     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,		\
455 	    (int)offset, RExC_precomp, RExC_precomp + offset);	\
456 } STMT_END
457 
458 /*
459  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
460  */
461 #define	vFAIL3(m,a1,a2) STMT_START {			\
462     if (!SIZE_ONLY)					\
463 	SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);	\
464     Simple_vFAIL3(m, a1, a2);				\
465 } STMT_END
466 
467 /*
468  * Like Simple_vFAIL(), but accepts four arguments.
469  */
470 #define	Simple_vFAIL4(m, a1, a2, a3) STMT_START {		\
471     const IV offset = RExC_parse - RExC_precomp;		\
472     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,		\
473 	    (int)offset, RExC_precomp, RExC_precomp + offset);	\
474 } STMT_END
475 
476 #define	ckWARNreg(loc,m) STMT_START {					\
477     const IV offset = loc - RExC_precomp;				\
478     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,	\
479 	    (int)offset, RExC_precomp, RExC_precomp + offset);		\
480 } STMT_END
481 
482 #define	ckWARNregdep(loc,m) STMT_START {				\
483     const IV offset = loc - RExC_precomp;				\
484     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),	\
485 	    m REPORT_LOCATION,						\
486 	    (int)offset, RExC_precomp, RExC_precomp + offset);		\
487 } STMT_END
488 
489 #define	ckWARN2reg(loc, m, a1) STMT_START {				\
490     const IV offset = loc - RExC_precomp;				\
491     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,	\
492 	    a1, (int)offset, RExC_precomp, RExC_precomp + offset);	\
493 } STMT_END
494 
495 #define	vWARN3(loc, m, a1, a2) STMT_START {				\
496     const IV offset = loc - RExC_precomp;				\
497     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,		\
498 	    a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);	\
499 } STMT_END
500 
501 #define	ckWARN3reg(loc, m, a1, a2) STMT_START {				\
502     const IV offset = loc - RExC_precomp;				\
503     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,	\
504 	    a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);	\
505 } STMT_END
506 
507 #define	vWARN4(loc, m, a1, a2, a3) STMT_START {				\
508     const IV offset = loc - RExC_precomp;				\
509     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,		\
510 	    a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
511 } STMT_END
512 
513 #define	ckWARN4reg(loc, m, a1, a2, a3) STMT_START {			\
514     const IV offset = loc - RExC_precomp;				\
515     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,	\
516 	    a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
517 } STMT_END
518 
519 #define	vWARN5(loc, m, a1, a2, a3, a4) STMT_START {			\
520     const IV offset = loc - RExC_precomp;				\
521     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,		\
522 	    a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
523 } STMT_END
524 
525 
526 /* Allow for side effects in s */
527 #define REGC(c,s) STMT_START {			\
528     if (!SIZE_ONLY) *(s) = (c); else (void)(s);	\
529 } STMT_END
530 
531 /* Macros for recording node offsets.   20001227 mjd@plover.com
532  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
533  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
534  * Element 0 holds the number n.
535  * Position is 1 indexed.
536  */
537 #ifndef RE_TRACK_PATTERN_OFFSETS
538 #define Set_Node_Offset_To_R(node,byte)
539 #define Set_Node_Offset(node,byte)
540 #define Set_Cur_Node_Offset
541 #define Set_Node_Length_To_R(node,len)
542 #define Set_Node_Length(node,len)
543 #define Set_Node_Cur_Length(node)
544 #define Node_Offset(n)
545 #define Node_Length(n)
546 #define Set_Node_Offset_Length(node,offset,len)
547 #define ProgLen(ri) ri->u.proglen
548 #define SetProgLen(ri,x) ri->u.proglen = x
549 #else
550 #define ProgLen(ri) ri->u.offsets[0]
551 #define SetProgLen(ri,x) ri->u.offsets[0] = x
552 #define Set_Node_Offset_To_R(node,byte) STMT_START {			\
553     if (! SIZE_ONLY) {							\
554 	MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",		\
555 		    __LINE__, (int)(node), (int)(byte)));		\
556 	if((node) < 0) {						\
557 	    Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
558 	} else {							\
559 	    RExC_offsets[2*(node)-1] = (byte);				\
560 	}								\
561     }									\
562 } STMT_END
563 
564 #define Set_Node_Offset(node,byte) \
565     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
566 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
567 
568 #define Set_Node_Length_To_R(node,len) STMT_START {			\
569     if (! SIZE_ONLY) {							\
570 	MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",		\
571 		__LINE__, (int)(node), (int)(len)));			\
572 	if((node) < 0) {						\
573 	    Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
574 	} else {							\
575 	    RExC_offsets[2*(node)] = (len);				\
576 	}								\
577     }									\
578 } STMT_END
579 
580 #define Set_Node_Length(node,len) \
581     Set_Node_Length_To_R((node)-RExC_emit_start, len)
582 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
583 #define Set_Node_Cur_Length(node) \
584     Set_Node_Length(node, RExC_parse - parse_start)
585 
586 /* Get offsets and lengths */
587 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
588 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
589 
590 #define Set_Node_Offset_Length(node,offset,len) STMT_START {	\
591     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));	\
592     Set_Node_Length_To_R((node)-RExC_emit_start, (len));	\
593 } STMT_END
594 #endif
595 
596 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
597 #define EXPERIMENTAL_INPLACESCAN
598 #endif /*RE_TRACK_PATTERN_OFFSETS*/
599 
600 #define DEBUG_STUDYDATA(str,data,depth)                              \
601 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
602     PerlIO_printf(Perl_debug_log,                                    \
603         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
604         " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
605         (int)(depth)*2, "",                                          \
606         (IV)((data)->pos_min),                                       \
607         (IV)((data)->pos_delta),                                     \
608         (UV)((data)->flags),                                         \
609         (IV)((data)->whilem_c),                                      \
610         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
611         is_inf ? "INF " : ""                                         \
612     );                                                               \
613     if ((data)->last_found)                                          \
614         PerlIO_printf(Perl_debug_log,                                \
615             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
616             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
617             SvPVX_const((data)->last_found),                         \
618             (IV)((data)->last_end),                                  \
619             (IV)((data)->last_start_min),                            \
620             (IV)((data)->last_start_max),                            \
621             ((data)->longest &&                                      \
622              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
623             SvPVX_const((data)->longest_fixed),                      \
624             (IV)((data)->offset_fixed),                              \
625             ((data)->longest &&                                      \
626              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
627             SvPVX_const((data)->longest_float),                      \
628             (IV)((data)->offset_float_min),                          \
629             (IV)((data)->offset_float_max)                           \
630         );                                                           \
631     PerlIO_printf(Perl_debug_log,"\n");                              \
632 });
633 
634 static void clear_re(pTHX_ void *r);
635 
636 /* Mark that we cannot extend a found fixed substring at this point.
637    Update the longest found anchored substring and the longest found
638    floating substrings if needed. */
639 
640 STATIC void
641 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
642 {
643     const STRLEN l = CHR_SVLEN(data->last_found);
644     const STRLEN old_l = CHR_SVLEN(*data->longest);
645     GET_RE_DEBUG_FLAGS_DECL;
646 
647     PERL_ARGS_ASSERT_SCAN_COMMIT;
648 
649     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
650 	SvSetMagicSV(*data->longest, data->last_found);
651 	if (*data->longest == data->longest_fixed) {
652 	    data->offset_fixed = l ? data->last_start_min : data->pos_min;
653 	    if (data->flags & SF_BEFORE_EOL)
654 		data->flags
655 		    |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
656 	    else
657 		data->flags &= ~SF_FIX_BEFORE_EOL;
658 	    data->minlen_fixed=minlenp;
659 	    data->lookbehind_fixed=0;
660 	}
661 	else { /* *data->longest == data->longest_float */
662 	    data->offset_float_min = l ? data->last_start_min : data->pos_min;
663 	    data->offset_float_max = (l
664 				      ? data->last_start_max
665 				      : data->pos_min + data->pos_delta);
666 	    if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
667 		data->offset_float_max = I32_MAX;
668 	    if (data->flags & SF_BEFORE_EOL)
669 		data->flags
670 		    |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
671 	    else
672 		data->flags &= ~SF_FL_BEFORE_EOL;
673             data->minlen_float=minlenp;
674             data->lookbehind_float=0;
675 	}
676     }
677     SvCUR_set(data->last_found, 0);
678     {
679 	SV * const sv = data->last_found;
680 	if (SvUTF8(sv) && SvMAGICAL(sv)) {
681 	    MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
682 	    if (mg)
683 		mg->mg_len = 0;
684 	}
685     }
686     data->last_end = -1;
687     data->flags &= ~SF_BEFORE_EOL;
688     DEBUG_STUDYDATA("commit: ",data,0);
689 }
690 
691 /* Can match anything (initialization) */
692 STATIC void
693 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
694 {
695     PERL_ARGS_ASSERT_CL_ANYTHING;
696 
697     ANYOF_CLASS_ZERO(cl);
698     ANYOF_BITMAP_SETALL(cl);
699     cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
700     if (LOC)
701 	cl->flags |= ANYOF_LOCALE;
702 }
703 
704 /* Can match anything (initialization) */
705 STATIC int
706 S_cl_is_anything(const struct regnode_charclass_class *cl)
707 {
708     int value;
709 
710     PERL_ARGS_ASSERT_CL_IS_ANYTHING;
711 
712     for (value = 0; value <= ANYOF_MAX; value += 2)
713 	if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
714 	    return 1;
715     if (!(cl->flags & ANYOF_UNICODE_ALL))
716 	return 0;
717     if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
718 	return 0;
719     return 1;
720 }
721 
722 /* Can match anything (initialization) */
723 STATIC void
724 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
725 {
726     PERL_ARGS_ASSERT_CL_INIT;
727 
728     Zero(cl, 1, struct regnode_charclass_class);
729     cl->type = ANYOF;
730     cl_anything(pRExC_state, cl);
731 }
732 
733 STATIC void
734 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
735 {
736     PERL_ARGS_ASSERT_CL_INIT_ZERO;
737 
738     Zero(cl, 1, struct regnode_charclass_class);
739     cl->type = ANYOF;
740     cl_anything(pRExC_state, cl);
741     if (LOC)
742 	cl->flags |= ANYOF_LOCALE;
743 }
744 
745 /* 'And' a given class with another one.  Can create false positives */
746 /* We assume that cl is not inverted */
747 STATIC void
748 S_cl_and(struct regnode_charclass_class *cl,
749 	const struct regnode_charclass_class *and_with)
750 {
751     PERL_ARGS_ASSERT_CL_AND;
752 
753     assert(and_with->type == ANYOF);
754     if (!(and_with->flags & ANYOF_CLASS)
755 	&& !(cl->flags & ANYOF_CLASS)
756 	&& (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
757 	&& !(and_with->flags & ANYOF_FOLD)
758 	&& !(cl->flags & ANYOF_FOLD)) {
759 	int i;
760 
761 	if (and_with->flags & ANYOF_INVERT)
762 	    for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
763 		cl->bitmap[i] &= ~and_with->bitmap[i];
764 	else
765 	    for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
766 		cl->bitmap[i] &= and_with->bitmap[i];
767     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
768     if (!(and_with->flags & ANYOF_EOS))
769 	cl->flags &= ~ANYOF_EOS;
770 
771     if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
772 	!(and_with->flags & ANYOF_INVERT)) {
773 	cl->flags &= ~ANYOF_UNICODE_ALL;
774 	cl->flags |= ANYOF_UNICODE;
775 	ARG_SET(cl, ARG(and_with));
776     }
777     if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
778 	!(and_with->flags & ANYOF_INVERT))
779 	cl->flags &= ~ANYOF_UNICODE_ALL;
780     if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
781 	!(and_with->flags & ANYOF_INVERT))
782 	cl->flags &= ~ANYOF_UNICODE;
783 }
784 
785 /* 'OR' a given class with another one.  Can create false positives */
786 /* We assume that cl is not inverted */
787 STATIC void
788 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
789 {
790     PERL_ARGS_ASSERT_CL_OR;
791 
792     if (or_with->flags & ANYOF_INVERT) {
793 	/* We do not use
794 	 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
795 	 *   <= (B1 | !B2) | (CL1 | !CL2)
796 	 * which is wasteful if CL2 is small, but we ignore CL2:
797 	 *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
798 	 * XXXX Can we handle case-fold?  Unclear:
799 	 *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
800 	 *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
801 	 */
802 	if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
803 	     && !(or_with->flags & ANYOF_FOLD)
804 	     && !(cl->flags & ANYOF_FOLD) ) {
805 	    int i;
806 
807 	    for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
808 		cl->bitmap[i] |= ~or_with->bitmap[i];
809 	} /* XXXX: logic is complicated otherwise */
810 	else {
811 	    cl_anything(pRExC_state, cl);
812 	}
813     } else {
814 	/* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
815 	if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
816 	     && (!(or_with->flags & ANYOF_FOLD)
817 		 || (cl->flags & ANYOF_FOLD)) ) {
818 	    int i;
819 
820 	    /* OR char bitmap and class bitmap separately */
821 	    for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
822 		cl->bitmap[i] |= or_with->bitmap[i];
823 	    if (or_with->flags & ANYOF_CLASS) {
824 		for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
825 		    cl->classflags[i] |= or_with->classflags[i];
826 		cl->flags |= ANYOF_CLASS;
827 	    }
828 	}
829 	else { /* XXXX: logic is complicated, leave it along for a moment. */
830 	    cl_anything(pRExC_state, cl);
831 	}
832     }
833     if (or_with->flags & ANYOF_EOS)
834 	cl->flags |= ANYOF_EOS;
835 
836     if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
837 	ARG(cl) != ARG(or_with)) {
838 	cl->flags |= ANYOF_UNICODE_ALL;
839 	cl->flags &= ~ANYOF_UNICODE;
840     }
841     if (or_with->flags & ANYOF_UNICODE_ALL) {
842 	cl->flags |= ANYOF_UNICODE_ALL;
843 	cl->flags &= ~ANYOF_UNICODE;
844     }
845 }
846 
847 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
848 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
849 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
850 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
851 
852 
853 #ifdef DEBUGGING
854 /*
855    dump_trie(trie,widecharmap,revcharmap)
856    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
857    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
858 
859    These routines dump out a trie in a somewhat readable format.
860    The _interim_ variants are used for debugging the interim
861    tables that are used to generate the final compressed
862    representation which is what dump_trie expects.
863 
864    Part of the reason for their existance is to provide a form
865    of documentation as to how the different representations function.
866 
867 */
868 
869 /*
870   Dumps the final compressed table form of the trie to Perl_debug_log.
871   Used for debugging make_trie().
872 */
873 
874 STATIC void
875 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
876 	    AV *revcharmap, U32 depth)
877 {
878     U32 state;
879     SV *sv=sv_newmortal();
880     int colwidth= widecharmap ? 6 : 4;
881     GET_RE_DEBUG_FLAGS_DECL;
882 
883     PERL_ARGS_ASSERT_DUMP_TRIE;
884 
885     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
886         (int)depth * 2 + 2,"",
887         "Match","Base","Ofs" );
888 
889     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
890 	SV ** const tmp = av_fetch( revcharmap, state, 0);
891         if ( tmp ) {
892             PerlIO_printf( Perl_debug_log, "%*s",
893                 colwidth,
894                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
895 	                    PL_colors[0], PL_colors[1],
896 	                    (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
897 	                    PERL_PV_ESCAPE_FIRSTCHAR
898                 )
899             );
900         }
901     }
902     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
903         (int)depth * 2 + 2,"");
904 
905     for( state = 0 ; state < trie->uniquecharcount ; state++ )
906         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
907     PerlIO_printf( Perl_debug_log, "\n");
908 
909     for( state = 1 ; state < trie->statecount ; state++ ) {
910 	const U32 base = trie->states[ state ].trans.base;
911 
912         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
913 
914         if ( trie->states[ state ].wordnum ) {
915             PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
916         } else {
917             PerlIO_printf( Perl_debug_log, "%6s", "" );
918         }
919 
920         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
921 
922         if ( base ) {
923             U32 ofs = 0;
924 
925             while( ( base + ofs  < trie->uniquecharcount ) ||
926                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
927                      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
928                     ofs++;
929 
930             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
931 
932             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
933                 if ( ( base + ofs >= trie->uniquecharcount ) &&
934                      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
935                      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
936                 {
937                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
938                     colwidth,
939                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
940                 } else {
941                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
942                 }
943             }
944 
945             PerlIO_printf( Perl_debug_log, "]");
946 
947         }
948         PerlIO_printf( Perl_debug_log, "\n" );
949     }
950 }
951 /*
952   Dumps a fully constructed but uncompressed trie in list form.
953   List tries normally only are used for construction when the number of
954   possible chars (trie->uniquecharcount) is very high.
955   Used for debugging make_trie().
956 */
957 STATIC void
958 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
959 			 HV *widecharmap, AV *revcharmap, U32 next_alloc,
960 			 U32 depth)
961 {
962     U32 state;
963     SV *sv=sv_newmortal();
964     int colwidth= widecharmap ? 6 : 4;
965     GET_RE_DEBUG_FLAGS_DECL;
966 
967     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
968 
969     /* print out the table precompression.  */
970     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
971         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
972         "------:-----+-----------------\n" );
973 
974     for( state=1 ; state < next_alloc ; state ++ ) {
975         U16 charid;
976 
977         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
978             (int)depth * 2 + 2,"", (UV)state  );
979         if ( ! trie->states[ state ].wordnum ) {
980             PerlIO_printf( Perl_debug_log, "%5s| ","");
981         } else {
982             PerlIO_printf( Perl_debug_log, "W%4x| ",
983                 trie->states[ state ].wordnum
984             );
985         }
986         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
987 	    SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
988 	    if ( tmp ) {
989                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
990                     colwidth,
991                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
992 	                    PL_colors[0], PL_colors[1],
993 	                    (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
994 	                    PERL_PV_ESCAPE_FIRSTCHAR
995                     ) ,
996                     TRIE_LIST_ITEM(state,charid).forid,
997                     (UV)TRIE_LIST_ITEM(state,charid).newstate
998                 );
999                 if (!(charid % 10))
1000                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1001                         (int)((depth * 2) + 14), "");
1002             }
1003         }
1004         PerlIO_printf( Perl_debug_log, "\n");
1005     }
1006 }
1007 
1008 /*
1009   Dumps a fully constructed but uncompressed trie in table form.
1010   This is the normal DFA style state transition table, with a few
1011   twists to facilitate compression later.
1012   Used for debugging make_trie().
1013 */
1014 STATIC void
1015 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1016 			  HV *widecharmap, AV *revcharmap, U32 next_alloc,
1017 			  U32 depth)
1018 {
1019     U32 state;
1020     U16 charid;
1021     SV *sv=sv_newmortal();
1022     int colwidth= widecharmap ? 6 : 4;
1023     GET_RE_DEBUG_FLAGS_DECL;
1024 
1025     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1026 
1027     /*
1028        print out the table precompression so that we can do a visual check
1029        that they are identical.
1030      */
1031 
1032     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1033 
1034     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1035 	SV ** const tmp = av_fetch( revcharmap, charid, 0);
1036         if ( tmp ) {
1037             PerlIO_printf( Perl_debug_log, "%*s",
1038                 colwidth,
1039                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1040 	                    PL_colors[0], PL_colors[1],
1041 	                    (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1042 	                    PERL_PV_ESCAPE_FIRSTCHAR
1043                 )
1044             );
1045         }
1046     }
1047 
1048     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1049 
1050     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1051         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1052     }
1053 
1054     PerlIO_printf( Perl_debug_log, "\n" );
1055 
1056     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1057 
1058         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1059             (int)depth * 2 + 2,"",
1060             (UV)TRIE_NODENUM( state ) );
1061 
1062         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1063             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1064             if (v)
1065                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1066             else
1067                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1068         }
1069         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1070             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1071         } else {
1072             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1073             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1074         }
1075     }
1076 }
1077 
1078 #endif
1079 
1080 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1081   startbranch: the first branch in the whole branch sequence
1082   first      : start branch of sequence of branch-exact nodes.
1083 	       May be the same as startbranch
1084   last       : Thing following the last branch.
1085 	       May be the same as tail.
1086   tail       : item following the branch sequence
1087   count      : words in the sequence
1088   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1089   depth      : indent depth
1090 
1091 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1092 
1093 A trie is an N'ary tree where the branches are determined by digital
1094 decomposition of the key. IE, at the root node you look up the 1st character and
1095 follow that branch repeat until you find the end of the branches. Nodes can be
1096 marked as "accepting" meaning they represent a complete word. Eg:
1097 
1098   /he|she|his|hers/
1099 
1100 would convert into the following structure. Numbers represent states, letters
1101 following numbers represent valid transitions on the letter from that state, if
1102 the number is in square brackets it represents an accepting state, otherwise it
1103 will be in parenthesis.
1104 
1105       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1106       |    |
1107       |   (2)
1108       |    |
1109      (1)   +-i->(6)-+-s->[7]
1110       |
1111       +-s->(3)-+-h->(4)-+-e->[5]
1112 
1113       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1114 
1115 This shows that when matching against the string 'hers' we will begin at state 1
1116 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1117 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1118 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1119 single traverse. We store a mapping from accepting to state to which word was
1120 matched, and then when we have multiple possibilities we try to complete the
1121 rest of the regex in the order in which they occured in the alternation.
1122 
1123 The only prior NFA like behaviour that would be changed by the TRIE support is
1124 the silent ignoring of duplicate alternations which are of the form:
1125 
1126  / (DUPE|DUPE) X? (?{ ... }) Y /x
1127 
1128 Thus EVAL blocks follwing a trie may be called a different number of times with
1129 and without the optimisation. With the optimisations dupes will be silently
1130 ignored. This inconsistant behaviour of EVAL type nodes is well established as
1131 the following demonstrates:
1132 
1133  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1134 
1135 which prints out 'word' three times, but
1136 
1137  'words'=~/(word|word|word)(?{ print $1 })S/
1138 
1139 which doesnt print it out at all. This is due to other optimisations kicking in.
1140 
1141 Example of what happens on a structural level:
1142 
1143 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1144 
1145    1: CURLYM[1] {1,32767}(18)
1146    5:   BRANCH(8)
1147    6:     EXACT <ac>(16)
1148    8:   BRANCH(11)
1149    9:     EXACT <ad>(16)
1150   11:   BRANCH(14)
1151   12:     EXACT <ab>(16)
1152   16:   SUCCEED(0)
1153   17:   NOTHING(18)
1154   18: END(0)
1155 
1156 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1157 and should turn into:
1158 
1159    1: CURLYM[1] {1,32767}(18)
1160    5:   TRIE(16)
1161 	[Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1162 	  <ac>
1163 	  <ad>
1164 	  <ab>
1165   16:   SUCCEED(0)
1166   17:   NOTHING(18)
1167   18: END(0)
1168 
1169 Cases where tail != last would be like /(?foo|bar)baz/:
1170 
1171    1: BRANCH(4)
1172    2:   EXACT <foo>(8)
1173    4: BRANCH(7)
1174    5:   EXACT <bar>(8)
1175    7: TAIL(8)
1176    8: EXACT <baz>(10)
1177   10: END(0)
1178 
1179 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1180 and would end up looking like:
1181 
1182     1: TRIE(8)
1183       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1184 	<foo>
1185 	<bar>
1186    7: TAIL(8)
1187    8: EXACT <baz>(10)
1188   10: END(0)
1189 
1190     d = uvuni_to_utf8_flags(d, uv, 0);
1191 
1192 is the recommended Unicode-aware way of saying
1193 
1194     *(d++) = uv;
1195 */
1196 
1197 #define TRIE_STORE_REVCHAR                                                 \
1198     STMT_START {                                                           \
1199 	if (UTF) {							   \
1200 	    SV *zlopp = newSV(2);					   \
1201 	    unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);	   \
1202 	    unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1203 	    SvCUR_set(zlopp, kapow - flrbbbbb);				   \
1204 	    SvPOK_on(zlopp);						   \
1205 	    SvUTF8_on(zlopp);						   \
1206 	    av_push(revcharmap, zlopp);					   \
1207 	} else {							   \
1208 	    char ooooff = (char)uvc;					   	   \
1209 	    av_push(revcharmap, newSVpvn(&ooooff, 1));			   \
1210 	}								   \
1211         } STMT_END
1212 
1213 #define TRIE_READ_CHAR STMT_START {                                           \
1214     wordlen++;                                                                \
1215     if ( UTF ) {                                                              \
1216 	if ( folder ) {                                                       \
1217 	    if ( foldlen > 0 ) {                                              \
1218 	       uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags );     \
1219 	       foldlen -= len;                                                \
1220 	       scan += len;                                                   \
1221 	       len = 0;                                                       \
1222 	    } else {                                                          \
1223 		uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1224 		uvc = to_uni_fold( uvc, foldbuf, &foldlen );                  \
1225 		foldlen -= UNISKIP( uvc );                                    \
1226 		scan = foldbuf + UNISKIP( uvc );                              \
1227 	    }                                                                 \
1228 	} else {                                                              \
1229 	    uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1230 	}                                                                     \
1231     } else {                                                                  \
1232 	uvc = (U32)*uc;                                                       \
1233 	len = 1;                                                              \
1234     }                                                                         \
1235 } STMT_END
1236 
1237 
1238 
1239 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1240     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1241 	U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1242 	Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1243     }                                                           \
1244     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1245     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1246     TRIE_LIST_CUR( state )++;                                   \
1247 } STMT_END
1248 
1249 #define TRIE_LIST_NEW(state) STMT_START {                       \
1250     Newxz( trie->states[ state ].trans.list,               \
1251 	4, reg_trie_trans_le );                                 \
1252      TRIE_LIST_CUR( state ) = 1;                                \
1253      TRIE_LIST_LEN( state ) = 4;                                \
1254 } STMT_END
1255 
1256 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1257     U16 dupe= trie->states[ state ].wordnum;                    \
1258     regnode * const noper_next = regnext( noper );              \
1259                                                                 \
1260     if (trie->wordlen)                                          \
1261         trie->wordlen[ curword ] = wordlen;                     \
1262     DEBUG_r({                                                   \
1263         /* store the word for dumping */                        \
1264         SV* tmp;                                                \
1265         if (OP(noper) != NOTHING)                               \
1266             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);	\
1267         else                                                    \
1268             tmp = newSVpvn_utf8( "", 0, UTF );			\
1269         av_push( trie_words, tmp );                             \
1270     });                                                         \
1271                                                                 \
1272     curword++;                                                  \
1273                                                                 \
1274     if ( noper_next < tail ) {                                  \
1275         if (!trie->jump)                                        \
1276             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1277         trie->jump[curword] = (U16)(noper_next - convert);      \
1278         if (!jumper)                                            \
1279             jumper = noper_next;                                \
1280         if (!nextbranch)                                        \
1281             nextbranch= regnext(cur);                           \
1282     }                                                           \
1283                                                                 \
1284     if ( dupe ) {                                               \
1285         /* So it's a dupe. This means we need to maintain a   */\
1286         /* linked-list from the first to the next.            */\
1287         /* we only allocate the nextword buffer when there    */\
1288         /* a dupe, so first time we have to do the allocation */\
1289         if (!trie->nextword)                                    \
1290             trie->nextword = (U16 *)					\
1291 		PerlMemShared_calloc( word_count + 1, sizeof(U16));	\
1292         while ( trie->nextword[dupe] )                          \
1293             dupe= trie->nextword[dupe];                         \
1294         trie->nextword[dupe]= curword;                          \
1295     } else {                                                    \
1296         /* we haven't inserted this word yet.                */ \
1297         trie->states[ state ].wordnum = curword;                \
1298     }                                                           \
1299 } STMT_END
1300 
1301 
1302 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)		\
1303      ( ( base + charid >=  ucharcount					\
1304          && base + charid < ubound					\
1305          && state == trie->trans[ base - ucharcount + charid ].check	\
1306          && trie->trans[ base - ucharcount + charid ].next )		\
1307            ? trie->trans[ base - ucharcount + charid ].next		\
1308            : ( state==1 ? special : 0 )					\
1309       )
1310 
1311 #define MADE_TRIE       1
1312 #define MADE_JUMP_TRIE  2
1313 #define MADE_EXACT_TRIE 4
1314 
1315 STATIC I32
1316 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1317 {
1318     dVAR;
1319     /* first pass, loop through and scan words */
1320     reg_trie_data *trie;
1321     HV *widecharmap = NULL;
1322     AV *revcharmap = newAV();
1323     regnode *cur;
1324     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1325     STRLEN len = 0;
1326     UV uvc = 0;
1327     U16 curword = 0;
1328     U32 next_alloc = 0;
1329     regnode *jumper = NULL;
1330     regnode *nextbranch = NULL;
1331     regnode *convert = NULL;
1332     /* we just use folder as a flag in utf8 */
1333     const U8 * const folder = ( flags == EXACTF
1334                        ? PL_fold
1335                        : ( flags == EXACTFL
1336                            ? PL_fold_locale
1337                            : NULL
1338                          )
1339                      );
1340 
1341 #ifdef DEBUGGING
1342     const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1343     AV *trie_words = NULL;
1344     /* along with revcharmap, this only used during construction but both are
1345      * useful during debugging so we store them in the struct when debugging.
1346      */
1347 #else
1348     const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1349     STRLEN trie_charcount=0;
1350 #endif
1351     SV *re_trie_maxbuff;
1352     GET_RE_DEBUG_FLAGS_DECL;
1353 
1354     PERL_ARGS_ASSERT_MAKE_TRIE;
1355 #ifndef DEBUGGING
1356     PERL_UNUSED_ARG(depth);
1357 #endif
1358 
1359     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1360     trie->refcount = 1;
1361     trie->startstate = 1;
1362     trie->wordcount = word_count;
1363     RExC_rxi->data->data[ data_slot ] = (void*)trie;
1364     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1365     if (!(UTF && folder))
1366 	trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1367     DEBUG_r({
1368         trie_words = newAV();
1369     });
1370 
1371     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1372     if (!SvIOK(re_trie_maxbuff)) {
1373         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1374     }
1375     DEBUG_OPTIMISE_r({
1376                 PerlIO_printf( Perl_debug_log,
1377                   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1378                   (int)depth * 2 + 2, "",
1379                   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1380                   REG_NODE_NUM(last), REG_NODE_NUM(tail),
1381                   (int)depth);
1382     });
1383 
1384    /* Find the node we are going to overwrite */
1385     if ( first == startbranch && OP( last ) != BRANCH ) {
1386         /* whole branch chain */
1387         convert = first;
1388     } else {
1389         /* branch sub-chain */
1390         convert = NEXTOPER( first );
1391     }
1392 
1393     /*  -- First loop and Setup --
1394 
1395        We first traverse the branches and scan each word to determine if it
1396        contains widechars, and how many unique chars there are, this is
1397        important as we have to build a table with at least as many columns as we
1398        have unique chars.
1399 
1400        We use an array of integers to represent the character codes 0..255
1401        (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1402        native representation of the character value as the key and IV's for the
1403        coded index.
1404 
1405        *TODO* If we keep track of how many times each character is used we can
1406        remap the columns so that the table compression later on is more
1407        efficient in terms of memory by ensuring most common value is in the
1408        middle and the least common are on the outside.  IMO this would be better
1409        than a most to least common mapping as theres a decent chance the most
1410        common letter will share a node with the least common, meaning the node
1411        will not be compressable. With a middle is most common approach the worst
1412        case is when we have the least common nodes twice.
1413 
1414      */
1415 
1416     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1417         regnode * const noper = NEXTOPER( cur );
1418         const U8 *uc = (U8*)STRING( noper );
1419         const U8 * const e  = uc + STR_LEN( noper );
1420         STRLEN foldlen = 0;
1421         U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1422         const U8 *scan = (U8*)NULL;
1423         U32 wordlen      = 0;         /* required init */
1424         STRLEN chars = 0;
1425         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1426 
1427         if (OP(noper) == NOTHING) {
1428             trie->minlen= 0;
1429             continue;
1430         }
1431         if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1432             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1433                                           regardless of encoding */
1434 
1435         for ( ; uc < e ; uc += len ) {
1436             TRIE_CHARCOUNT(trie)++;
1437             TRIE_READ_CHAR;
1438             chars++;
1439             if ( uvc < 256 ) {
1440                 if ( !trie->charmap[ uvc ] ) {
1441                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1442                     if ( folder )
1443                         trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1444                     TRIE_STORE_REVCHAR;
1445                 }
1446                 if ( set_bit ) {
1447                     /* store the codepoint in the bitmap, and if its ascii
1448                        also store its folded equivelent. */
1449                     TRIE_BITMAP_SET(trie,uvc);
1450 
1451 		    /* store the folded codepoint */
1452 		    if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1453 
1454 		    if ( !UTF ) {
1455 			/* store first byte of utf8 representation of
1456 			   codepoints in the 127 < uvc < 256 range */
1457 			if (127 < uvc && uvc < 192) {
1458 			    TRIE_BITMAP_SET(trie,194);
1459 			} else if (191 < uvc ) {
1460 			    TRIE_BITMAP_SET(trie,195);
1461 			/* && uvc < 256 -- we know uvc is < 256 already */
1462 			}
1463 		    }
1464                     set_bit = 0; /* We've done our bit :-) */
1465                 }
1466             } else {
1467                 SV** svpp;
1468                 if ( !widecharmap )
1469                     widecharmap = newHV();
1470 
1471                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1472 
1473                 if ( !svpp )
1474                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1475 
1476                 if ( !SvTRUE( *svpp ) ) {
1477                     sv_setiv( *svpp, ++trie->uniquecharcount );
1478                     TRIE_STORE_REVCHAR;
1479                 }
1480             }
1481         }
1482         if( cur == first ) {
1483             trie->minlen=chars;
1484             trie->maxlen=chars;
1485         } else if (chars < trie->minlen) {
1486             trie->minlen=chars;
1487         } else if (chars > trie->maxlen) {
1488             trie->maxlen=chars;
1489         }
1490 
1491     } /* end first pass */
1492     DEBUG_TRIE_COMPILE_r(
1493         PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1494                 (int)depth * 2 + 2,"",
1495                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1496 		(int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1497 		(int)trie->minlen, (int)trie->maxlen )
1498     );
1499     trie->wordlen = (U32 *) PerlMemShared_calloc( word_count, sizeof(U32) );
1500 
1501     /*
1502         We now know what we are dealing with in terms of unique chars and
1503         string sizes so we can calculate how much memory a naive
1504         representation using a flat table  will take. If it's over a reasonable
1505         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1506         conservative but potentially much slower representation using an array
1507         of lists.
1508 
1509         At the end we convert both representations into the same compressed
1510         form that will be used in regexec.c for matching with. The latter
1511         is a form that cannot be used to construct with but has memory
1512         properties similar to the list form and access properties similar
1513         to the table form making it both suitable for fast searches and
1514         small enough that its feasable to store for the duration of a program.
1515 
1516         See the comment in the code where the compressed table is produced
1517         inplace from the flat tabe representation for an explanation of how
1518         the compression works.
1519 
1520     */
1521 
1522 
1523     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1524         /*
1525             Second Pass -- Array Of Lists Representation
1526 
1527             Each state will be represented by a list of charid:state records
1528             (reg_trie_trans_le) the first such element holds the CUR and LEN
1529             points of the allocated array. (See defines above).
1530 
1531             We build the initial structure using the lists, and then convert
1532             it into the compressed table form which allows faster lookups
1533             (but cant be modified once converted).
1534         */
1535 
1536         STRLEN transcount = 1;
1537 
1538         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1539             "%*sCompiling trie using list compiler\n",
1540             (int)depth * 2 + 2, ""));
1541 
1542 	trie->states = (reg_trie_state *)
1543 	    PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1544 				  sizeof(reg_trie_state) );
1545         TRIE_LIST_NEW(1);
1546         next_alloc = 2;
1547 
1548         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1549 
1550 	    regnode * const noper = NEXTOPER( cur );
1551 	    U8 *uc           = (U8*)STRING( noper );
1552 	    const U8 * const e = uc + STR_LEN( noper );
1553 	    U32 state        = 1;         /* required init */
1554 	    U16 charid       = 0;         /* sanity init */
1555 	    U8 *scan         = (U8*)NULL; /* sanity init */
1556 	    STRLEN foldlen   = 0;         /* required init */
1557             U32 wordlen      = 0;         /* required init */
1558 	    U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1559 
1560             if (OP(noper) != NOTHING) {
1561                 for ( ; uc < e ; uc += len ) {
1562 
1563                     TRIE_READ_CHAR;
1564 
1565                     if ( uvc < 256 ) {
1566                         charid = trie->charmap[ uvc ];
1567 		    } else {
1568                         SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1569                         if ( !svpp ) {
1570                             charid = 0;
1571                         } else {
1572                             charid=(U16)SvIV( *svpp );
1573                         }
1574 		    }
1575                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1576                     if ( charid ) {
1577 
1578                         U16 check;
1579                         U32 newstate = 0;
1580 
1581                         charid--;
1582                         if ( !trie->states[ state ].trans.list ) {
1583                             TRIE_LIST_NEW( state );
1584 			}
1585                         for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1586                             if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1587                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1588                                 break;
1589                             }
1590                         }
1591                         if ( ! newstate ) {
1592                             newstate = next_alloc++;
1593                             TRIE_LIST_PUSH( state, charid, newstate );
1594                             transcount++;
1595                         }
1596                         state = newstate;
1597                     } else {
1598                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1599 		    }
1600 		}
1601 	    }
1602             TRIE_HANDLE_WORD(state);
1603 
1604         } /* end second pass */
1605 
1606         /* next alloc is the NEXT state to be allocated */
1607         trie->statecount = next_alloc;
1608         trie->states = (reg_trie_state *)
1609 	    PerlMemShared_realloc( trie->states,
1610 				   next_alloc
1611 				   * sizeof(reg_trie_state) );
1612 
1613         /* and now dump it out before we compress it */
1614         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1615 							 revcharmap, next_alloc,
1616 							 depth+1)
1617         );
1618 
1619         trie->trans = (reg_trie_trans *)
1620 	    PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1621         {
1622             U32 state;
1623             U32 tp = 0;
1624             U32 zp = 0;
1625 
1626 
1627             for( state=1 ; state < next_alloc ; state ++ ) {
1628                 U32 base=0;
1629 
1630                 /*
1631                 DEBUG_TRIE_COMPILE_MORE_r(
1632                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1633                 );
1634                 */
1635 
1636                 if (trie->states[state].trans.list) {
1637                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1638                     U16 maxid=minid;
1639 		    U16 idx;
1640 
1641                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1642 			const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1643 			if ( forid < minid ) {
1644 			    minid=forid;
1645 			} else if ( forid > maxid ) {
1646 			    maxid=forid;
1647 			}
1648                     }
1649                     if ( transcount < tp + maxid - minid + 1) {
1650                         transcount *= 2;
1651 			trie->trans = (reg_trie_trans *)
1652 			    PerlMemShared_realloc( trie->trans,
1653 						     transcount
1654 						     * sizeof(reg_trie_trans) );
1655                         Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1656                     }
1657                     base = trie->uniquecharcount + tp - minid;
1658                     if ( maxid == minid ) {
1659                         U32 set = 0;
1660                         for ( ; zp < tp ; zp++ ) {
1661                             if ( ! trie->trans[ zp ].next ) {
1662                                 base = trie->uniquecharcount + zp - minid;
1663                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1664                                 trie->trans[ zp ].check = state;
1665                                 set = 1;
1666                                 break;
1667                             }
1668                         }
1669                         if ( !set ) {
1670                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1671                             trie->trans[ tp ].check = state;
1672                             tp++;
1673                             zp = tp;
1674                         }
1675                     } else {
1676                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1677                             const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1678                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1679                             trie->trans[ tid ].check = state;
1680                         }
1681                         tp += ( maxid - minid + 1 );
1682                     }
1683                     Safefree(trie->states[ state ].trans.list);
1684                 }
1685                 /*
1686                 DEBUG_TRIE_COMPILE_MORE_r(
1687                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1688                 );
1689                 */
1690                 trie->states[ state ].trans.base=base;
1691             }
1692             trie->lasttrans = tp + 1;
1693         }
1694     } else {
1695         /*
1696            Second Pass -- Flat Table Representation.
1697 
1698            we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1699            We know that we will need Charcount+1 trans at most to store the data
1700            (one row per char at worst case) So we preallocate both structures
1701            assuming worst case.
1702 
1703            We then construct the trie using only the .next slots of the entry
1704            structs.
1705 
1706            We use the .check field of the first entry of the node  temporarily to
1707            make compression both faster and easier by keeping track of how many non
1708            zero fields are in the node.
1709 
1710            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1711            transition.
1712 
1713            There are two terms at use here: state as a TRIE_NODEIDX() which is a
1714            number representing the first entry of the node, and state as a
1715            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1716            TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1717            are 2 entrys per node. eg:
1718 
1719              A B       A B
1720           1. 2 4    1. 3 7
1721           2. 0 3    3. 0 5
1722           3. 0 0    5. 0 0
1723           4. 0 0    7. 0 0
1724 
1725            The table is internally in the right hand, idx form. However as we also
1726            have to deal with the states array which is indexed by nodenum we have to
1727            use TRIE_NODENUM() to convert.
1728 
1729         */
1730         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1731             "%*sCompiling trie using table compiler\n",
1732             (int)depth * 2 + 2, ""));
1733 
1734 	trie->trans = (reg_trie_trans *)
1735 	    PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1736 				  * trie->uniquecharcount + 1,
1737 				  sizeof(reg_trie_trans) );
1738         trie->states = (reg_trie_state *)
1739 	    PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1740 				  sizeof(reg_trie_state) );
1741         next_alloc = trie->uniquecharcount + 1;
1742 
1743 
1744         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1745 
1746 	    regnode * const noper   = NEXTOPER( cur );
1747 	    const U8 *uc     = (U8*)STRING( noper );
1748 	    const U8 * const e = uc + STR_LEN( noper );
1749 
1750             U32 state        = 1;         /* required init */
1751 
1752             U16 charid       = 0;         /* sanity init */
1753             U32 accept_state = 0;         /* sanity init */
1754             U8 *scan         = (U8*)NULL; /* sanity init */
1755 
1756             STRLEN foldlen   = 0;         /* required init */
1757             U32 wordlen      = 0;         /* required init */
1758             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1759 
1760             if ( OP(noper) != NOTHING ) {
1761                 for ( ; uc < e ; uc += len ) {
1762 
1763                     TRIE_READ_CHAR;
1764 
1765                     if ( uvc < 256 ) {
1766                         charid = trie->charmap[ uvc ];
1767                     } else {
1768                         SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1769                         charid = svpp ? (U16)SvIV(*svpp) : 0;
1770                     }
1771                     if ( charid ) {
1772                         charid--;
1773                         if ( !trie->trans[ state + charid ].next ) {
1774                             trie->trans[ state + charid ].next = next_alloc;
1775                             trie->trans[ state ].check++;
1776                             next_alloc += trie->uniquecharcount;
1777                         }
1778                         state = trie->trans[ state + charid ].next;
1779                     } else {
1780                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1781                     }
1782                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1783                 }
1784             }
1785             accept_state = TRIE_NODENUM( state );
1786             TRIE_HANDLE_WORD(accept_state);
1787 
1788         } /* end second pass */
1789 
1790         /* and now dump it out before we compress it */
1791         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1792 							  revcharmap,
1793 							  next_alloc, depth+1));
1794 
1795         {
1796         /*
1797            * Inplace compress the table.*
1798 
1799            For sparse data sets the table constructed by the trie algorithm will
1800            be mostly 0/FAIL transitions or to put it another way mostly empty.
1801            (Note that leaf nodes will not contain any transitions.)
1802 
1803            This algorithm compresses the tables by eliminating most such
1804            transitions, at the cost of a modest bit of extra work during lookup:
1805 
1806            - Each states[] entry contains a .base field which indicates the
1807            index in the state[] array wheres its transition data is stored.
1808 
1809            - If .base is 0 there are no  valid transitions from that node.
1810 
1811            - If .base is nonzero then charid is added to it to find an entry in
1812            the trans array.
1813 
1814            -If trans[states[state].base+charid].check!=state then the
1815            transition is taken to be a 0/Fail transition. Thus if there are fail
1816            transitions at the front of the node then the .base offset will point
1817            somewhere inside the previous nodes data (or maybe even into a node
1818            even earlier), but the .check field determines if the transition is
1819            valid.
1820 
1821            XXX - wrong maybe?
1822            The following process inplace converts the table to the compressed
1823            table: We first do not compress the root node 1,and mark its all its
1824            .check pointers as 1 and set its .base pointer as 1 as well. This
1825            allows to do a DFA construction from the compressed table later, and
1826            ensures that any .base pointers we calculate later are greater than
1827            0.
1828 
1829            - We set 'pos' to indicate the first entry of the second node.
1830 
1831            - We then iterate over the columns of the node, finding the first and
1832            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1833            and set the .check pointers accordingly, and advance pos
1834            appropriately and repreat for the next node. Note that when we copy
1835            the next pointers we have to convert them from the original
1836            NODEIDX form to NODENUM form as the former is not valid post
1837            compression.
1838 
1839            - If a node has no transitions used we mark its base as 0 and do not
1840            advance the pos pointer.
1841 
1842            - If a node only has one transition we use a second pointer into the
1843            structure to fill in allocated fail transitions from other states.
1844            This pointer is independent of the main pointer and scans forward
1845            looking for null transitions that are allocated to a state. When it
1846            finds one it writes the single transition into the "hole".  If the
1847            pointer doesnt find one the single transition is appended as normal.
1848 
1849            - Once compressed we can Renew/realloc the structures to release the
1850            excess space.
1851 
1852            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1853            specifically Fig 3.47 and the associated pseudocode.
1854 
1855            demq
1856         */
1857         const U32 laststate = TRIE_NODENUM( next_alloc );
1858 	U32 state, charid;
1859         U32 pos = 0, zp=0;
1860         trie->statecount = laststate;
1861 
1862         for ( state = 1 ; state < laststate ; state++ ) {
1863             U8 flag = 0;
1864 	    const U32 stateidx = TRIE_NODEIDX( state );
1865 	    const U32 o_used = trie->trans[ stateidx ].check;
1866 	    U32 used = trie->trans[ stateidx ].check;
1867             trie->trans[ stateidx ].check = 0;
1868 
1869             for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1870                 if ( flag || trie->trans[ stateidx + charid ].next ) {
1871                     if ( trie->trans[ stateidx + charid ].next ) {
1872                         if (o_used == 1) {
1873                             for ( ; zp < pos ; zp++ ) {
1874                                 if ( ! trie->trans[ zp ].next ) {
1875                                     break;
1876                                 }
1877                             }
1878                             trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1879                             trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1880                             trie->trans[ zp ].check = state;
1881                             if ( ++zp > pos ) pos = zp;
1882                             break;
1883                         }
1884                         used--;
1885                     }
1886                     if ( !flag ) {
1887                         flag = 1;
1888                         trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1889                     }
1890                     trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1891                     trie->trans[ pos ].check = state;
1892                     pos++;
1893                 }
1894             }
1895         }
1896         trie->lasttrans = pos + 1;
1897         trie->states = (reg_trie_state *)
1898 	    PerlMemShared_realloc( trie->states, laststate
1899 				   * sizeof(reg_trie_state) );
1900         DEBUG_TRIE_COMPILE_MORE_r(
1901                 PerlIO_printf( Perl_debug_log,
1902 		    "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1903 		    (int)depth * 2 + 2,"",
1904                     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1905 		    (IV)next_alloc,
1906 		    (IV)pos,
1907                     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1908             );
1909 
1910         } /* end table compress */
1911     }
1912     DEBUG_TRIE_COMPILE_MORE_r(
1913             PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1914                 (int)depth * 2 + 2, "",
1915                 (UV)trie->statecount,
1916                 (UV)trie->lasttrans)
1917     );
1918     /* resize the trans array to remove unused space */
1919     trie->trans = (reg_trie_trans *)
1920 	PerlMemShared_realloc( trie->trans, trie->lasttrans
1921 			       * sizeof(reg_trie_trans) );
1922 
1923     /* and now dump out the compressed format */
1924     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
1925 
1926     {   /* Modify the program and insert the new TRIE node*/
1927         U8 nodetype =(U8)(flags & 0xFF);
1928         char *str=NULL;
1929 
1930 #ifdef DEBUGGING
1931         regnode *optimize = NULL;
1932 #ifdef RE_TRACK_PATTERN_OFFSETS
1933 
1934         U32 mjd_offset = 0;
1935         U32 mjd_nodelen = 0;
1936 #endif /* RE_TRACK_PATTERN_OFFSETS */
1937 #endif /* DEBUGGING */
1938         /*
1939            This means we convert either the first branch or the first Exact,
1940            depending on whether the thing following (in 'last') is a branch
1941            or not and whther first is the startbranch (ie is it a sub part of
1942            the alternation or is it the whole thing.)
1943            Assuming its a sub part we conver the EXACT otherwise we convert
1944            the whole branch sequence, including the first.
1945          */
1946         /* Find the node we are going to overwrite */
1947         if ( first != startbranch || OP( last ) == BRANCH ) {
1948             /* branch sub-chain */
1949             NEXT_OFF( first ) = (U16)(last - first);
1950 #ifdef RE_TRACK_PATTERN_OFFSETS
1951             DEBUG_r({
1952                 mjd_offset= Node_Offset((convert));
1953                 mjd_nodelen= Node_Length((convert));
1954             });
1955 #endif
1956             /* whole branch chain */
1957         }
1958 #ifdef RE_TRACK_PATTERN_OFFSETS
1959         else {
1960             DEBUG_r({
1961                 const  regnode *nop = NEXTOPER( convert );
1962                 mjd_offset= Node_Offset((nop));
1963                 mjd_nodelen= Node_Length((nop));
1964             });
1965         }
1966         DEBUG_OPTIMISE_r(
1967             PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1968                 (int)depth * 2 + 2, "",
1969                 (UV)mjd_offset, (UV)mjd_nodelen)
1970         );
1971 #endif
1972         /* But first we check to see if there is a common prefix we can
1973            split out as an EXACT and put in front of the TRIE node.  */
1974         trie->startstate= 1;
1975         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
1976             U32 state;
1977             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
1978                 U32 ofs = 0;
1979                 I32 idx = -1;
1980                 U32 count = 0;
1981                 const U32 base = trie->states[ state ].trans.base;
1982 
1983                 if ( trie->states[state].wordnum )
1984                         count = 1;
1985 
1986                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1987                     if ( ( base + ofs >= trie->uniquecharcount ) &&
1988                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1989                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1990                     {
1991                         if ( ++count > 1 ) {
1992                             SV **tmp = av_fetch( revcharmap, ofs, 0);
1993 			    const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
1994                             if ( state == 1 ) break;
1995                             if ( count == 2 ) {
1996                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1997                                 DEBUG_OPTIMISE_r(
1998                                     PerlIO_printf(Perl_debug_log,
1999 					"%*sNew Start State=%"UVuf" Class: [",
2000                                         (int)depth * 2 + 2, "",
2001                                         (UV)state));
2002 				if (idx >= 0) {
2003 				    SV ** const tmp = av_fetch( revcharmap, idx, 0);
2004 				    const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2005 
2006                                     TRIE_BITMAP_SET(trie,*ch);
2007                                     if ( folder )
2008                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2009                                     DEBUG_OPTIMISE_r(
2010                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2011                                     );
2012 				}
2013 			    }
2014 			    TRIE_BITMAP_SET(trie,*ch);
2015 			    if ( folder )
2016 				TRIE_BITMAP_SET(trie,folder[ *ch ]);
2017 			    DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2018 			}
2019                         idx = ofs;
2020 		    }
2021                 }
2022                 if ( count == 1 ) {
2023                     SV **tmp = av_fetch( revcharmap, idx, 0);
2024                     STRLEN len;
2025                     char *ch = SvPV( *tmp, len );
2026                     DEBUG_OPTIMISE_r({
2027                         SV *sv=sv_newmortal();
2028                         PerlIO_printf( Perl_debug_log,
2029 			    "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2030                             (int)depth * 2 + 2, "",
2031                             (UV)state, (UV)idx,
2032                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2033 	                        PL_colors[0], PL_colors[1],
2034 	                        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2035 	                        PERL_PV_ESCAPE_FIRSTCHAR
2036                             )
2037                         );
2038                     });
2039                     if ( state==1 ) {
2040                         OP( convert ) = nodetype;
2041                         str=STRING(convert);
2042                         STR_LEN(convert)=0;
2043                     }
2044                     STR_LEN(convert) += len;
2045                     while (len--)
2046                         *str++ = *ch++;
2047 		} else {
2048 #ifdef DEBUGGING
2049 		    if (state>1)
2050 			DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2051 #endif
2052 		    break;
2053 		}
2054 	    }
2055             if (str) {
2056                 regnode *n = convert+NODE_SZ_STR(convert);
2057                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2058                 trie->startstate = state;
2059                 trie->minlen -= (state - 1);
2060                 trie->maxlen -= (state - 1);
2061 #ifdef DEBUGGING
2062                /* At least the UNICOS C compiler choked on this
2063                 * being argument to DEBUG_r(), so let's just have
2064                 * it right here. */
2065                if (
2066 #ifdef PERL_EXT_RE_BUILD
2067                    1
2068 #else
2069                    DEBUG_r_TEST
2070 #endif
2071                    ) {
2072                    regnode *fix = convert;
2073                    U32 word = trie->wordcount;
2074                    mjd_nodelen++;
2075                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2076                    while( ++fix < n ) {
2077                        Set_Node_Offset_Length(fix, 0, 0);
2078                    }
2079                    while (word--) {
2080                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2081                        if (tmp) {
2082                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2083                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2084                            else
2085                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2086                        }
2087                    }
2088                }
2089 #endif
2090                 if (trie->maxlen) {
2091                     convert = n;
2092 		} else {
2093                     NEXT_OFF(convert) = (U16)(tail - convert);
2094                     DEBUG_r(optimize= n);
2095                 }
2096             }
2097         }
2098         if (!jumper)
2099             jumper = last;
2100         if ( trie->maxlen ) {
2101 	    NEXT_OFF( convert ) = (U16)(tail - convert);
2102 	    ARG_SET( convert, data_slot );
2103 	    /* Store the offset to the first unabsorbed branch in
2104 	       jump[0], which is otherwise unused by the jump logic.
2105 	       We use this when dumping a trie and during optimisation. */
2106 	    if (trie->jump)
2107 	        trie->jump[0] = (U16)(nextbranch - convert);
2108 
2109             /* XXXX */
2110             if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
2111                  ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2112             {
2113                 OP( convert ) = TRIEC;
2114                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2115                 PerlMemShared_free(trie->bitmap);
2116                 trie->bitmap= NULL;
2117             } else
2118                 OP( convert ) = TRIE;
2119 
2120             /* store the type in the flags */
2121             convert->flags = nodetype;
2122             DEBUG_r({
2123             optimize = convert
2124                       + NODE_STEP_REGNODE
2125                       + regarglen[ OP( convert ) ];
2126             });
2127             /* XXX We really should free up the resource in trie now,
2128                    as we won't use them - (which resources?) dmq */
2129         }
2130         /* needed for dumping*/
2131         DEBUG_r(if (optimize) {
2132             regnode *opt = convert;
2133 
2134             while ( ++opt < optimize) {
2135                 Set_Node_Offset_Length(opt,0,0);
2136             }
2137             /*
2138                 Try to clean up some of the debris left after the
2139                 optimisation.
2140              */
2141             while( optimize < jumper ) {
2142                 mjd_nodelen += Node_Length((optimize));
2143                 OP( optimize ) = OPTIMIZED;
2144                 Set_Node_Offset_Length(optimize,0,0);
2145                 optimize++;
2146             }
2147             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2148         });
2149     } /* end node insert */
2150     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2151 #ifdef DEBUGGING
2152     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2153     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2154 #else
2155     SvREFCNT_dec(revcharmap);
2156 #endif
2157     return trie->jump
2158            ? MADE_JUMP_TRIE
2159            : trie->startstate>1
2160              ? MADE_EXACT_TRIE
2161              : MADE_TRIE;
2162 }
2163 
2164 STATIC void
2165 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2166 {
2167 /* The Trie is constructed and compressed now so we can build a fail array now if its needed
2168 
2169    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2170    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2171    ISBN 0-201-10088-6
2172 
2173    We find the fail state for each state in the trie, this state is the longest proper
2174    suffix of the current states 'word' that is also a proper prefix of another word in our
2175    trie. State 1 represents the word '' and is the thus the default fail state. This allows
2176    the DFA not to have to restart after its tried and failed a word at a given point, it
2177    simply continues as though it had been matching the other word in the first place.
2178    Consider
2179       'abcdgu'=~/abcdefg|cdgu/
2180    When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2181    fail, which would bring use to the state representing 'd' in the second word where we would
2182    try 'g' and succeed, prodceding to match 'cdgu'.
2183  */
2184  /* add a fail transition */
2185     const U32 trie_offset = ARG(source);
2186     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2187     U32 *q;
2188     const U32 ucharcount = trie->uniquecharcount;
2189     const U32 numstates = trie->statecount;
2190     const U32 ubound = trie->lasttrans + ucharcount;
2191     U32 q_read = 0;
2192     U32 q_write = 0;
2193     U32 charid;
2194     U32 base = trie->states[ 1 ].trans.base;
2195     U32 *fail;
2196     reg_ac_data *aho;
2197     const U32 data_slot = add_data( pRExC_state, 1, "T" );
2198     GET_RE_DEBUG_FLAGS_DECL;
2199 
2200     PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2201 #ifndef DEBUGGING
2202     PERL_UNUSED_ARG(depth);
2203 #endif
2204 
2205 
2206     ARG_SET( stclass, data_slot );
2207     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2208     RExC_rxi->data->data[ data_slot ] = (void*)aho;
2209     aho->trie=trie_offset;
2210     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2211     Copy( trie->states, aho->states, numstates, reg_trie_state );
2212     Newxz( q, numstates, U32);
2213     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2214     aho->refcount = 1;
2215     fail = aho->fail;
2216     /* initialize fail[0..1] to be 1 so that we always have
2217        a valid final fail state */
2218     fail[ 0 ] = fail[ 1 ] = 1;
2219 
2220     for ( charid = 0; charid < ucharcount ; charid++ ) {
2221 	const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2222 	if ( newstate ) {
2223             q[ q_write ] = newstate;
2224             /* set to point at the root */
2225             fail[ q[ q_write++ ] ]=1;
2226         }
2227     }
2228     while ( q_read < q_write) {
2229 	const U32 cur = q[ q_read++ % numstates ];
2230         base = trie->states[ cur ].trans.base;
2231 
2232         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2233 	    const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2234 	    if (ch_state) {
2235                 U32 fail_state = cur;
2236                 U32 fail_base;
2237                 do {
2238                     fail_state = fail[ fail_state ];
2239                     fail_base = aho->states[ fail_state ].trans.base;
2240                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2241 
2242                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2243                 fail[ ch_state ] = fail_state;
2244                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2245                 {
2246                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2247                 }
2248                 q[ q_write++ % numstates] = ch_state;
2249             }
2250         }
2251     }
2252     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2253        when we fail in state 1, this allows us to use the
2254        charclass scan to find a valid start char. This is based on the principle
2255        that theres a good chance the string being searched contains lots of stuff
2256        that cant be a start char.
2257      */
2258     fail[ 0 ] = fail[ 1 ] = 0;
2259     DEBUG_TRIE_COMPILE_r({
2260         PerlIO_printf(Perl_debug_log,
2261 		      "%*sStclass Failtable (%"UVuf" states): 0",
2262 		      (int)(depth * 2), "", (UV)numstates
2263         );
2264         for( q_read=1; q_read<numstates; q_read++ ) {
2265             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2266         }
2267         PerlIO_printf(Perl_debug_log, "\n");
2268     });
2269     Safefree(q);
2270     /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2271 }
2272 
2273 
2274 /*
2275  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2276  * These need to be revisited when a newer toolchain becomes available.
2277  */
2278 #if defined(__sparc64__) && defined(__GNUC__)
2279 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2280 #       undef  SPARC64_GCC_WORKAROUND
2281 #       define SPARC64_GCC_WORKAROUND 1
2282 #   endif
2283 #endif
2284 
2285 #define DEBUG_PEEP(str,scan,depth) \
2286     DEBUG_OPTIMISE_r({if (scan){ \
2287        SV * const mysv=sv_newmortal(); \
2288        regnode *Next = regnext(scan); \
2289        regprop(RExC_rx, mysv, scan); \
2290        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2291        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2292        Next ? (REG_NODE_NUM(Next)) : 0 ); \
2293    }});
2294 
2295 
2296 
2297 
2298 
2299 #define JOIN_EXACT(scan,min,flags) \
2300     if (PL_regkind[OP(scan)] == EXACT) \
2301         join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2302 
2303 STATIC U32
2304 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2305     /* Merge several consecutive EXACTish nodes into one. */
2306     regnode *n = regnext(scan);
2307     U32 stringok = 1;
2308     regnode *next = scan + NODE_SZ_STR(scan);
2309     U32 merged = 0;
2310     U32 stopnow = 0;
2311 #ifdef DEBUGGING
2312     regnode *stop = scan;
2313     GET_RE_DEBUG_FLAGS_DECL;
2314 #else
2315     PERL_UNUSED_ARG(depth);
2316 #endif
2317 
2318     PERL_ARGS_ASSERT_JOIN_EXACT;
2319 #ifndef EXPERIMENTAL_INPLACESCAN
2320     PERL_UNUSED_ARG(flags);
2321     PERL_UNUSED_ARG(val);
2322 #endif
2323     DEBUG_PEEP("join",scan,depth);
2324 
2325     /* Skip NOTHING, merge EXACT*. */
2326     while (n &&
2327            ( PL_regkind[OP(n)] == NOTHING ||
2328              (stringok && (OP(n) == OP(scan))))
2329            && NEXT_OFF(n)
2330            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2331 
2332         if (OP(n) == TAIL || n > next)
2333             stringok = 0;
2334         if (PL_regkind[OP(n)] == NOTHING) {
2335             DEBUG_PEEP("skip:",n,depth);
2336             NEXT_OFF(scan) += NEXT_OFF(n);
2337             next = n + NODE_STEP_REGNODE;
2338 #ifdef DEBUGGING
2339             if (stringok)
2340                 stop = n;
2341 #endif
2342             n = regnext(n);
2343         }
2344         else if (stringok) {
2345             const unsigned int oldl = STR_LEN(scan);
2346             regnode * const nnext = regnext(n);
2347 
2348             DEBUG_PEEP("merg",n,depth);
2349 
2350             merged++;
2351             if (oldl + STR_LEN(n) > U8_MAX)
2352                 break;
2353             NEXT_OFF(scan) += NEXT_OFF(n);
2354             STR_LEN(scan) += STR_LEN(n);
2355             next = n + NODE_SZ_STR(n);
2356             /* Now we can overwrite *n : */
2357             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2358 #ifdef DEBUGGING
2359             stop = next - 1;
2360 #endif
2361             n = nnext;
2362             if (stopnow) break;
2363         }
2364 
2365 #ifdef EXPERIMENTAL_INPLACESCAN
2366 	if (flags && !NEXT_OFF(n)) {
2367 	    DEBUG_PEEP("atch", val, depth);
2368 	    if (reg_off_by_arg[OP(n)]) {
2369 		ARG_SET(n, val - n);
2370 	    }
2371 	    else {
2372 		NEXT_OFF(n) = val - n;
2373 	    }
2374 	    stopnow = 1;
2375 	}
2376 #endif
2377     }
2378 
2379     if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2380     /*
2381     Two problematic code points in Unicode casefolding of EXACT nodes:
2382 
2383     U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2384     U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2385 
2386     which casefold to
2387 
2388     Unicode                      UTF-8
2389 
2390     U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
2391     U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
2392 
2393     This means that in case-insensitive matching (or "loose matching",
2394     as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2395     length of the above casefolded versions) can match a target string
2396     of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2397     This would rather mess up the minimum length computation.
2398 
2399     What we'll do is to look for the tail four bytes, and then peek
2400     at the preceding two bytes to see whether we need to decrease
2401     the minimum length by four (six minus two).
2402 
2403     Thanks to the design of UTF-8, there cannot be false matches:
2404     A sequence of valid UTF-8 bytes cannot be a subsequence of
2405     another valid sequence of UTF-8 bytes.
2406 
2407     */
2408          char * const s0 = STRING(scan), *s, *t;
2409          char * const s1 = s0 + STR_LEN(scan) - 1;
2410          char * const s2 = s1 - 4;
2411 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2412 	 const char t0[] = "\xaf\x49\xaf\x42";
2413 #else
2414          const char t0[] = "\xcc\x88\xcc\x81";
2415 #endif
2416          const char * const t1 = t0 + 3;
2417 
2418          for (s = s0 + 2;
2419               s < s2 && (t = ninstr(s, s1, t0, t1));
2420               s = t + 4) {
2421 #ifdef EBCDIC
2422 	      if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2423 		  ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2424 #else
2425               if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2426                   ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2427 #endif
2428                    *min -= 4;
2429          }
2430     }
2431 
2432 #ifdef DEBUGGING
2433     /* Allow dumping */
2434     n = scan + NODE_SZ_STR(scan);
2435     while (n <= stop) {
2436         if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2437             OP(n) = OPTIMIZED;
2438             NEXT_OFF(n) = 0;
2439         }
2440         n++;
2441     }
2442 #endif
2443     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2444     return stopnow;
2445 }
2446 
2447 /* REx optimizer.  Converts nodes into quickier variants "in place".
2448    Finds fixed substrings.  */
2449 
2450 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2451    to the position after last scanned or to NULL. */
2452 
2453 #define INIT_AND_WITHP \
2454     assert(!and_withp); \
2455     Newx(and_withp,1,struct regnode_charclass_class); \
2456     SAVEFREEPV(and_withp)
2457 
2458 /* this is a chain of data about sub patterns we are processing that
2459    need to be handled seperately/specially in study_chunk. Its so
2460    we can simulate recursion without losing state.  */
2461 struct scan_frame;
2462 typedef struct scan_frame {
2463     regnode *last;  /* last node to process in this frame */
2464     regnode *next;  /* next node to process when last is reached */
2465     struct scan_frame *prev; /*previous frame*/
2466     I32 stop; /* what stopparen do we use */
2467 } scan_frame;
2468 
2469 
2470 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2471 
2472 #define CASE_SYNST_FNC(nAmE)                                       \
2473 case nAmE:                                                         \
2474     if (flags & SCF_DO_STCLASS_AND) {                              \
2475 	    for (value = 0; value < 256; value++)                  \
2476 		if (!is_ ## nAmE ## _cp(value))                       \
2477 		    ANYOF_BITMAP_CLEAR(data->start_class, value);  \
2478     }                                                              \
2479     else {                                                         \
2480 	    for (value = 0; value < 256; value++)                  \
2481 		if (is_ ## nAmE ## _cp(value))                        \
2482 		    ANYOF_BITMAP_SET(data->start_class, value);	   \
2483     }                                                              \
2484     break;                                                         \
2485 case N ## nAmE:                                                    \
2486     if (flags & SCF_DO_STCLASS_AND) {                              \
2487 	    for (value = 0; value < 256; value++)                   \
2488 		if (is_ ## nAmE ## _cp(value))                         \
2489 		    ANYOF_BITMAP_CLEAR(data->start_class, value);   \
2490     }                                                               \
2491     else {                                                          \
2492 	    for (value = 0; value < 256; value++)                   \
2493 		if (!is_ ## nAmE ## _cp(value))                        \
2494 		    ANYOF_BITMAP_SET(data->start_class, value);	    \
2495     }                                                               \
2496     break
2497 
2498 
2499 
2500 STATIC I32
2501 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2502                         I32 *minlenp, I32 *deltap,
2503 			regnode *last,
2504 			scan_data_t *data,
2505 			I32 stopparen,
2506 			U8* recursed,
2507 			struct regnode_charclass_class *and_withp,
2508 			U32 flags, U32 depth)
2509 			/* scanp: Start here (read-write). */
2510 			/* deltap: Write maxlen-minlen here. */
2511 			/* last: Stop before this one. */
2512 			/* data: string data about the pattern */
2513 			/* stopparen: treat close N as END */
2514 			/* recursed: which subroutines have we recursed into */
2515 			/* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2516 {
2517     dVAR;
2518     I32 min = 0, pars = 0, code;
2519     regnode *scan = *scanp, *next;
2520     I32 delta = 0;
2521     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2522     int is_inf_internal = 0;		/* The studied chunk is infinite */
2523     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2524     scan_data_t data_fake;
2525     SV *re_trie_maxbuff = NULL;
2526     regnode *first_non_open = scan;
2527     I32 stopmin = I32_MAX;
2528     scan_frame *frame = NULL;
2529     GET_RE_DEBUG_FLAGS_DECL;
2530 
2531     PERL_ARGS_ASSERT_STUDY_CHUNK;
2532 
2533 #ifdef DEBUGGING
2534     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2535 #endif
2536 
2537     if ( depth == 0 ) {
2538         while (first_non_open && OP(first_non_open) == OPEN)
2539             first_non_open=regnext(first_non_open);
2540     }
2541 
2542 
2543   fake_study_recurse:
2544     while ( scan && OP(scan) != END && scan < last ){
2545 	/* Peephole optimizer: */
2546 	DEBUG_STUDYDATA("Peep:", data,depth);
2547 	DEBUG_PEEP("Peep",scan,depth);
2548         JOIN_EXACT(scan,&min,0);
2549 
2550 	/* Follow the next-chain of the current node and optimize
2551 	   away all the NOTHINGs from it.  */
2552 	if (OP(scan) != CURLYX) {
2553 	    const int max = (reg_off_by_arg[OP(scan)]
2554 		       ? I32_MAX
2555 		       /* I32 may be smaller than U16 on CRAYs! */
2556 		       : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2557 	    int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2558 	    int noff;
2559 	    regnode *n = scan;
2560 
2561 	    /* Skip NOTHING and LONGJMP. */
2562 	    while ((n = regnext(n))
2563 		   && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2564 		       || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2565 		   && off + noff < max)
2566 		off += noff;
2567 	    if (reg_off_by_arg[OP(scan)])
2568 		ARG(scan) = off;
2569 	    else
2570 		NEXT_OFF(scan) = off;
2571 	}
2572 
2573 
2574 
2575 	/* The principal pseudo-switch.  Cannot be a switch, since we
2576 	   look into several different things.  */
2577 	if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2578 		   || OP(scan) == IFTHEN) {
2579 	    next = regnext(scan);
2580 	    code = OP(scan);
2581 	    /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2582 
2583 	    if (OP(next) == code || code == IFTHEN) {
2584 	        /* NOTE - There is similar code to this block below for handling
2585 	           TRIE nodes on a re-study.  If you change stuff here check there
2586 	           too. */
2587 		I32 max1 = 0, min1 = I32_MAX, num = 0;
2588 		struct regnode_charclass_class accum;
2589 		regnode * const startbranch=scan;
2590 
2591 		if (flags & SCF_DO_SUBSTR)
2592 		    SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2593 		if (flags & SCF_DO_STCLASS)
2594 		    cl_init_zero(pRExC_state, &accum);
2595 
2596 		while (OP(scan) == code) {
2597 		    I32 deltanext, minnext, f = 0, fake;
2598 		    struct regnode_charclass_class this_class;
2599 
2600 		    num++;
2601 		    data_fake.flags = 0;
2602 		    if (data) {
2603 			data_fake.whilem_c = data->whilem_c;
2604 			data_fake.last_closep = data->last_closep;
2605 		    }
2606 		    else
2607 			data_fake.last_closep = &fake;
2608 
2609 		    data_fake.pos_delta = delta;
2610 		    next = regnext(scan);
2611 		    scan = NEXTOPER(scan);
2612 		    if (code != BRANCH)
2613 			scan = NEXTOPER(scan);
2614 		    if (flags & SCF_DO_STCLASS) {
2615 			cl_init(pRExC_state, &this_class);
2616 			data_fake.start_class = &this_class;
2617 			f = SCF_DO_STCLASS_AND;
2618 		    }
2619 		    if (flags & SCF_WHILEM_VISITED_POS)
2620 			f |= SCF_WHILEM_VISITED_POS;
2621 
2622 		    /* we suppose the run is continuous, last=next...*/
2623 		    minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2624 					  next, &data_fake,
2625 					  stopparen, recursed, NULL, f,depth+1);
2626 		    if (min1 > minnext)
2627 			min1 = minnext;
2628 		    if (max1 < minnext + deltanext)
2629 			max1 = minnext + deltanext;
2630 		    if (deltanext == I32_MAX)
2631 			is_inf = is_inf_internal = 1;
2632 		    scan = next;
2633 		    if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2634 			pars++;
2635 	            if (data_fake.flags & SCF_SEEN_ACCEPT) {
2636 	                if ( stopmin > minnext)
2637 	                    stopmin = min + min1;
2638 	                flags &= ~SCF_DO_SUBSTR;
2639 	                if (data)
2640 	                    data->flags |= SCF_SEEN_ACCEPT;
2641 	            }
2642 		    if (data) {
2643 			if (data_fake.flags & SF_HAS_EVAL)
2644 			    data->flags |= SF_HAS_EVAL;
2645 			data->whilem_c = data_fake.whilem_c;
2646 		    }
2647 		    if (flags & SCF_DO_STCLASS)
2648 			cl_or(pRExC_state, &accum, &this_class);
2649 		}
2650 		if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2651 		    min1 = 0;
2652 		if (flags & SCF_DO_SUBSTR) {
2653 		    data->pos_min += min1;
2654 		    data->pos_delta += max1 - min1;
2655 		    if (max1 != min1 || is_inf)
2656 			data->longest = &(data->longest_float);
2657 		}
2658 		min += min1;
2659 		delta += max1 - min1;
2660 		if (flags & SCF_DO_STCLASS_OR) {
2661 		    cl_or(pRExC_state, data->start_class, &accum);
2662 		    if (min1) {
2663 			cl_and(data->start_class, and_withp);
2664 			flags &= ~SCF_DO_STCLASS;
2665 		    }
2666 		}
2667 		else if (flags & SCF_DO_STCLASS_AND) {
2668 		    if (min1) {
2669 			cl_and(data->start_class, &accum);
2670 			flags &= ~SCF_DO_STCLASS;
2671 		    }
2672 		    else {
2673 			/* Switch to OR mode: cache the old value of
2674 			 * data->start_class */
2675 			INIT_AND_WITHP;
2676 			StructCopy(data->start_class, and_withp,
2677 				   struct regnode_charclass_class);
2678 			flags &= ~SCF_DO_STCLASS_AND;
2679 			StructCopy(&accum, data->start_class,
2680 				   struct regnode_charclass_class);
2681 			flags |= SCF_DO_STCLASS_OR;
2682 			data->start_class->flags |= ANYOF_EOS;
2683 		    }
2684 		}
2685 
2686                 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2687 		/* demq.
2688 
2689 		   Assuming this was/is a branch we are dealing with: 'scan' now
2690 		   points at the item that follows the branch sequence, whatever
2691 		   it is. We now start at the beginning of the sequence and look
2692 		   for subsequences of
2693 
2694 		   BRANCH->EXACT=>x1
2695 		   BRANCH->EXACT=>x2
2696 		   tail
2697 
2698 		   which would be constructed from a pattern like /A|LIST|OF|WORDS/
2699 
2700 		   If we can find such a subseqence we need to turn the first
2701 		   element into a trie and then add the subsequent branch exact
2702 		   strings to the trie.
2703 
2704 		   We have two cases
2705 
2706 		     1. patterns where the whole set of branch can be converted.
2707 
2708 		     2. patterns where only a subset can be converted.
2709 
2710 		   In case 1 we can replace the whole set with a single regop
2711 		   for the trie. In case 2 we need to keep the start and end
2712 		   branchs so
2713 
2714 		     'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2715 		     becomes BRANCH TRIE; BRANCH X;
2716 
2717 		  There is an additional case, that being where there is a
2718 		  common prefix, which gets split out into an EXACT like node
2719 		  preceding the TRIE node.
2720 
2721 		  If x(1..n)==tail then we can do a simple trie, if not we make
2722 		  a "jump" trie, such that when we match the appropriate word
2723 		  we "jump" to the appopriate tail node. Essentailly we turn
2724 		  a nested if into a case structure of sorts.
2725 
2726 		*/
2727 
2728 		    int made=0;
2729 		    if (!re_trie_maxbuff) {
2730 			re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2731 			if (!SvIOK(re_trie_maxbuff))
2732 			    sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2733 		    }
2734                     if ( SvIV(re_trie_maxbuff)>=0  ) {
2735                         regnode *cur;
2736                         regnode *first = (regnode *)NULL;
2737                         regnode *last = (regnode *)NULL;
2738                         regnode *tail = scan;
2739                         U8 optype = 0;
2740                         U32 count=0;
2741 
2742 #ifdef DEBUGGING
2743                         SV * const mysv = sv_newmortal();       /* for dumping */
2744 #endif
2745                         /* var tail is used because there may be a TAIL
2746                            regop in the way. Ie, the exacts will point to the
2747                            thing following the TAIL, but the last branch will
2748                            point at the TAIL. So we advance tail. If we
2749                            have nested (?:) we may have to move through several
2750                            tails.
2751                          */
2752 
2753                         while ( OP( tail ) == TAIL ) {
2754                             /* this is the TAIL generated by (?:) */
2755                             tail = regnext( tail );
2756                         }
2757 
2758 
2759                         DEBUG_OPTIMISE_r({
2760                             regprop(RExC_rx, mysv, tail );
2761                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2762                                 (int)depth * 2 + 2, "",
2763                                 "Looking for TRIE'able sequences. Tail node is: ",
2764                                 SvPV_nolen_const( mysv )
2765                             );
2766                         });
2767 
2768                         /*
2769 
2770                            step through the branches, cur represents each
2771                            branch, noper is the first thing to be matched
2772                            as part of that branch and noper_next is the
2773                            regnext() of that node. if noper is an EXACT
2774                            and noper_next is the same as scan (our current
2775                            position in the regex) then the EXACT branch is
2776                            a possible optimization target. Once we have
2777                            two or more consequetive such branches we can
2778                            create a trie of the EXACT's contents and stich
2779                            it in place. If the sequence represents all of
2780                            the branches we eliminate the whole thing and
2781                            replace it with a single TRIE. If it is a
2782                            subsequence then we need to stitch it in. This
2783                            means the first branch has to remain, and needs
2784                            to be repointed at the item on the branch chain
2785                            following the last branch optimized. This could
2786                            be either a BRANCH, in which case the
2787                            subsequence is internal, or it could be the
2788                            item following the branch sequence in which
2789                            case the subsequence is at the end.
2790 
2791                         */
2792 
2793                         /* dont use tail as the end marker for this traverse */
2794                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2795                             regnode * const noper = NEXTOPER( cur );
2796 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2797                             regnode * const noper_next = regnext( noper );
2798 #endif
2799 
2800                             DEBUG_OPTIMISE_r({
2801                                 regprop(RExC_rx, mysv, cur);
2802                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2803                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2804 
2805                                 regprop(RExC_rx, mysv, noper);
2806                                 PerlIO_printf( Perl_debug_log, " -> %s",
2807                                     SvPV_nolen_const(mysv));
2808 
2809                                 if ( noper_next ) {
2810                                   regprop(RExC_rx, mysv, noper_next );
2811                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2812                                     SvPV_nolen_const(mysv));
2813                                 }
2814                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2815                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2816                             });
2817                             if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2818                                          : PL_regkind[ OP( noper ) ] == EXACT )
2819                                   || OP(noper) == NOTHING )
2820 #ifdef NOJUMPTRIE
2821                                   && noper_next == tail
2822 #endif
2823                                   && count < U16_MAX)
2824                             {
2825                                 count++;
2826                                 if ( !first || optype == NOTHING ) {
2827                                     if (!first) first = cur;
2828                                     optype = OP( noper );
2829                                 } else {
2830                                     last = cur;
2831                                 }
2832                             } else {
2833 /*
2834     Currently we do not believe that the trie logic can
2835     handle case insensitive matching properly when the
2836     pattern is not unicode (thus forcing unicode semantics).
2837 
2838     If/when this is fixed the following define can be swapped
2839     in below to fully enable trie logic.
2840 
2841 #define TRIE_TYPE_IS_SAFE 1
2842 
2843 */
2844 #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2845 
2846                                 if ( last && TRIE_TYPE_IS_SAFE ) {
2847                                     make_trie( pRExC_state,
2848                                             startbranch, first, cur, tail, count,
2849                                             optype, depth+1 );
2850                                 }
2851                                 if ( PL_regkind[ OP( noper ) ] == EXACT
2852 #ifdef NOJUMPTRIE
2853                                      && noper_next == tail
2854 #endif
2855                                 ){
2856                                     count = 1;
2857                                     first = cur;
2858                                     optype = OP( noper );
2859                                 } else {
2860                                     count = 0;
2861                                     first = NULL;
2862                                     optype = 0;
2863                                 }
2864                                 last = NULL;
2865                             }
2866                         }
2867                         DEBUG_OPTIMISE_r({
2868                             regprop(RExC_rx, mysv, cur);
2869                             PerlIO_printf( Perl_debug_log,
2870                               "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2871                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2872 
2873                         });
2874 
2875                         if ( last && TRIE_TYPE_IS_SAFE ) {
2876                             made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2877 #ifdef TRIE_STUDY_OPT
2878                             if ( ((made == MADE_EXACT_TRIE &&
2879                                  startbranch == first)
2880                                  || ( first_non_open == first )) &&
2881                                  depth==0 ) {
2882                                 flags |= SCF_TRIE_RESTUDY;
2883                                 if ( startbranch == first
2884                                      && scan == tail )
2885                                 {
2886                                     RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2887                                 }
2888                             }
2889 #endif
2890                         }
2891                     }
2892 
2893                 } /* do trie */
2894 
2895 	    }
2896 	    else if ( code == BRANCHJ ) {  /* single branch is optimized. */
2897 		scan = NEXTOPER(NEXTOPER(scan));
2898 	    } else			/* single branch is optimized. */
2899 		scan = NEXTOPER(scan);
2900 	    continue;
2901 	} else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2902 	    scan_frame *newframe = NULL;
2903 	    I32 paren;
2904 	    regnode *start;
2905 	    regnode *end;
2906 
2907 	    if (OP(scan) != SUSPEND) {
2908 	    /* set the pointer */
2909 	        if (OP(scan) == GOSUB) {
2910 	            paren = ARG(scan);
2911 	            RExC_recurse[ARG2L(scan)] = scan;
2912                     start = RExC_open_parens[paren-1];
2913                     end   = RExC_close_parens[paren-1];
2914                 } else {
2915                     paren = 0;
2916                     start = RExC_rxi->program + 1;
2917                     end   = RExC_opend;
2918                 }
2919                 if (!recursed) {
2920                     Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2921                     SAVEFREEPV(recursed);
2922                 }
2923                 if (!PAREN_TEST(recursed,paren+1)) {
2924 		    PAREN_SET(recursed,paren+1);
2925                     Newx(newframe,1,scan_frame);
2926                 } else {
2927                     if (flags & SCF_DO_SUBSTR) {
2928                         SCAN_COMMIT(pRExC_state,data,minlenp);
2929                         data->longest = &(data->longest_float);
2930                     }
2931                     is_inf = is_inf_internal = 1;
2932                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2933                         cl_anything(pRExC_state, data->start_class);
2934                     flags &= ~SCF_DO_STCLASS;
2935 	        }
2936             } else {
2937 	        Newx(newframe,1,scan_frame);
2938 	        paren = stopparen;
2939 	        start = scan+2;
2940 	        end = regnext(scan);
2941 	    }
2942 	    if (newframe) {
2943                 assert(start);
2944                 assert(end);
2945 	        SAVEFREEPV(newframe);
2946 	        newframe->next = regnext(scan);
2947 	        newframe->last = last;
2948 	        newframe->stop = stopparen;
2949 	        newframe->prev = frame;
2950 
2951 	        frame = newframe;
2952 	        scan =  start;
2953 	        stopparen = paren;
2954 	        last = end;
2955 
2956 	        continue;
2957 	    }
2958 	}
2959 	else if (OP(scan) == EXACT) {
2960 	    I32 l = STR_LEN(scan);
2961 	    UV uc;
2962 	    if (UTF) {
2963 		const U8 * const s = (U8*)STRING(scan);
2964 		l = utf8_length(s, s + l);
2965 		uc = utf8_to_uvchr(s, NULL);
2966 	    } else {
2967 		uc = *((U8*)STRING(scan));
2968 	    }
2969 	    min += l;
2970 	    if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2971 		/* The code below prefers earlier match for fixed
2972 		   offset, later match for variable offset.  */
2973 		if (data->last_end == -1) { /* Update the start info. */
2974 		    data->last_start_min = data->pos_min;
2975  		    data->last_start_max = is_inf
2976  			? I32_MAX : data->pos_min + data->pos_delta;
2977 		}
2978 		sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2979 		if (UTF)
2980 		    SvUTF8_on(data->last_found);
2981 		{
2982 		    SV * const sv = data->last_found;
2983 		    MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2984 			mg_find(sv, PERL_MAGIC_utf8) : NULL;
2985 		    if (mg && mg->mg_len >= 0)
2986 			mg->mg_len += utf8_length((U8*)STRING(scan),
2987 						  (U8*)STRING(scan)+STR_LEN(scan));
2988 		}
2989 		data->last_end = data->pos_min + l;
2990 		data->pos_min += l; /* As in the first entry. */
2991 		data->flags &= ~SF_BEFORE_EOL;
2992 	    }
2993 	    if (flags & SCF_DO_STCLASS_AND) {
2994 		/* Check whether it is compatible with what we know already! */
2995 		int compat = 1;
2996 
2997 		if (uc >= 0x100 ||
2998 		    (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2999 		    && !ANYOF_BITMAP_TEST(data->start_class, uc)
3000 		    && (!(data->start_class->flags & ANYOF_FOLD)
3001 			|| !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3002                     )
3003 		    compat = 0;
3004 		ANYOF_CLASS_ZERO(data->start_class);
3005 		ANYOF_BITMAP_ZERO(data->start_class);
3006 		if (compat)
3007 		    ANYOF_BITMAP_SET(data->start_class, uc);
3008 		data->start_class->flags &= ~ANYOF_EOS;
3009 		if (uc < 0x100)
3010 		  data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3011 	    }
3012 	    else if (flags & SCF_DO_STCLASS_OR) {
3013 		/* false positive possible if the class is case-folded */
3014 		if (uc < 0x100)
3015 		    ANYOF_BITMAP_SET(data->start_class, uc);
3016 		else
3017 		    data->start_class->flags |= ANYOF_UNICODE_ALL;
3018 		data->start_class->flags &= ~ANYOF_EOS;
3019 		cl_and(data->start_class, and_withp);
3020 	    }
3021 	    flags &= ~SCF_DO_STCLASS;
3022 	}
3023 	else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3024 	    I32 l = STR_LEN(scan);
3025 	    UV uc = *((U8*)STRING(scan));
3026 
3027 	    /* Search for fixed substrings supports EXACT only. */
3028 	    if (flags & SCF_DO_SUBSTR) {
3029 		assert(data);
3030 		SCAN_COMMIT(pRExC_state, data, minlenp);
3031 	    }
3032 	    if (UTF) {
3033 		const U8 * const s = (U8 *)STRING(scan);
3034 		l = utf8_length(s, s + l);
3035 		uc = utf8_to_uvchr(s, NULL);
3036 	    }
3037 	    min += l;
3038 	    if (flags & SCF_DO_SUBSTR)
3039 		data->pos_min += l;
3040 	    if (flags & SCF_DO_STCLASS_AND) {
3041 		/* Check whether it is compatible with what we know already! */
3042 		int compat = 1;
3043 
3044 		if (uc >= 0x100 ||
3045 		    (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3046 		    && !ANYOF_BITMAP_TEST(data->start_class, uc)
3047 		     && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3048 		    compat = 0;
3049 		ANYOF_CLASS_ZERO(data->start_class);
3050 		ANYOF_BITMAP_ZERO(data->start_class);
3051 		if (compat) {
3052 		    ANYOF_BITMAP_SET(data->start_class, uc);
3053 		    data->start_class->flags &= ~ANYOF_EOS;
3054 		    data->start_class->flags |= ANYOF_FOLD;
3055 		    if (OP(scan) == EXACTFL)
3056 			data->start_class->flags |= ANYOF_LOCALE;
3057 		}
3058 	    }
3059 	    else if (flags & SCF_DO_STCLASS_OR) {
3060 		if (data->start_class->flags & ANYOF_FOLD) {
3061 		    /* false positive possible if the class is case-folded.
3062 		       Assume that the locale settings are the same... */
3063 		    if (uc < 0x100)
3064 			ANYOF_BITMAP_SET(data->start_class, uc);
3065 		    data->start_class->flags &= ~ANYOF_EOS;
3066 		}
3067 		cl_and(data->start_class, and_withp);
3068 	    }
3069 	    flags &= ~SCF_DO_STCLASS;
3070 	}
3071 	else if (strchr((const char*)PL_varies,OP(scan))) {
3072 	    I32 mincount, maxcount, minnext, deltanext, fl = 0;
3073 	    I32 f = flags, pos_before = 0;
3074 	    regnode * const oscan = scan;
3075 	    struct regnode_charclass_class this_class;
3076 	    struct regnode_charclass_class *oclass = NULL;
3077 	    I32 next_is_eval = 0;
3078 
3079 	    switch (PL_regkind[OP(scan)]) {
3080 	    case WHILEM:		/* End of (?:...)* . */
3081 		scan = NEXTOPER(scan);
3082 		goto finish;
3083 	    case PLUS:
3084 		if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3085 		    next = NEXTOPER(scan);
3086 		    if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3087 			mincount = 1;
3088 			maxcount = REG_INFTY;
3089 			next = regnext(scan);
3090 			scan = NEXTOPER(scan);
3091 			goto do_curly;
3092 		    }
3093 		}
3094 		if (flags & SCF_DO_SUBSTR)
3095 		    data->pos_min++;
3096 		min++;
3097 		/* Fall through. */
3098 	    case STAR:
3099 		if (flags & SCF_DO_STCLASS) {
3100 		    mincount = 0;
3101 		    maxcount = REG_INFTY;
3102 		    next = regnext(scan);
3103 		    scan = NEXTOPER(scan);
3104 		    goto do_curly;
3105 		}
3106 		is_inf = is_inf_internal = 1;
3107 		scan = regnext(scan);
3108 		if (flags & SCF_DO_SUBSTR) {
3109 		    SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3110 		    data->longest = &(data->longest_float);
3111 		}
3112 		goto optimize_curly_tail;
3113 	    case CURLY:
3114 	        if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3115 	            && (scan->flags == stopparen))
3116 		{
3117 		    mincount = 1;
3118 		    maxcount = 1;
3119 		} else {
3120 		    mincount = ARG1(scan);
3121 		    maxcount = ARG2(scan);
3122 		}
3123 		next = regnext(scan);
3124 		if (OP(scan) == CURLYX) {
3125 		    I32 lp = (data ? *(data->last_closep) : 0);
3126 		    scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3127 		}
3128 		scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3129 		next_is_eval = (OP(scan) == EVAL);
3130 	      do_curly:
3131 		if (flags & SCF_DO_SUBSTR) {
3132 		    if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3133 		    pos_before = data->pos_min;
3134 		}
3135 		if (data) {
3136 		    fl = data->flags;
3137 		    data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3138 		    if (is_inf)
3139 			data->flags |= SF_IS_INF;
3140 		}
3141 		if (flags & SCF_DO_STCLASS) {
3142 		    cl_init(pRExC_state, &this_class);
3143 		    oclass = data->start_class;
3144 		    data->start_class = &this_class;
3145 		    f |= SCF_DO_STCLASS_AND;
3146 		    f &= ~SCF_DO_STCLASS_OR;
3147 		}
3148 		/* These are the cases when once a subexpression
3149 		   fails at a particular position, it cannot succeed
3150 		   even after backtracking at the enclosing scope.
3151 
3152 		   XXXX what if minimal match and we are at the
3153 		        initial run of {n,m}? */
3154 		if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3155 		    f &= ~SCF_WHILEM_VISITED_POS;
3156 
3157 		/* This will finish on WHILEM, setting scan, or on NULL: */
3158 		minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3159 		                      last, data, stopparen, recursed, NULL,
3160 				      (mincount == 0
3161 					? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3162 
3163 		if (flags & SCF_DO_STCLASS)
3164 		    data->start_class = oclass;
3165 		if (mincount == 0 || minnext == 0) {
3166 		    if (flags & SCF_DO_STCLASS_OR) {
3167 			cl_or(pRExC_state, data->start_class, &this_class);
3168 		    }
3169 		    else if (flags & SCF_DO_STCLASS_AND) {
3170 			/* Switch to OR mode: cache the old value of
3171 			 * data->start_class */
3172 			INIT_AND_WITHP;
3173 			StructCopy(data->start_class, and_withp,
3174 				   struct regnode_charclass_class);
3175 			flags &= ~SCF_DO_STCLASS_AND;
3176 			StructCopy(&this_class, data->start_class,
3177 				   struct regnode_charclass_class);
3178 			flags |= SCF_DO_STCLASS_OR;
3179 			data->start_class->flags |= ANYOF_EOS;
3180 		    }
3181 		} else {		/* Non-zero len */
3182 		    if (flags & SCF_DO_STCLASS_OR) {
3183 			cl_or(pRExC_state, data->start_class, &this_class);
3184 			cl_and(data->start_class, and_withp);
3185 		    }
3186 		    else if (flags & SCF_DO_STCLASS_AND)
3187 			cl_and(data->start_class, &this_class);
3188 		    flags &= ~SCF_DO_STCLASS;
3189 		}
3190 		if (!scan) 		/* It was not CURLYX, but CURLY. */
3191 		    scan = next;
3192 		if ( /* ? quantifier ok, except for (?{ ... }) */
3193 		    (next_is_eval || !(mincount == 0 && maxcount == 1))
3194 		    && (minnext == 0) && (deltanext == 0)
3195 		    && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3196 		    && maxcount <= REG_INFTY/3) /* Complement check for big count */
3197 		{
3198 		    ckWARNreg(RExC_parse,
3199 			      "Quantifier unexpected on zero-length expression");
3200 		}
3201 
3202 		min += minnext * mincount;
3203 		is_inf_internal |= ((maxcount == REG_INFTY
3204 				     && (minnext + deltanext) > 0)
3205 				    || deltanext == I32_MAX);
3206 		is_inf |= is_inf_internal;
3207 		delta += (minnext + deltanext) * maxcount - minnext * mincount;
3208 
3209 		/* Try powerful optimization CURLYX => CURLYN. */
3210 		if (  OP(oscan) == CURLYX && data
3211 		      && data->flags & SF_IN_PAR
3212 		      && !(data->flags & SF_HAS_EVAL)
3213 		      && !deltanext && minnext == 1 ) {
3214 		    /* Try to optimize to CURLYN.  */
3215 		    regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3216 		    regnode * const nxt1 = nxt;
3217 #ifdef DEBUGGING
3218 		    regnode *nxt2;
3219 #endif
3220 
3221 		    /* Skip open. */
3222 		    nxt = regnext(nxt);
3223 		    if (!strchr((const char*)PL_simple,OP(nxt))
3224 			&& !(PL_regkind[OP(nxt)] == EXACT
3225 			     && STR_LEN(nxt) == 1))
3226 			goto nogo;
3227 #ifdef DEBUGGING
3228 		    nxt2 = nxt;
3229 #endif
3230 		    nxt = regnext(nxt);
3231 		    if (OP(nxt) != CLOSE)
3232 			goto nogo;
3233 		    if (RExC_open_parens) {
3234 			RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3235 			RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3236 		    }
3237 		    /* Now we know that nxt2 is the only contents: */
3238 		    oscan->flags = (U8)ARG(nxt);
3239 		    OP(oscan) = CURLYN;
3240 		    OP(nxt1) = NOTHING;	/* was OPEN. */
3241 
3242 #ifdef DEBUGGING
3243 		    OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3244 		    NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3245 		    NEXT_OFF(nxt2) = 0;	/* just for consistancy with CURLY. */
3246 		    OP(nxt) = OPTIMIZED;	/* was CLOSE. */
3247 		    OP(nxt + 1) = OPTIMIZED; /* was count. */
3248 		    NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
3249 #endif
3250 		}
3251 	      nogo:
3252 
3253 		/* Try optimization CURLYX => CURLYM. */
3254 		if (  OP(oscan) == CURLYX && data
3255 		      && !(data->flags & SF_HAS_PAR)
3256 		      && !(data->flags & SF_HAS_EVAL)
3257 		      && !deltanext	/* atom is fixed width */
3258 		      && minnext != 0	/* CURLYM can't handle zero width */
3259 		) {
3260 		    /* XXXX How to optimize if data == 0? */
3261 		    /* Optimize to a simpler form.  */
3262 		    regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3263 		    regnode *nxt2;
3264 
3265 		    OP(oscan) = CURLYM;
3266 		    while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3267 			    && (OP(nxt2) != WHILEM))
3268 			nxt = nxt2;
3269 		    OP(nxt2)  = SUCCEED; /* Whas WHILEM */
3270 		    /* Need to optimize away parenths. */
3271 		    if (data->flags & SF_IN_PAR) {
3272 			/* Set the parenth number.  */
3273 			regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3274 
3275 			if (OP(nxt) != CLOSE)
3276 			    FAIL("Panic opt close");
3277 			oscan->flags = (U8)ARG(nxt);
3278 			if (RExC_open_parens) {
3279 			    RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3280 			    RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3281 			}
3282 			OP(nxt1) = OPTIMIZED;	/* was OPEN. */
3283 			OP(nxt) = OPTIMIZED;	/* was CLOSE. */
3284 
3285 #ifdef DEBUGGING
3286 			OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3287 			OP(nxt + 1) = OPTIMIZED; /* was count. */
3288 			NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3289 			NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
3290 #endif
3291 #if 0
3292 			while ( nxt1 && (OP(nxt1) != WHILEM)) {
3293 			    regnode *nnxt = regnext(nxt1);
3294 
3295 			    if (nnxt == nxt) {
3296 				if (reg_off_by_arg[OP(nxt1)])
3297 				    ARG_SET(nxt1, nxt2 - nxt1);
3298 				else if (nxt2 - nxt1 < U16_MAX)
3299 				    NEXT_OFF(nxt1) = nxt2 - nxt1;
3300 				else
3301 				    OP(nxt) = NOTHING;	/* Cannot beautify */
3302 			    }
3303 			    nxt1 = nnxt;
3304 			}
3305 #endif
3306 			/* Optimize again: */
3307 			study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3308 				    NULL, stopparen, recursed, NULL, 0,depth+1);
3309 		    }
3310 		    else
3311 			oscan->flags = 0;
3312 		}
3313 		else if ((OP(oscan) == CURLYX)
3314 			 && (flags & SCF_WHILEM_VISITED_POS)
3315 			 /* See the comment on a similar expression above.
3316 			    However, this time it not a subexpression
3317 			    we care about, but the expression itself. */
3318 			 && (maxcount == REG_INFTY)
3319 			 && data && ++data->whilem_c < 16) {
3320 		    /* This stays as CURLYX, we can put the count/of pair. */
3321 		    /* Find WHILEM (as in regexec.c) */
3322 		    regnode *nxt = oscan + NEXT_OFF(oscan);
3323 
3324 		    if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3325 			nxt += ARG(nxt);
3326 		    PREVOPER(nxt)->flags = (U8)(data->whilem_c
3327 			| (RExC_whilem_seen << 4)); /* On WHILEM */
3328 		}
3329 		if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3330 		    pars++;
3331 		if (flags & SCF_DO_SUBSTR) {
3332 		    SV *last_str = NULL;
3333 		    int counted = mincount != 0;
3334 
3335 		    if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3336 #if defined(SPARC64_GCC_WORKAROUND)
3337 			I32 b = 0;
3338 			STRLEN l = 0;
3339 			const char *s = NULL;
3340 			I32 old = 0;
3341 
3342 			if (pos_before >= data->last_start_min)
3343 			    b = pos_before;
3344 			else
3345 			    b = data->last_start_min;
3346 
3347 			l = 0;
3348 			s = SvPV_const(data->last_found, l);
3349 			old = b - data->last_start_min;
3350 
3351 #else
3352 			I32 b = pos_before >= data->last_start_min
3353 			    ? pos_before : data->last_start_min;
3354 			STRLEN l;
3355 			const char * const s = SvPV_const(data->last_found, l);
3356 			I32 old = b - data->last_start_min;
3357 #endif
3358 
3359 			if (UTF)
3360 			    old = utf8_hop((U8*)s, old) - (U8*)s;
3361 
3362 			l -= old;
3363 			/* Get the added string: */
3364 			last_str = newSVpvn_utf8(s  + old, l, UTF);
3365 			if (deltanext == 0 && pos_before == b) {
3366 			    /* What was added is a constant string */
3367 			    if (mincount > 1) {
3368 				SvGROW(last_str, (mincount * l) + 1);
3369 				repeatcpy(SvPVX(last_str) + l,
3370 					  SvPVX_const(last_str), l, mincount - 1);
3371 				SvCUR_set(last_str, SvCUR(last_str) * mincount);
3372 				/* Add additional parts. */
3373 				SvCUR_set(data->last_found,
3374 					  SvCUR(data->last_found) - l);
3375 				sv_catsv(data->last_found, last_str);
3376 				{
3377 				    SV * sv = data->last_found;
3378 				    MAGIC *mg =
3379 					SvUTF8(sv) && SvMAGICAL(sv) ?
3380 					mg_find(sv, PERL_MAGIC_utf8) : NULL;
3381 				    if (mg && mg->mg_len >= 0)
3382 					mg->mg_len += CHR_SVLEN(last_str) - l;
3383 				}
3384 				data->last_end += l * (mincount - 1);
3385 			    }
3386 			} else {
3387 			    /* start offset must point into the last copy */
3388 			    data->last_start_min += minnext * (mincount - 1);
3389 			    data->last_start_max += is_inf ? I32_MAX
3390 				: (maxcount - 1) * (minnext + data->pos_delta);
3391 			}
3392 		    }
3393 		    /* It is counted once already... */
3394 		    data->pos_min += minnext * (mincount - counted);
3395 		    data->pos_delta += - counted * deltanext +
3396 			(minnext + deltanext) * maxcount - minnext * mincount;
3397 		    if (mincount != maxcount) {
3398 			 /* Cannot extend fixed substrings found inside
3399 			    the group.  */
3400 			SCAN_COMMIT(pRExC_state,data,minlenp);
3401 			if (mincount && last_str) {
3402 			    SV * const sv = data->last_found;
3403 			    MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3404 				mg_find(sv, PERL_MAGIC_utf8) : NULL;
3405 
3406 			    if (mg)
3407 				mg->mg_len = -1;
3408 			    sv_setsv(sv, last_str);
3409 			    data->last_end = data->pos_min;
3410 			    data->last_start_min =
3411 				data->pos_min - CHR_SVLEN(last_str);
3412 			    data->last_start_max = is_inf
3413 				? I32_MAX
3414 				: data->pos_min + data->pos_delta
3415 				- CHR_SVLEN(last_str);
3416 			}
3417 			data->longest = &(data->longest_float);
3418 		    }
3419 		    SvREFCNT_dec(last_str);
3420 		}
3421 		if (data && (fl & SF_HAS_EVAL))
3422 		    data->flags |= SF_HAS_EVAL;
3423 	      optimize_curly_tail:
3424 		if (OP(oscan) != CURLYX) {
3425 		    while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3426 			   && NEXT_OFF(next))
3427 			NEXT_OFF(oscan) += NEXT_OFF(next);
3428 		}
3429 		continue;
3430 	    default:			/* REF and CLUMP only? */
3431 		if (flags & SCF_DO_SUBSTR) {
3432 		    SCAN_COMMIT(pRExC_state,data,minlenp);	/* Cannot expect anything... */
3433 		    data->longest = &(data->longest_float);
3434 		}
3435 		is_inf = is_inf_internal = 1;
3436 		if (flags & SCF_DO_STCLASS_OR)
3437 		    cl_anything(pRExC_state, data->start_class);
3438 		flags &= ~SCF_DO_STCLASS;
3439 		break;
3440 	    }
3441 	}
3442 	else if (OP(scan) == LNBREAK) {
3443 	    if (flags & SCF_DO_STCLASS) {
3444 		int value = 0;
3445 		data->start_class->flags &= ~ANYOF_EOS;	/* No match on empty */
3446     	        if (flags & SCF_DO_STCLASS_AND) {
3447                     for (value = 0; value < 256; value++)
3448                         if (!is_VERTWS_cp(value))
3449                             ANYOF_BITMAP_CLEAR(data->start_class, value);
3450                 }
3451                 else {
3452                     for (value = 0; value < 256; value++)
3453                         if (is_VERTWS_cp(value))
3454                             ANYOF_BITMAP_SET(data->start_class, value);
3455                 }
3456                 if (flags & SCF_DO_STCLASS_OR)
3457 		    cl_and(data->start_class, and_withp);
3458 		flags &= ~SCF_DO_STCLASS;
3459             }
3460 	    min += 1;
3461 	    delta += 1;
3462             if (flags & SCF_DO_SUBSTR) {
3463     	        SCAN_COMMIT(pRExC_state,data,minlenp);	/* Cannot expect anything... */
3464     	        data->pos_min += 1;
3465 	        data->pos_delta += 1;
3466 		data->longest = &(data->longest_float);
3467     	    }
3468 
3469 	}
3470 	else if (OP(scan) == FOLDCHAR) {
3471 	    int d = ARG(scan)==0xDF ? 1 : 2;
3472 	    flags &= ~SCF_DO_STCLASS;
3473             min += 1;
3474             delta += d;
3475             if (flags & SCF_DO_SUBSTR) {
3476 	        SCAN_COMMIT(pRExC_state,data,minlenp);	/* Cannot expect anything... */
3477 	        data->pos_min += 1;
3478 	        data->pos_delta += d;
3479 		data->longest = &(data->longest_float);
3480 	    }
3481 	}
3482 	else if (strchr((const char*)PL_simple,OP(scan))) {
3483 	    int value = 0;
3484 
3485 	    if (flags & SCF_DO_SUBSTR) {
3486 		SCAN_COMMIT(pRExC_state,data,minlenp);
3487 		data->pos_min++;
3488 	    }
3489 	    min++;
3490 	    if (flags & SCF_DO_STCLASS) {
3491 		data->start_class->flags &= ~ANYOF_EOS;	/* No match on empty */
3492 
3493 		/* Some of the logic below assumes that switching
3494 		   locale on will only add false positives. */
3495 		switch (PL_regkind[OP(scan)]) {
3496 		case SANY:
3497 		default:
3498 		  do_default:
3499 		    /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3500 		    if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3501 			cl_anything(pRExC_state, data->start_class);
3502 		    break;
3503 		case REG_ANY:
3504 		    if (OP(scan) == SANY)
3505 			goto do_default;
3506 		    if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3507 			value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3508 				 || (data->start_class->flags & ANYOF_CLASS));
3509 			cl_anything(pRExC_state, data->start_class);
3510 		    }
3511 		    if (flags & SCF_DO_STCLASS_AND || !value)
3512 			ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3513 		    break;
3514 		case ANYOF:
3515 		    if (flags & SCF_DO_STCLASS_AND)
3516 			cl_and(data->start_class,
3517 			       (struct regnode_charclass_class*)scan);
3518 		    else
3519 			cl_or(pRExC_state, data->start_class,
3520 			      (struct regnode_charclass_class*)scan);
3521 		    break;
3522 		case ALNUM:
3523 		    if (flags & SCF_DO_STCLASS_AND) {
3524 			if (!(data->start_class->flags & ANYOF_LOCALE)) {
3525 			    ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3526 			    for (value = 0; value < 256; value++)
3527 				if (!isALNUM(value))
3528 				    ANYOF_BITMAP_CLEAR(data->start_class, value);
3529 			}
3530 		    }
3531 		    else {
3532 			if (data->start_class->flags & ANYOF_LOCALE)
3533 			    ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3534 			else {
3535 			    for (value = 0; value < 256; value++)
3536 				if (isALNUM(value))
3537 				    ANYOF_BITMAP_SET(data->start_class, value);
3538 			}
3539 		    }
3540 		    break;
3541 		case ALNUML:
3542 		    if (flags & SCF_DO_STCLASS_AND) {
3543 			if (data->start_class->flags & ANYOF_LOCALE)
3544 			    ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3545 		    }
3546 		    else {
3547 			ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3548 			data->start_class->flags |= ANYOF_LOCALE;
3549 		    }
3550 		    break;
3551 		case NALNUM:
3552 		    if (flags & SCF_DO_STCLASS_AND) {
3553 			if (!(data->start_class->flags & ANYOF_LOCALE)) {
3554 			    ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3555 			    for (value = 0; value < 256; value++)
3556 				if (isALNUM(value))
3557 				    ANYOF_BITMAP_CLEAR(data->start_class, value);
3558 			}
3559 		    }
3560 		    else {
3561 			if (data->start_class->flags & ANYOF_LOCALE)
3562 			    ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3563 			else {
3564 			    for (value = 0; value < 256; value++)
3565 				if (!isALNUM(value))
3566 				    ANYOF_BITMAP_SET(data->start_class, value);
3567 			}
3568 		    }
3569 		    break;
3570 		case NALNUML:
3571 		    if (flags & SCF_DO_STCLASS_AND) {
3572 			if (data->start_class->flags & ANYOF_LOCALE)
3573 			    ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3574 		    }
3575 		    else {
3576 			data->start_class->flags |= ANYOF_LOCALE;
3577 			ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3578 		    }
3579 		    break;
3580 		case SPACE:
3581 		    if (flags & SCF_DO_STCLASS_AND) {
3582 			if (!(data->start_class->flags & ANYOF_LOCALE)) {
3583 			    ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3584 			    for (value = 0; value < 256; value++)
3585 				if (!isSPACE(value))
3586 				    ANYOF_BITMAP_CLEAR(data->start_class, value);
3587 			}
3588 		    }
3589 		    else {
3590 			if (data->start_class->flags & ANYOF_LOCALE)
3591 			    ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3592 			else {
3593 			    for (value = 0; value < 256; value++)
3594 				if (isSPACE(value))
3595 				    ANYOF_BITMAP_SET(data->start_class, value);
3596 			}
3597 		    }
3598 		    break;
3599 		case SPACEL:
3600 		    if (flags & SCF_DO_STCLASS_AND) {
3601 			if (data->start_class->flags & ANYOF_LOCALE)
3602 			    ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3603 		    }
3604 		    else {
3605 			data->start_class->flags |= ANYOF_LOCALE;
3606 			ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3607 		    }
3608 		    break;
3609 		case NSPACE:
3610 		    if (flags & SCF_DO_STCLASS_AND) {
3611 			if (!(data->start_class->flags & ANYOF_LOCALE)) {
3612 			    ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3613 			    for (value = 0; value < 256; value++)
3614 				if (isSPACE(value))
3615 				    ANYOF_BITMAP_CLEAR(data->start_class, value);
3616 			}
3617 		    }
3618 		    else {
3619 			if (data->start_class->flags & ANYOF_LOCALE)
3620 			    ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3621 			else {
3622 			    for (value = 0; value < 256; value++)
3623 				if (!isSPACE(value))
3624 				    ANYOF_BITMAP_SET(data->start_class, value);
3625 			}
3626 		    }
3627 		    break;
3628 		case NSPACEL:
3629 		    if (flags & SCF_DO_STCLASS_AND) {
3630 			if (data->start_class->flags & ANYOF_LOCALE) {
3631 			    ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3632 			    for (value = 0; value < 256; value++)
3633 				if (!isSPACE(value))
3634 				    ANYOF_BITMAP_CLEAR(data->start_class, value);
3635 			}
3636 		    }
3637 		    else {
3638 			data->start_class->flags |= ANYOF_LOCALE;
3639 			ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3640 		    }
3641 		    break;
3642 		case DIGIT:
3643 		    if (flags & SCF_DO_STCLASS_AND) {
3644 			ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3645 			for (value = 0; value < 256; value++)
3646 			    if (!isDIGIT(value))
3647 				ANYOF_BITMAP_CLEAR(data->start_class, value);
3648 		    }
3649 		    else {
3650 			if (data->start_class->flags & ANYOF_LOCALE)
3651 			    ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3652 			else {
3653 			    for (value = 0; value < 256; value++)
3654 				if (isDIGIT(value))
3655 				    ANYOF_BITMAP_SET(data->start_class, value);
3656 			}
3657 		    }
3658 		    break;
3659 		case NDIGIT:
3660 		    if (flags & SCF_DO_STCLASS_AND) {
3661 			ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3662 			for (value = 0; value < 256; value++)
3663 			    if (isDIGIT(value))
3664 				ANYOF_BITMAP_CLEAR(data->start_class, value);
3665 		    }
3666 		    else {
3667 			if (data->start_class->flags & ANYOF_LOCALE)
3668 			    ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3669 			else {
3670 			    for (value = 0; value < 256; value++)
3671 				if (!isDIGIT(value))
3672 				    ANYOF_BITMAP_SET(data->start_class, value);
3673 			}
3674 		    }
3675 		    break;
3676 		CASE_SYNST_FNC(VERTWS);
3677 		CASE_SYNST_FNC(HORIZWS);
3678 
3679 		}
3680 		if (flags & SCF_DO_STCLASS_OR)
3681 		    cl_and(data->start_class, and_withp);
3682 		flags &= ~SCF_DO_STCLASS;
3683 	    }
3684 	}
3685 	else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3686 	    data->flags |= (OP(scan) == MEOL
3687 			    ? SF_BEFORE_MEOL
3688 			    : SF_BEFORE_SEOL);
3689 	}
3690 	else if (  PL_regkind[OP(scan)] == BRANCHJ
3691 		 /* Lookbehind, or need to calculate parens/evals/stclass: */
3692 		   && (scan->flags || data || (flags & SCF_DO_STCLASS))
3693 		   && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3694             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3695                 || OP(scan) == UNLESSM )
3696             {
3697                 /* Negative Lookahead/lookbehind
3698                    In this case we can't do fixed string optimisation.
3699                 */
3700 
3701                 I32 deltanext, minnext, fake = 0;
3702                 regnode *nscan;
3703                 struct regnode_charclass_class intrnl;
3704                 int f = 0;
3705 
3706                 data_fake.flags = 0;
3707                 if (data) {
3708                     data_fake.whilem_c = data->whilem_c;
3709                     data_fake.last_closep = data->last_closep;
3710 		}
3711                 else
3712                     data_fake.last_closep = &fake;
3713 		data_fake.pos_delta = delta;
3714                 if ( flags & SCF_DO_STCLASS && !scan->flags
3715                      && OP(scan) == IFMATCH ) { /* Lookahead */
3716                     cl_init(pRExC_state, &intrnl);
3717                     data_fake.start_class = &intrnl;
3718                     f |= SCF_DO_STCLASS_AND;
3719 		}
3720                 if (flags & SCF_WHILEM_VISITED_POS)
3721                     f |= SCF_WHILEM_VISITED_POS;
3722                 next = regnext(scan);
3723                 nscan = NEXTOPER(NEXTOPER(scan));
3724                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3725                     last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3726                 if (scan->flags) {
3727                     if (deltanext) {
3728 			FAIL("Variable length lookbehind not implemented");
3729                     }
3730                     else if (minnext > (I32)U8_MAX) {
3731 			FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3732                     }
3733                     scan->flags = (U8)minnext;
3734                 }
3735                 if (data) {
3736                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3737                         pars++;
3738                     if (data_fake.flags & SF_HAS_EVAL)
3739                         data->flags |= SF_HAS_EVAL;
3740                     data->whilem_c = data_fake.whilem_c;
3741                 }
3742                 if (f & SCF_DO_STCLASS_AND) {
3743 		    if (flags & SCF_DO_STCLASS_OR) {
3744 			/* OR before, AND after: ideally we would recurse with
3745 			 * data_fake to get the AND applied by study of the
3746 			 * remainder of the pattern, and then derecurse;
3747 			 * *** HACK *** for now just treat as "no information".
3748 			 * See [perl #56690].
3749 			 */
3750 			cl_init(pRExC_state, data->start_class);
3751 		    }  else {
3752 			/* AND before and after: combine and continue */
3753 			const int was = (data->start_class->flags & ANYOF_EOS);
3754 
3755 			cl_and(data->start_class, &intrnl);
3756 			if (was)
3757 			    data->start_class->flags |= ANYOF_EOS;
3758 		    }
3759                 }
3760 	    }
3761 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3762             else {
3763                 /* Positive Lookahead/lookbehind
3764                    In this case we can do fixed string optimisation,
3765                    but we must be careful about it. Note in the case of
3766                    lookbehind the positions will be offset by the minimum
3767                    length of the pattern, something we won't know about
3768                    until after the recurse.
3769                 */
3770                 I32 deltanext, fake = 0;
3771                 regnode *nscan;
3772                 struct regnode_charclass_class intrnl;
3773                 int f = 0;
3774                 /* We use SAVEFREEPV so that when the full compile
3775                     is finished perl will clean up the allocated
3776                     minlens when its all done. This was we don't
3777                     have to worry about freeing them when we know
3778                     they wont be used, which would be a pain.
3779                  */
3780                 I32 *minnextp;
3781                 Newx( minnextp, 1, I32 );
3782                 SAVEFREEPV(minnextp);
3783 
3784                 if (data) {
3785                     StructCopy(data, &data_fake, scan_data_t);
3786                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3787                         f |= SCF_DO_SUBSTR;
3788                         if (scan->flags)
3789                             SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
3790                         data_fake.last_found=newSVsv(data->last_found);
3791                     }
3792                 }
3793                 else
3794                     data_fake.last_closep = &fake;
3795                 data_fake.flags = 0;
3796 		data_fake.pos_delta = delta;
3797                 if (is_inf)
3798 	            data_fake.flags |= SF_IS_INF;
3799                 if ( flags & SCF_DO_STCLASS && !scan->flags
3800                      && OP(scan) == IFMATCH ) { /* Lookahead */
3801                     cl_init(pRExC_state, &intrnl);
3802                     data_fake.start_class = &intrnl;
3803                     f |= SCF_DO_STCLASS_AND;
3804                 }
3805                 if (flags & SCF_WHILEM_VISITED_POS)
3806                     f |= SCF_WHILEM_VISITED_POS;
3807                 next = regnext(scan);
3808                 nscan = NEXTOPER(NEXTOPER(scan));
3809 
3810                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3811                     last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3812                 if (scan->flags) {
3813                     if (deltanext) {
3814 			FAIL("Variable length lookbehind not implemented");
3815                     }
3816                     else if (*minnextp > (I32)U8_MAX) {
3817 			FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3818                     }
3819                     scan->flags = (U8)*minnextp;
3820                 }
3821 
3822                 *minnextp += min;
3823 
3824                 if (f & SCF_DO_STCLASS_AND) {
3825                     const int was = (data->start_class->flags & ANYOF_EOS);
3826 
3827                     cl_and(data->start_class, &intrnl);
3828                     if (was)
3829                         data->start_class->flags |= ANYOF_EOS;
3830                 }
3831                 if (data) {
3832                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3833                         pars++;
3834                     if (data_fake.flags & SF_HAS_EVAL)
3835                         data->flags |= SF_HAS_EVAL;
3836                     data->whilem_c = data_fake.whilem_c;
3837                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3838                         if (RExC_rx->minlen<*minnextp)
3839                             RExC_rx->minlen=*minnextp;
3840                         SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
3841                         SvREFCNT_dec(data_fake.last_found);
3842 
3843                         if ( data_fake.minlen_fixed != minlenp )
3844                         {
3845                             data->offset_fixed= data_fake.offset_fixed;
3846                             data->minlen_fixed= data_fake.minlen_fixed;
3847                             data->lookbehind_fixed+= scan->flags;
3848                         }
3849                         if ( data_fake.minlen_float != minlenp )
3850                         {
3851                             data->minlen_float= data_fake.minlen_float;
3852                             data->offset_float_min=data_fake.offset_float_min;
3853                             data->offset_float_max=data_fake.offset_float_max;
3854                             data->lookbehind_float+= scan->flags;
3855                         }
3856                     }
3857                 }
3858 
3859 
3860 	    }
3861 #endif
3862 	}
3863 	else if (OP(scan) == OPEN) {
3864 	    if (stopparen != (I32)ARG(scan))
3865 	        pars++;
3866 	}
3867 	else if (OP(scan) == CLOSE) {
3868 	    if (stopparen == (I32)ARG(scan)) {
3869 	        break;
3870 	    }
3871 	    if ((I32)ARG(scan) == is_par) {
3872 		next = regnext(scan);
3873 
3874 		if ( next && (OP(next) != WHILEM) && next < last)
3875 		    is_par = 0;		/* Disable optimization */
3876 	    }
3877 	    if (data)
3878 		*(data->last_closep) = ARG(scan);
3879 	}
3880 	else if (OP(scan) == EVAL) {
3881 		if (data)
3882 		    data->flags |= SF_HAS_EVAL;
3883 	}
3884 	else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3885 	    if (flags & SCF_DO_SUBSTR) {
3886 		SCAN_COMMIT(pRExC_state,data,minlenp);
3887 		flags &= ~SCF_DO_SUBSTR;
3888 	    }
3889 	    if (data && OP(scan)==ACCEPT) {
3890 	        data->flags |= SCF_SEEN_ACCEPT;
3891 	        if (stopmin > min)
3892 	            stopmin = min;
3893 	    }
3894 	}
3895 	else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3896 	{
3897 		if (flags & SCF_DO_SUBSTR) {
3898 		    SCAN_COMMIT(pRExC_state,data,minlenp);
3899 		    data->longest = &(data->longest_float);
3900 		}
3901 		is_inf = is_inf_internal = 1;
3902 		if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3903 		    cl_anything(pRExC_state, data->start_class);
3904 		flags &= ~SCF_DO_STCLASS;
3905 	}
3906 	else if (OP(scan) == GPOS) {
3907 	    if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
3908 	        !(delta || is_inf || (data && data->pos_delta)))
3909 	    {
3910 	        if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3911 		    RExC_rx->extflags |= RXf_ANCH_GPOS;
3912 	        if (RExC_rx->gofs < (U32)min)
3913 		    RExC_rx->gofs = min;
3914             } else {
3915                 RExC_rx->extflags |= RXf_GPOS_FLOAT;
3916                 RExC_rx->gofs = 0;
3917             }
3918 	}
3919 #ifdef TRIE_STUDY_OPT
3920 #ifdef FULL_TRIE_STUDY
3921         else if (PL_regkind[OP(scan)] == TRIE) {
3922             /* NOTE - There is similar code to this block above for handling
3923                BRANCH nodes on the initial study.  If you change stuff here
3924                check there too. */
3925             regnode *trie_node= scan;
3926             regnode *tail= regnext(scan);
3927             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
3928             I32 max1 = 0, min1 = I32_MAX;
3929             struct regnode_charclass_class accum;
3930 
3931             if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3932                 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
3933             if (flags & SCF_DO_STCLASS)
3934                 cl_init_zero(pRExC_state, &accum);
3935 
3936             if (!trie->jump) {
3937                 min1= trie->minlen;
3938                 max1= trie->maxlen;
3939             } else {
3940                 const regnode *nextbranch= NULL;
3941                 U32 word;
3942 
3943                 for ( word=1 ; word <= trie->wordcount ; word++)
3944                 {
3945                     I32 deltanext=0, minnext=0, f = 0, fake;
3946                     struct regnode_charclass_class this_class;
3947 
3948                     data_fake.flags = 0;
3949                     if (data) {
3950                         data_fake.whilem_c = data->whilem_c;
3951                         data_fake.last_closep = data->last_closep;
3952                     }
3953                     else
3954                         data_fake.last_closep = &fake;
3955 		    data_fake.pos_delta = delta;
3956                     if (flags & SCF_DO_STCLASS) {
3957                         cl_init(pRExC_state, &this_class);
3958                         data_fake.start_class = &this_class;
3959                         f = SCF_DO_STCLASS_AND;
3960                     }
3961                     if (flags & SCF_WHILEM_VISITED_POS)
3962                         f |= SCF_WHILEM_VISITED_POS;
3963 
3964                     if (trie->jump[word]) {
3965                         if (!nextbranch)
3966                             nextbranch = trie_node + trie->jump[0];
3967                         scan= trie_node + trie->jump[word];
3968                         /* We go from the jump point to the branch that follows
3969                            it. Note this means we need the vestigal unused branches
3970                            even though they arent otherwise used.
3971                          */
3972                         minnext = study_chunk(pRExC_state, &scan, minlenp,
3973                             &deltanext, (regnode *)nextbranch, &data_fake,
3974                             stopparen, recursed, NULL, f,depth+1);
3975                     }
3976                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3977                         nextbranch= regnext((regnode*)nextbranch);
3978 
3979                     if (min1 > (I32)(minnext + trie->minlen))
3980                         min1 = minnext + trie->minlen;
3981                     if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3982                         max1 = minnext + deltanext + trie->maxlen;
3983                     if (deltanext == I32_MAX)
3984                         is_inf = is_inf_internal = 1;
3985 
3986                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3987                         pars++;
3988                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
3989                         if ( stopmin > min + min1)
3990 	                    stopmin = min + min1;
3991 	                flags &= ~SCF_DO_SUBSTR;
3992 	                if (data)
3993 	                    data->flags |= SCF_SEEN_ACCEPT;
3994 	            }
3995                     if (data) {
3996                         if (data_fake.flags & SF_HAS_EVAL)
3997                             data->flags |= SF_HAS_EVAL;
3998                         data->whilem_c = data_fake.whilem_c;
3999                     }
4000                     if (flags & SCF_DO_STCLASS)
4001                         cl_or(pRExC_state, &accum, &this_class);
4002                 }
4003             }
4004             if (flags & SCF_DO_SUBSTR) {
4005                 data->pos_min += min1;
4006                 data->pos_delta += max1 - min1;
4007                 if (max1 != min1 || is_inf)
4008                     data->longest = &(data->longest_float);
4009             }
4010             min += min1;
4011             delta += max1 - min1;
4012             if (flags & SCF_DO_STCLASS_OR) {
4013                 cl_or(pRExC_state, data->start_class, &accum);
4014                 if (min1) {
4015                     cl_and(data->start_class, and_withp);
4016                     flags &= ~SCF_DO_STCLASS;
4017                 }
4018             }
4019             else if (flags & SCF_DO_STCLASS_AND) {
4020                 if (min1) {
4021                     cl_and(data->start_class, &accum);
4022                     flags &= ~SCF_DO_STCLASS;
4023                 }
4024                 else {
4025                     /* Switch to OR mode: cache the old value of
4026                      * data->start_class */
4027 		    INIT_AND_WITHP;
4028                     StructCopy(data->start_class, and_withp,
4029                                struct regnode_charclass_class);
4030                     flags &= ~SCF_DO_STCLASS_AND;
4031                     StructCopy(&accum, data->start_class,
4032                                struct regnode_charclass_class);
4033                     flags |= SCF_DO_STCLASS_OR;
4034                     data->start_class->flags |= ANYOF_EOS;
4035                 }
4036             }
4037             scan= tail;
4038             continue;
4039         }
4040 #else
4041 	else if (PL_regkind[OP(scan)] == TRIE) {
4042 	    reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4043 	    U8*bang=NULL;
4044 
4045 	    min += trie->minlen;
4046 	    delta += (trie->maxlen - trie->minlen);
4047 	    flags &= ~SCF_DO_STCLASS; /* xxx */
4048             if (flags & SCF_DO_SUBSTR) {
4049     	        SCAN_COMMIT(pRExC_state,data,minlenp);	/* Cannot expect anything... */
4050     	        data->pos_min += trie->minlen;
4051     	        data->pos_delta += (trie->maxlen - trie->minlen);
4052 		if (trie->maxlen != trie->minlen)
4053 		    data->longest = &(data->longest_float);
4054     	    }
4055     	    if (trie->jump) /* no more substrings -- for now /grr*/
4056     	        flags &= ~SCF_DO_SUBSTR;
4057 	}
4058 #endif /* old or new */
4059 #endif /* TRIE_STUDY_OPT */
4060 
4061 	/* Else: zero-length, ignore. */
4062 	scan = regnext(scan);
4063     }
4064     if (frame) {
4065         last = frame->last;
4066         scan = frame->next;
4067         stopparen = frame->stop;
4068         frame = frame->prev;
4069         goto fake_study_recurse;
4070     }
4071 
4072   finish:
4073     assert(!frame);
4074     DEBUG_STUDYDATA("pre-fin:",data,depth);
4075 
4076     *scanp = scan;
4077     *deltap = is_inf_internal ? I32_MAX : delta;
4078     if (flags & SCF_DO_SUBSTR && is_inf)
4079 	data->pos_delta = I32_MAX - data->pos_min;
4080     if (is_par > (I32)U8_MAX)
4081 	is_par = 0;
4082     if (is_par && pars==1 && data) {
4083 	data->flags |= SF_IN_PAR;
4084 	data->flags &= ~SF_HAS_PAR;
4085     }
4086     else if (pars && data) {
4087 	data->flags |= SF_HAS_PAR;
4088 	data->flags &= ~SF_IN_PAR;
4089     }
4090     if (flags & SCF_DO_STCLASS_OR)
4091 	cl_and(data->start_class, and_withp);
4092     if (flags & SCF_TRIE_RESTUDY)
4093         data->flags |= 	SCF_TRIE_RESTUDY;
4094 
4095     DEBUG_STUDYDATA("post-fin:",data,depth);
4096 
4097     return min < stopmin ? min : stopmin;
4098 }
4099 
4100 STATIC U32
4101 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4102 {
4103     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4104 
4105     PERL_ARGS_ASSERT_ADD_DATA;
4106 
4107     Renewc(RExC_rxi->data,
4108 	   sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4109 	   char, struct reg_data);
4110     if(count)
4111 	Renew(RExC_rxi->data->what, count + n, U8);
4112     else
4113 	Newx(RExC_rxi->data->what, n, U8);
4114     RExC_rxi->data->count = count + n;
4115     Copy(s, RExC_rxi->data->what + count, n, U8);
4116     return count;
4117 }
4118 
4119 /*XXX: todo make this not included in a non debugging perl */
4120 #ifndef PERL_IN_XSUB_RE
4121 void
4122 Perl_reginitcolors(pTHX)
4123 {
4124     dVAR;
4125     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4126     if (s) {
4127 	char *t = savepv(s);
4128 	int i = 0;
4129 	PL_colors[0] = t;
4130 	while (++i < 6) {
4131 	    t = strchr(t, '\t');
4132 	    if (t) {
4133 		*t = '\0';
4134 		PL_colors[i] = ++t;
4135 	    }
4136 	    else
4137 		PL_colors[i] = t = (char *)"";
4138 	}
4139     } else {
4140 	int i = 0;
4141 	while (i < 6)
4142 	    PL_colors[i++] = (char *)"";
4143     }
4144     PL_colorset = 1;
4145 }
4146 #endif
4147 
4148 
4149 #ifdef TRIE_STUDY_OPT
4150 #define CHECK_RESTUDY_GOTO                                  \
4151         if (                                                \
4152               (data.flags & SCF_TRIE_RESTUDY)               \
4153               && ! restudied++                              \
4154         )     goto reStudy
4155 #else
4156 #define CHECK_RESTUDY_GOTO
4157 #endif
4158 
4159 /*
4160  - pregcomp - compile a regular expression into internal code
4161  *
4162  * We can't allocate space until we know how big the compiled form will be,
4163  * but we can't compile it (and thus know how big it is) until we've got a
4164  * place to put the code.  So we cheat:  we compile it twice, once with code
4165  * generation turned off and size counting turned on, and once "for real".
4166  * This also means that we don't allocate space until we are sure that the
4167  * thing really will compile successfully, and we never have to move the
4168  * code and thus invalidate pointers into it.  (Note that it has to be in
4169  * one piece because free() must be able to free it all.) [NB: not true in perl]
4170  *
4171  * Beware that the optimization-preparation code in here knows about some
4172  * of the structure of the compiled regexp.  [I'll say.]
4173  */
4174 
4175 
4176 
4177 #ifndef PERL_IN_XSUB_RE
4178 #define RE_ENGINE_PTR &PL_core_reg_engine
4179 #else
4180 extern const struct regexp_engine my_reg_engine;
4181 #define RE_ENGINE_PTR &my_reg_engine
4182 #endif
4183 
4184 #ifndef PERL_IN_XSUB_RE
4185 REGEXP *
4186 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4187 {
4188     dVAR;
4189     HV * const table = GvHV(PL_hintgv);
4190 
4191     PERL_ARGS_ASSERT_PREGCOMP;
4192 
4193     /* Dispatch a request to compile a regexp to correct
4194        regexp engine. */
4195     if (table) {
4196         SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4197         GET_RE_DEBUG_FLAGS_DECL;
4198         if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4199             const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4200             DEBUG_COMPILE_r({
4201                 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4202                     SvIV(*ptr));
4203             });
4204             return CALLREGCOMP_ENG(eng, pattern, flags);
4205         }
4206     }
4207     return Perl_re_compile(aTHX_ pattern, flags);
4208 }
4209 #endif
4210 
4211 REGEXP *
4212 Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
4213 {
4214     dVAR;
4215     REGEXP *rx;
4216     struct regexp *r;
4217     register regexp_internal *ri;
4218     STRLEN plen;
4219     char  *exp = SvPV(pattern, plen);
4220     char* xend = exp + plen;
4221     regnode *scan;
4222     I32 flags;
4223     I32 minlen = 0;
4224     I32 sawplus = 0;
4225     I32 sawopen = 0;
4226     scan_data_t data;
4227     RExC_state_t RExC_state;
4228     RExC_state_t * const pRExC_state = &RExC_state;
4229 #ifdef TRIE_STUDY_OPT
4230     int restudied= 0;
4231     RExC_state_t copyRExC_state;
4232 #endif
4233     GET_RE_DEBUG_FLAGS_DECL;
4234 
4235     PERL_ARGS_ASSERT_RE_COMPILE;
4236 
4237     DEBUG_r(if (!PL_colorset) reginitcolors());
4238 
4239     RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
4240 
4241     DEBUG_COMPILE_r({
4242         SV *dsv= sv_newmortal();
4243         RE_PV_QUOTED_DECL(s, RExC_utf8,
4244             dsv, exp, plen, 60);
4245         PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4246 		       PL_colors[4],PL_colors[5],s);
4247     });
4248 
4249 redo_first_pass:
4250     RExC_precomp = exp;
4251     RExC_flags = pm_flags;
4252     RExC_sawback = 0;
4253 
4254     RExC_seen = 0;
4255     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4256     RExC_seen_evals = 0;
4257     RExC_extralen = 0;
4258 
4259     /* First pass: determine size, legality. */
4260     RExC_parse = exp;
4261     RExC_start = exp;
4262     RExC_end = xend;
4263     RExC_naughty = 0;
4264     RExC_npar = 1;
4265     RExC_nestroot = 0;
4266     RExC_size = 0L;
4267     RExC_emit = &PL_regdummy;
4268     RExC_whilem_seen = 0;
4269     RExC_open_parens = NULL;
4270     RExC_close_parens = NULL;
4271     RExC_opend = NULL;
4272     RExC_paren_names = NULL;
4273 #ifdef DEBUGGING
4274     RExC_paren_name_list = NULL;
4275 #endif
4276     RExC_recurse = NULL;
4277     RExC_recurse_count = 0;
4278 
4279 #if 0 /* REGC() is (currently) a NOP at the first pass.
4280        * Clever compilers notice this and complain. --jhi */
4281     REGC((U8)REG_MAGIC, (char*)RExC_emit);
4282 #endif
4283     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4284     if (reg(pRExC_state, 0, &flags,1) == NULL) {
4285 	RExC_precomp = NULL;
4286 	return(NULL);
4287     }
4288     if (RExC_utf8 && !RExC_orig_utf8) {
4289         /* It's possible to write a regexp in ascii that represents Unicode
4290         codepoints outside of the byte range, such as via \x{100}. If we
4291         detect such a sequence we have to convert the entire pattern to utf8
4292         and then recompile, as our sizing calculation will have been based
4293         on 1 byte == 1 character, but we will need to use utf8 to encode
4294         at least some part of the pattern, and therefore must convert the whole
4295         thing.
4296         XXX: somehow figure out how to make this less expensive...
4297         -- dmq */
4298         STRLEN len = plen;
4299         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4300 	    "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4301         exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len);
4302         xend = exp + len;
4303         RExC_orig_utf8 = RExC_utf8;
4304         SAVEFREEPV(exp);
4305         goto redo_first_pass;
4306     }
4307     DEBUG_PARSE_r({
4308         PerlIO_printf(Perl_debug_log,
4309             "Required size %"IVdf" nodes\n"
4310             "Starting second pass (creation)\n",
4311             (IV)RExC_size);
4312         RExC_lastnum=0;
4313         RExC_lastparse=NULL;
4314     });
4315     /* Small enough for pointer-storage convention?
4316        If extralen==0, this means that we will not need long jumps. */
4317     if (RExC_size >= 0x10000L && RExC_extralen)
4318         RExC_size += RExC_extralen;
4319     else
4320 	RExC_extralen = 0;
4321     if (RExC_whilem_seen > 15)
4322 	RExC_whilem_seen = 15;
4323 
4324     /* Allocate space and zero-initialize. Note, the two step process
4325        of zeroing when in debug mode, thus anything assigned has to
4326        happen after that */
4327     rx = (REGEXP*) newSV_type(SVt_REGEXP);
4328     r = (struct regexp*)SvANY(rx);
4329     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4330 	 char, regexp_internal);
4331     if ( r == NULL || ri == NULL )
4332 	FAIL("Regexp out of space");
4333 #ifdef DEBUGGING
4334     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4335     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4336 #else
4337     /* bulk initialize base fields with 0. */
4338     Zero(ri, sizeof(regexp_internal), char);
4339 #endif
4340 
4341     /* non-zero initialization begins here */
4342     RXi_SET( r, ri );
4343     r->engine= RE_ENGINE_PTR;
4344     r->extflags = pm_flags;
4345     {
4346         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4347 	bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD);
4348 	bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4349 	U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4350 			    >> RXf_PMf_STD_PMMOD_SHIFT);
4351 	const char *fptr = STD_PAT_MODS;        /*"msix"*/
4352 	char *p;
4353 	const STRLEN wraplen = plen + has_minus + has_p + has_runon
4354             + (sizeof(STD_PAT_MODS) - 1)
4355             + (sizeof("(?:)") - 1);
4356 
4357 	p = sv_grow(MUTABLE_SV(rx), wraplen + 1);
4358 	SvCUR_set(rx, wraplen);
4359 	SvPOK_on(rx);
4360 	SvFLAGS(rx) |= SvUTF8(pattern);
4361         *p++='('; *p++='?';
4362         if (has_p)
4363             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
4364         {
4365             char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1;
4366             char *colon = r + 1;
4367             char ch;
4368 
4369             while((ch = *fptr++)) {
4370                 if(reganch & 1)
4371                     *p++ = ch;
4372                 else
4373                     *r-- = ch;
4374                 reganch >>= 1;
4375             }
4376             if(has_minus) {
4377                 *r = '-';
4378                 p = colon;
4379             }
4380         }
4381 
4382         *p++ = ':';
4383         Copy(RExC_precomp, p, plen, char);
4384 	assert ((RX_WRAPPED(rx) - p) < 16);
4385 	r->pre_prefix = p - RX_WRAPPED(rx);
4386         p += plen;
4387         if (has_runon)
4388             *p++ = '\n';
4389         *p++ = ')';
4390         *p = 0;
4391     }
4392 
4393     r->intflags = 0;
4394     r->nparens = RExC_npar - 1;	/* set early to validate backrefs */
4395 
4396     if (RExC_seen & REG_SEEN_RECURSE) {
4397         Newxz(RExC_open_parens, RExC_npar,regnode *);
4398         SAVEFREEPV(RExC_open_parens);
4399         Newxz(RExC_close_parens,RExC_npar,regnode *);
4400         SAVEFREEPV(RExC_close_parens);
4401     }
4402 
4403     /* Useful during FAIL. */
4404 #ifdef RE_TRACK_PATTERN_OFFSETS
4405     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4406     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4407                           "%s %"UVuf" bytes for offset annotations.\n",
4408                           ri->u.offsets ? "Got" : "Couldn't get",
4409                           (UV)((2*RExC_size+1) * sizeof(U32))));
4410 #endif
4411     SetProgLen(ri,RExC_size);
4412     RExC_rx_sv = rx;
4413     RExC_rx = r;
4414     RExC_rxi = ri;
4415 
4416     /* Second pass: emit code. */
4417     RExC_flags = pm_flags;	/* don't let top level (?i) bleed */
4418     RExC_parse = exp;
4419     RExC_end = xend;
4420     RExC_naughty = 0;
4421     RExC_npar = 1;
4422     RExC_emit_start = ri->program;
4423     RExC_emit = ri->program;
4424     RExC_emit_bound = ri->program + RExC_size + 1;
4425 
4426     /* Store the count of eval-groups for security checks: */
4427     RExC_rx->seen_evals = RExC_seen_evals;
4428     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4429     if (reg(pRExC_state, 0, &flags,1) == NULL) {
4430 	ReREFCNT_dec(rx);
4431 	return(NULL);
4432     }
4433     /* XXXX To minimize changes to RE engine we always allocate
4434        3-units-long substrs field. */
4435     Newx(r->substrs, 1, struct reg_substr_data);
4436     if (RExC_recurse_count) {
4437         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4438         SAVEFREEPV(RExC_recurse);
4439     }
4440 
4441 reStudy:
4442     r->minlen = minlen = sawplus = sawopen = 0;
4443     Zero(r->substrs, 1, struct reg_substr_data);
4444 
4445 #ifdef TRIE_STUDY_OPT
4446     if (!restudied) {
4447         StructCopy(&zero_scan_data, &data, scan_data_t);
4448         copyRExC_state = RExC_state;
4449     } else {
4450         U32 seen=RExC_seen;
4451         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4452 
4453         RExC_state = copyRExC_state;
4454         if (seen & REG_TOP_LEVEL_BRANCHES)
4455             RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4456         else
4457             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4458         if (data.last_found) {
4459             SvREFCNT_dec(data.longest_fixed);
4460 	    SvREFCNT_dec(data.longest_float);
4461 	    SvREFCNT_dec(data.last_found);
4462 	}
4463 	StructCopy(&zero_scan_data, &data, scan_data_t);
4464     }
4465 #else
4466     StructCopy(&zero_scan_data, &data, scan_data_t);
4467 #endif
4468 
4469     /* Dig out information for optimizations. */
4470     r->extflags = RExC_flags; /* was pm_op */
4471     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4472 
4473     if (UTF)
4474 	SvUTF8_on(rx);	/* Unicode in it? */
4475     ri->regstclass = NULL;
4476     if (RExC_naughty >= 10)	/* Probably an expensive pattern. */
4477 	r->intflags |= PREGf_NAUGHTY;
4478     scan = ri->program + 1;		/* First BRANCH. */
4479 
4480     /* testing for BRANCH here tells us whether there is "must appear"
4481        data in the pattern. If there is then we can use it for optimisations */
4482     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
4483 	I32 fake;
4484 	STRLEN longest_float_length, longest_fixed_length;
4485 	struct regnode_charclass_class ch_class; /* pointed to by data */
4486 	int stclass_flag;
4487 	I32 last_close = 0; /* pointed to by data */
4488         regnode *first= scan;
4489         regnode *first_next= regnext(first);
4490 
4491 	/*
4492 	 * Skip introductions and multiplicators >= 1
4493 	 * so that we can extract the 'meat' of the pattern that must
4494 	 * match in the large if() sequence following.
4495 	 * NOTE that EXACT is NOT covered here, as it is normally
4496 	 * picked up by the optimiser separately.
4497 	 *
4498 	 * This is unfortunate as the optimiser isnt handling lookahead
4499 	 * properly currently.
4500 	 *
4501 	 */
4502 	while ((OP(first) == OPEN && (sawopen = 1)) ||
4503 	       /* An OR of *one* alternative - should not happen now. */
4504 	    (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
4505 	    /* for now we can't handle lookbehind IFMATCH*/
4506 	    (OP(first) == IFMATCH && !first->flags) ||
4507 	    (OP(first) == PLUS) ||
4508 	    (OP(first) == MINMOD) ||
4509 	       /* An {n,m} with n>0 */
4510 	    (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4511 	    (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
4512 	{
4513 		/*
4514 		 * the only op that could be a regnode is PLUS, all the rest
4515 		 * will be regnode_1 or regnode_2.
4516 		 *
4517 		 */
4518 		if (OP(first) == PLUS)
4519 		    sawplus = 1;
4520 		else
4521 		    first += regarglen[OP(first)];
4522 
4523 		first = NEXTOPER(first);
4524 		first_next= regnext(first);
4525 	}
4526 
4527 	/* Starting-point info. */
4528       again:
4529         DEBUG_PEEP("first:",first,0);
4530         /* Ignore EXACT as we deal with it later. */
4531 	if (PL_regkind[OP(first)] == EXACT) {
4532 	    if (OP(first) == EXACT)
4533 		NOOP;	/* Empty, get anchored substr later. */
4534 	    else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
4535 		ri->regstclass = first;
4536 	}
4537 #ifdef TRIE_STCLASS
4538 	else if (PL_regkind[OP(first)] == TRIE &&
4539 	        ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4540 	{
4541 	    regnode *trie_op;
4542 	    /* this can happen only on restudy */
4543 	    if ( OP(first) == TRIE ) {
4544                 struct regnode_1 *trieop = (struct regnode_1 *)
4545 		    PerlMemShared_calloc(1, sizeof(struct regnode_1));
4546                 StructCopy(first,trieop,struct regnode_1);
4547                 trie_op=(regnode *)trieop;
4548             } else {
4549                 struct regnode_charclass *trieop = (struct regnode_charclass *)
4550 		    PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4551                 StructCopy(first,trieop,struct regnode_charclass);
4552                 trie_op=(regnode *)trieop;
4553             }
4554             OP(trie_op)+=2;
4555             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4556 	    ri->regstclass = trie_op;
4557 	}
4558 #endif
4559 	else if (strchr((const char*)PL_simple,OP(first)))
4560 	    ri->regstclass = first;
4561 	else if (PL_regkind[OP(first)] == BOUND ||
4562 		 PL_regkind[OP(first)] == NBOUND)
4563 	    ri->regstclass = first;
4564 	else if (PL_regkind[OP(first)] == BOL) {
4565 	    r->extflags |= (OP(first) == MBOL
4566 			   ? RXf_ANCH_MBOL
4567 			   : (OP(first) == SBOL
4568 			      ? RXf_ANCH_SBOL
4569 			      : RXf_ANCH_BOL));
4570 	    first = NEXTOPER(first);
4571 	    goto again;
4572 	}
4573 	else if (OP(first) == GPOS) {
4574 	    r->extflags |= RXf_ANCH_GPOS;
4575 	    first = NEXTOPER(first);
4576 	    goto again;
4577 	}
4578 	else if ((!sawopen || !RExC_sawback) &&
4579 	    (OP(first) == STAR &&
4580 	    PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4581 	    !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4582 	{
4583 	    /* turn .* into ^.* with an implied $*=1 */
4584 	    const int type =
4585 		(OP(NEXTOPER(first)) == REG_ANY)
4586 		    ? RXf_ANCH_MBOL
4587 		    : RXf_ANCH_SBOL;
4588 	    r->extflags |= type;
4589 	    r->intflags |= PREGf_IMPLICIT;
4590 	    first = NEXTOPER(first);
4591 	    goto again;
4592 	}
4593 	if (sawplus && (!sawopen || !RExC_sawback)
4594 	    && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4595 	    /* x+ must match at the 1st pos of run of x's */
4596 	    r->intflags |= PREGf_SKIP;
4597 
4598 	/* Scan is after the zeroth branch, first is atomic matcher. */
4599 #ifdef TRIE_STUDY_OPT
4600 	DEBUG_PARSE_r(
4601 	    if (!restudied)
4602 	        PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4603 			      (IV)(first - scan + 1))
4604         );
4605 #else
4606 	DEBUG_PARSE_r(
4607 	    PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4608 	        (IV)(first - scan + 1))
4609         );
4610 #endif
4611 
4612 
4613 	/*
4614 	* If there's something expensive in the r.e., find the
4615 	* longest literal string that must appear and make it the
4616 	* regmust.  Resolve ties in favor of later strings, since
4617 	* the regstart check works with the beginning of the r.e.
4618 	* and avoiding duplication strengthens checking.  Not a
4619 	* strong reason, but sufficient in the absence of others.
4620 	* [Now we resolve ties in favor of the earlier string if
4621 	* it happens that c_offset_min has been invalidated, since the
4622 	* earlier string may buy us something the later one won't.]
4623 	*/
4624 
4625 	data.longest_fixed = newSVpvs("");
4626 	data.longest_float = newSVpvs("");
4627 	data.last_found = newSVpvs("");
4628 	data.longest = &(data.longest_fixed);
4629 	first = scan;
4630 	if (!ri->regstclass) {
4631 	    cl_init(pRExC_state, &ch_class);
4632 	    data.start_class = &ch_class;
4633 	    stclass_flag = SCF_DO_STCLASS_AND;
4634 	} else				/* XXXX Check for BOUND? */
4635 	    stclass_flag = 0;
4636 	data.last_closep = &last_close;
4637 
4638 	minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
4639             &data, -1, NULL, NULL,
4640             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
4641 
4642 
4643         CHECK_RESTUDY_GOTO;
4644 
4645 
4646 	if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
4647 	     && data.last_start_min == 0 && data.last_end > 0
4648 	     && !RExC_seen_zerolen
4649 	     && !(RExC_seen & REG_SEEN_VERBARG)
4650 	     && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4651 	    r->extflags |= RXf_CHECK_ALL;
4652 	scan_commit(pRExC_state, &data,&minlen,0);
4653 	SvREFCNT_dec(data.last_found);
4654 
4655         /* Note that code very similar to this but for anchored string
4656            follows immediately below, changes may need to be made to both.
4657            Be careful.
4658          */
4659 	longest_float_length = CHR_SVLEN(data.longest_float);
4660 	if (longest_float_length
4661 	    || (data.flags & SF_FL_BEFORE_EOL
4662 		&& (!(data.flags & SF_FL_BEFORE_MEOL)
4663 		    || (RExC_flags & RXf_PMf_MULTILINE))))
4664         {
4665             I32 t,ml;
4666 
4667 	    if (SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
4668 		&& data.offset_fixed == data.offset_float_min
4669 		&& SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4670 		    goto remove_float;		/* As in (a)+. */
4671 
4672             /* copy the information about the longest float from the reg_scan_data
4673                over to the program. */
4674 	    if (SvUTF8(data.longest_float)) {
4675 		r->float_utf8 = data.longest_float;
4676 		r->float_substr = NULL;
4677 	    } else {
4678 		r->float_substr = data.longest_float;
4679 		r->float_utf8 = NULL;
4680 	    }
4681 	    /* float_end_shift is how many chars that must be matched that
4682 	       follow this item. We calculate it ahead of time as once the
4683 	       lookbehind offset is added in we lose the ability to correctly
4684 	       calculate it.*/
4685 	    ml = data.minlen_float ? *(data.minlen_float)
4686 	                           : (I32)longest_float_length;
4687 	    r->float_end_shift = ml - data.offset_float_min
4688 	        - longest_float_length + (SvTAIL(data.longest_float) != 0)
4689 	        + data.lookbehind_float;
4690 	    r->float_min_offset = data.offset_float_min - data.lookbehind_float;
4691 	    r->float_max_offset = data.offset_float_max;
4692 	    if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
4693 	        r->float_max_offset -= data.lookbehind_float;
4694 
4695 	    t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4696 		       && (!(data.flags & SF_FL_BEFORE_MEOL)
4697 			   || (RExC_flags & RXf_PMf_MULTILINE)));
4698 	    fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
4699 	}
4700 	else {
4701 	  remove_float:
4702 	    r->float_substr = r->float_utf8 = NULL;
4703 	    SvREFCNT_dec(data.longest_float);
4704 	    longest_float_length = 0;
4705 	}
4706 
4707         /* Note that code very similar to this but for floating string
4708            is immediately above, changes may need to be made to both.
4709            Be careful.
4710          */
4711 	longest_fixed_length = CHR_SVLEN(data.longest_fixed);
4712 	if (longest_fixed_length
4713 	    || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4714 		&& (!(data.flags & SF_FIX_BEFORE_MEOL)
4715 		    || (RExC_flags & RXf_PMf_MULTILINE))))
4716         {
4717             I32 t,ml;
4718 
4719             /* copy the information about the longest fixed
4720                from the reg_scan_data over to the program. */
4721 	    if (SvUTF8(data.longest_fixed)) {
4722 		r->anchored_utf8 = data.longest_fixed;
4723 		r->anchored_substr = NULL;
4724 	    } else {
4725 		r->anchored_substr = data.longest_fixed;
4726 		r->anchored_utf8 = NULL;
4727 	    }
4728 	    /* fixed_end_shift is how many chars that must be matched that
4729 	       follow this item. We calculate it ahead of time as once the
4730 	       lookbehind offset is added in we lose the ability to correctly
4731 	       calculate it.*/
4732             ml = data.minlen_fixed ? *(data.minlen_fixed)
4733                                    : (I32)longest_fixed_length;
4734             r->anchored_end_shift = ml - data.offset_fixed
4735 	        - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4736 	        + data.lookbehind_fixed;
4737 	    r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4738 
4739 	    t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4740 		 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4741 		     || (RExC_flags & RXf_PMf_MULTILINE)));
4742 	    fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
4743 	}
4744 	else {
4745 	    r->anchored_substr = r->anchored_utf8 = NULL;
4746 	    SvREFCNT_dec(data.longest_fixed);
4747 	    longest_fixed_length = 0;
4748 	}
4749 	if (ri->regstclass
4750 	    && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4751 	    ri->regstclass = NULL;
4752 	if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4753 	    && stclass_flag
4754 	    && !(data.start_class->flags & ANYOF_EOS)
4755 	    && !cl_is_anything(data.start_class))
4756 	{
4757 	    const U32 n = add_data(pRExC_state, 1, "f");
4758 
4759 	    Newx(RExC_rxi->data->data[n], 1,
4760 		struct regnode_charclass_class);
4761 	    StructCopy(data.start_class,
4762 		       (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4763 		       struct regnode_charclass_class);
4764 	    ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4765 	    r->intflags &= ~PREGf_SKIP;	/* Used in find_byclass(). */
4766 	    DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
4767 	              regprop(r, sv, (regnode*)data.start_class);
4768 		      PerlIO_printf(Perl_debug_log,
4769 				    "synthetic stclass \"%s\".\n",
4770 				    SvPVX_const(sv));});
4771 	}
4772 
4773 	/* A temporary algorithm prefers floated substr to fixed one to dig more info. */
4774 	if (longest_fixed_length > longest_float_length) {
4775 	    r->check_end_shift = r->anchored_end_shift;
4776 	    r->check_substr = r->anchored_substr;
4777 	    r->check_utf8 = r->anchored_utf8;
4778 	    r->check_offset_min = r->check_offset_max = r->anchored_offset;
4779 	    if (r->extflags & RXf_ANCH_SINGLE)
4780 		r->extflags |= RXf_NOSCAN;
4781 	}
4782 	else {
4783 	    r->check_end_shift = r->float_end_shift;
4784 	    r->check_substr = r->float_substr;
4785 	    r->check_utf8 = r->float_utf8;
4786 	    r->check_offset_min = r->float_min_offset;
4787 	    r->check_offset_max = r->float_max_offset;
4788 	}
4789 	/* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4790 	   This should be changed ASAP!  */
4791 	if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4792 	    r->extflags |= RXf_USE_INTUIT;
4793 	    if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
4794 		r->extflags |= RXf_INTUIT_TAIL;
4795 	}
4796 	/* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4797 	if ( (STRLEN)minlen < longest_float_length )
4798             minlen= longest_float_length;
4799         if ( (STRLEN)minlen < longest_fixed_length )
4800             minlen= longest_fixed_length;
4801         */
4802     }
4803     else {
4804 	/* Several toplevels. Best we can is to set minlen. */
4805 	I32 fake;
4806 	struct regnode_charclass_class ch_class;
4807 	I32 last_close = 0;
4808 
4809 	DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
4810 
4811 	scan = ri->program + 1;
4812 	cl_init(pRExC_state, &ch_class);
4813 	data.start_class = &ch_class;
4814 	data.last_closep = &last_close;
4815 
4816 
4817 	minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
4818 	    &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
4819 
4820         CHECK_RESTUDY_GOTO;
4821 
4822 	r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
4823 		= r->float_substr = r->float_utf8 = NULL;
4824 	if (!(data.start_class->flags & ANYOF_EOS)
4825 	    && !cl_is_anything(data.start_class))
4826 	{
4827 	    const U32 n = add_data(pRExC_state, 1, "f");
4828 
4829 	    Newx(RExC_rxi->data->data[n], 1,
4830 		struct regnode_charclass_class);
4831 	    StructCopy(data.start_class,
4832 		       (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4833 		       struct regnode_charclass_class);
4834 	    ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4835 	    r->intflags &= ~PREGf_SKIP;	/* Used in find_byclass(). */
4836 	    DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
4837 	              regprop(r, sv, (regnode*)data.start_class);
4838 		      PerlIO_printf(Perl_debug_log,
4839 				    "synthetic stclass \"%s\".\n",
4840 				    SvPVX_const(sv));});
4841 	}
4842     }
4843 
4844     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4845        the "real" pattern. */
4846     DEBUG_OPTIMISE_r({
4847 	PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
4848 		      (IV)minlen, (IV)r->minlen);
4849     });
4850     r->minlenret = minlen;
4851     if (r->minlen < minlen)
4852         r->minlen = minlen;
4853 
4854     if (RExC_seen & REG_SEEN_GPOS)
4855 	r->extflags |= RXf_GPOS_SEEN;
4856     if (RExC_seen & REG_SEEN_LOOKBEHIND)
4857 	r->extflags |= RXf_LOOKBEHIND_SEEN;
4858     if (RExC_seen & REG_SEEN_EVAL)
4859 	r->extflags |= RXf_EVAL_SEEN;
4860     if (RExC_seen & REG_SEEN_CANY)
4861 	r->extflags |= RXf_CANY_SEEN;
4862     if (RExC_seen & REG_SEEN_VERBARG)
4863 	r->intflags |= PREGf_VERBARG_SEEN;
4864     if (RExC_seen & REG_SEEN_CUTGROUP)
4865 	r->intflags |= PREGf_CUTGROUP_SEEN;
4866     if (RExC_paren_names)
4867         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
4868     else
4869         RXp_PAREN_NAMES(r) = NULL;
4870 
4871 #ifdef STUPID_PATTERN_CHECKS
4872     if (RX_PRELEN(rx) == 0)
4873         r->extflags |= RXf_NULL;
4874     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
4875         /* XXX: this should happen BEFORE we compile */
4876         r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
4877     else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
4878         r->extflags |= RXf_WHITE;
4879     else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
4880         r->extflags |= RXf_START_ONLY;
4881 #else
4882     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
4883             /* XXX: this should happen BEFORE we compile */
4884             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
4885     else {
4886         regnode *first = ri->program + 1;
4887         U8 fop = OP(first);
4888         U8 nop = OP(NEXTOPER(first));
4889 
4890         if (PL_regkind[fop] == NOTHING && nop == END)
4891             r->extflags |= RXf_NULL;
4892         else if (PL_regkind[fop] == BOL && nop == END)
4893             r->extflags |= RXf_START_ONLY;
4894         else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
4895             r->extflags |= RXf_WHITE;
4896     }
4897 #endif
4898 #ifdef DEBUGGING
4899     if (RExC_paren_names) {
4900         ri->name_list_idx = add_data( pRExC_state, 1, "p" );
4901         ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
4902     } else
4903 #endif
4904         ri->name_list_idx = 0;
4905 
4906     if (RExC_recurse_count) {
4907         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4908             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4909             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4910         }
4911     }
4912     Newxz(r->offs, RExC_npar, regexp_paren_pair);
4913     /* assume we don't need to swap parens around before we match */
4914 
4915     DEBUG_DUMP_r({
4916         PerlIO_printf(Perl_debug_log,"Final program:\n");
4917         regdump(r);
4918     });
4919 #ifdef RE_TRACK_PATTERN_OFFSETS
4920     DEBUG_OFFSETS_r(if (ri->u.offsets) {
4921         const U32 len = ri->u.offsets[0];
4922         U32 i;
4923         GET_RE_DEBUG_FLAGS_DECL;
4924         PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
4925         for (i = 1; i <= len; i++) {
4926             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
4927                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
4928                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
4929             }
4930         PerlIO_printf(Perl_debug_log, "\n");
4931     });
4932 #endif
4933     return rx;
4934 }
4935 
4936 #undef RE_ENGINE_PTR
4937 
4938 
4939 SV*
4940 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
4941                     const U32 flags)
4942 {
4943     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
4944 
4945     PERL_UNUSED_ARG(value);
4946 
4947     if (flags & RXapif_FETCH) {
4948         return reg_named_buff_fetch(rx, key, flags);
4949     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
4950         Perl_croak(aTHX_ "%s", PL_no_modify);
4951         return NULL;
4952     } else if (flags & RXapif_EXISTS) {
4953         return reg_named_buff_exists(rx, key, flags)
4954             ? &PL_sv_yes
4955             : &PL_sv_no;
4956     } else if (flags & RXapif_REGNAMES) {
4957         return reg_named_buff_all(rx, flags);
4958     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
4959         return reg_named_buff_scalar(rx, flags);
4960     } else {
4961         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
4962         return NULL;
4963     }
4964 }
4965 
4966 SV*
4967 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
4968                          const U32 flags)
4969 {
4970     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
4971     PERL_UNUSED_ARG(lastkey);
4972 
4973     if (flags & RXapif_FIRSTKEY)
4974         return reg_named_buff_firstkey(rx, flags);
4975     else if (flags & RXapif_NEXTKEY)
4976         return reg_named_buff_nextkey(rx, flags);
4977     else {
4978         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
4979         return NULL;
4980     }
4981 }
4982 
4983 SV*
4984 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
4985 			  const U32 flags)
4986 {
4987     AV *retarray = NULL;
4988     SV *ret;
4989     struct regexp *const rx = (struct regexp *)SvANY(r);
4990 
4991     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
4992 
4993     if (flags & RXapif_ALL)
4994         retarray=newAV();
4995 
4996     if (rx && RXp_PAREN_NAMES(rx)) {
4997         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
4998         if (he_str) {
4999             IV i;
5000             SV* sv_dat=HeVAL(he_str);
5001             I32 *nums=(I32*)SvPVX(sv_dat);
5002             for ( i=0; i<SvIVX(sv_dat); i++ ) {
5003                 if ((I32)(rx->nparens) >= nums[i]
5004                     && rx->offs[nums[i]].start != -1
5005                     && rx->offs[nums[i]].end != -1)
5006                 {
5007                     ret = newSVpvs("");
5008                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
5009                     if (!retarray)
5010                         return ret;
5011                 } else {
5012                     ret = newSVsv(&PL_sv_undef);
5013                 }
5014                 if (retarray)
5015                     av_push(retarray, ret);
5016             }
5017             if (retarray)
5018                 return newRV_noinc(MUTABLE_SV(retarray));
5019         }
5020     }
5021     return NULL;
5022 }
5023 
5024 bool
5025 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
5026                            const U32 flags)
5027 {
5028     struct regexp *const rx = (struct regexp *)SvANY(r);
5029 
5030     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5031 
5032     if (rx && RXp_PAREN_NAMES(rx)) {
5033         if (flags & RXapif_ALL) {
5034             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5035         } else {
5036 	    SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
5037             if (sv) {
5038 		SvREFCNT_dec(sv);
5039                 return TRUE;
5040             } else {
5041                 return FALSE;
5042             }
5043         }
5044     } else {
5045         return FALSE;
5046     }
5047 }
5048 
5049 SV*
5050 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
5051 {
5052     struct regexp *const rx = (struct regexp *)SvANY(r);
5053 
5054     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5055 
5056     if ( rx && RXp_PAREN_NAMES(rx) ) {
5057 	(void)hv_iterinit(RXp_PAREN_NAMES(rx));
5058 
5059 	return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
5060     } else {
5061 	return FALSE;
5062     }
5063 }
5064 
5065 SV*
5066 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
5067 {
5068     struct regexp *const rx = (struct regexp *)SvANY(r);
5069     GET_RE_DEBUG_FLAGS_DECL;
5070 
5071     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5072 
5073     if (rx && RXp_PAREN_NAMES(rx)) {
5074         HV *hv = RXp_PAREN_NAMES(rx);
5075         HE *temphe;
5076         while ( (temphe = hv_iternext_flags(hv,0)) ) {
5077             IV i;
5078             IV parno = 0;
5079             SV* sv_dat = HeVAL(temphe);
5080             I32 *nums = (I32*)SvPVX(sv_dat);
5081             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5082                 if ((I32)(rx->lastparen) >= nums[i] &&
5083                     rx->offs[nums[i]].start != -1 &&
5084                     rx->offs[nums[i]].end != -1)
5085                 {
5086                     parno = nums[i];
5087                     break;
5088                 }
5089             }
5090             if (parno || flags & RXapif_ALL) {
5091 		return newSVhek(HeKEY_hek(temphe));
5092             }
5093         }
5094     }
5095     return NULL;
5096 }
5097 
5098 SV*
5099 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
5100 {
5101     SV *ret;
5102     AV *av;
5103     I32 length;
5104     struct regexp *const rx = (struct regexp *)SvANY(r);
5105 
5106     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5107 
5108     if (rx && RXp_PAREN_NAMES(rx)) {
5109         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5110             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5111         } else if (flags & RXapif_ONE) {
5112             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
5113             av = MUTABLE_AV(SvRV(ret));
5114             length = av_len(av);
5115 	    SvREFCNT_dec(ret);
5116             return newSViv(length + 1);
5117         } else {
5118             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5119             return NULL;
5120         }
5121     }
5122     return &PL_sv_undef;
5123 }
5124 
5125 SV*
5126 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
5127 {
5128     struct regexp *const rx = (struct regexp *)SvANY(r);
5129     AV *av = newAV();
5130 
5131     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5132 
5133     if (rx && RXp_PAREN_NAMES(rx)) {
5134         HV *hv= RXp_PAREN_NAMES(rx);
5135         HE *temphe;
5136         (void)hv_iterinit(hv);
5137         while ( (temphe = hv_iternext_flags(hv,0)) ) {
5138             IV i;
5139             IV parno = 0;
5140             SV* sv_dat = HeVAL(temphe);
5141             I32 *nums = (I32*)SvPVX(sv_dat);
5142             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5143                 if ((I32)(rx->lastparen) >= nums[i] &&
5144                     rx->offs[nums[i]].start != -1 &&
5145                     rx->offs[nums[i]].end != -1)
5146                 {
5147                     parno = nums[i];
5148                     break;
5149                 }
5150             }
5151             if (parno || flags & RXapif_ALL) {
5152                 av_push(av, newSVhek(HeKEY_hek(temphe)));
5153             }
5154         }
5155     }
5156 
5157     return newRV_noinc(MUTABLE_SV(av));
5158 }
5159 
5160 void
5161 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5162 			     SV * const sv)
5163 {
5164     struct regexp *const rx = (struct regexp *)SvANY(r);
5165     char *s = NULL;
5166     I32 i = 0;
5167     I32 s1, t1;
5168 
5169     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5170 
5171     if (!rx->subbeg) {
5172         sv_setsv(sv,&PL_sv_undef);
5173         return;
5174     }
5175     else
5176     if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5177         /* $` */
5178 	i = rx->offs[0].start;
5179 	s = rx->subbeg;
5180     }
5181     else
5182     if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5183         /* $' */
5184 	s = rx->subbeg + rx->offs[0].end;
5185 	i = rx->sublen - rx->offs[0].end;
5186     }
5187     else
5188     if ( 0 <= paren && paren <= (I32)rx->nparens &&
5189         (s1 = rx->offs[paren].start) != -1 &&
5190         (t1 = rx->offs[paren].end) != -1)
5191     {
5192         /* $& $1 ... */
5193         i = t1 - s1;
5194         s = rx->subbeg + s1;
5195     } else {
5196         sv_setsv(sv,&PL_sv_undef);
5197         return;
5198     }
5199     assert(rx->sublen >= (s - rx->subbeg) + i );
5200     if (i >= 0) {
5201         const int oldtainted = PL_tainted;
5202         TAINT_NOT;
5203         sv_setpvn(sv, s, i);
5204         PL_tainted = oldtainted;
5205         if ( (rx->extflags & RXf_CANY_SEEN)
5206             ? (RXp_MATCH_UTF8(rx)
5207                         && (!i || is_utf8_string((U8*)s, i)))
5208             : (RXp_MATCH_UTF8(rx)) )
5209         {
5210             SvUTF8_on(sv);
5211         }
5212         else
5213             SvUTF8_off(sv);
5214         if (PL_tainting) {
5215             if (RXp_MATCH_TAINTED(rx)) {
5216                 if (SvTYPE(sv) >= SVt_PVMG) {
5217                     MAGIC* const mg = SvMAGIC(sv);
5218                     MAGIC* mgt;
5219                     PL_tainted = 1;
5220                     SvMAGIC_set(sv, mg->mg_moremagic);
5221                     SvTAINT(sv);
5222                     if ((mgt = SvMAGIC(sv))) {
5223                         mg->mg_moremagic = mgt;
5224                         SvMAGIC_set(sv, mg);
5225                     }
5226                 } else {
5227                     PL_tainted = 1;
5228                     SvTAINT(sv);
5229                 }
5230             } else
5231                 SvTAINTED_off(sv);
5232         }
5233     } else {
5234         sv_setsv(sv,&PL_sv_undef);
5235         return;
5236     }
5237 }
5238 
5239 void
5240 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5241 							 SV const * const value)
5242 {
5243     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5244 
5245     PERL_UNUSED_ARG(rx);
5246     PERL_UNUSED_ARG(paren);
5247     PERL_UNUSED_ARG(value);
5248 
5249     if (!PL_localizing)
5250         Perl_croak(aTHX_ "%s", PL_no_modify);
5251 }
5252 
5253 I32
5254 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
5255                               const I32 paren)
5256 {
5257     struct regexp *const rx = (struct regexp *)SvANY(r);
5258     I32 i;
5259     I32 s1, t1;
5260 
5261     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5262 
5263     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5264 	switch (paren) {
5265       /* $` / ${^PREMATCH} */
5266       case RX_BUFF_IDX_PREMATCH:
5267         if (rx->offs[0].start != -1) {
5268 			i = rx->offs[0].start;
5269 			if (i > 0) {
5270 				s1 = 0;
5271 				t1 = i;
5272 				goto getlen;
5273 			}
5274 	    }
5275         return 0;
5276       /* $' / ${^POSTMATCH} */
5277       case RX_BUFF_IDX_POSTMATCH:
5278 	    if (rx->offs[0].end != -1) {
5279 			i = rx->sublen - rx->offs[0].end;
5280 			if (i > 0) {
5281 				s1 = rx->offs[0].end;
5282 				t1 = rx->sublen;
5283 				goto getlen;
5284 			}
5285 	    }
5286         return 0;
5287       /* $& / ${^MATCH}, $1, $2, ... */
5288       default:
5289 	    if (paren <= (I32)rx->nparens &&
5290             (s1 = rx->offs[paren].start) != -1 &&
5291             (t1 = rx->offs[paren].end) != -1)
5292 	    {
5293             i = t1 - s1;
5294             goto getlen;
5295         } else {
5296             if (ckWARN(WARN_UNINITIALIZED))
5297                 report_uninit((const SV *)sv);
5298             return 0;
5299         }
5300     }
5301   getlen:
5302     if (i > 0 && RXp_MATCH_UTF8(rx)) {
5303         const char * const s = rx->subbeg + s1;
5304         const U8 *ep;
5305         STRLEN el;
5306 
5307         i = t1 - s1;
5308         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5309 			i = el;
5310     }
5311     return i;
5312 }
5313 
5314 SV*
5315 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
5316 {
5317     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
5318 	PERL_UNUSED_ARG(rx);
5319 	if (0)
5320 	    return NULL;
5321 	else
5322 	    return newSVpvs("Regexp");
5323 }
5324 
5325 /* Scans the name of a named buffer from the pattern.
5326  * If flags is REG_RSN_RETURN_NULL returns null.
5327  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5328  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5329  * to the parsed name as looked up in the RExC_paren_names hash.
5330  * If there is an error throws a vFAIL().. type exception.
5331  */
5332 
5333 #define REG_RSN_RETURN_NULL    0
5334 #define REG_RSN_RETURN_NAME    1
5335 #define REG_RSN_RETURN_DATA    2
5336 
5337 STATIC SV*
5338 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5339 {
5340     char *name_start = RExC_parse;
5341 
5342     PERL_ARGS_ASSERT_REG_SCAN_NAME;
5343 
5344     if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5345 	 /* skip IDFIRST by using do...while */
5346 	if (UTF)
5347 	    do {
5348 		RExC_parse += UTF8SKIP(RExC_parse);
5349 	    } while (isALNUM_utf8((U8*)RExC_parse));
5350 	else
5351 	    do {
5352 		RExC_parse++;
5353 	    } while (isALNUM(*RExC_parse));
5354     }
5355 
5356     if ( flags ) {
5357         SV* sv_name
5358 	    = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5359 			     SVs_TEMP | (UTF ? SVf_UTF8 : 0));
5360         if ( flags == REG_RSN_RETURN_NAME)
5361             return sv_name;
5362         else if (flags==REG_RSN_RETURN_DATA) {
5363             HE *he_str = NULL;
5364             SV *sv_dat = NULL;
5365             if ( ! sv_name )      /* should not happen*/
5366                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5367             if (RExC_paren_names)
5368                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5369             if ( he_str )
5370                 sv_dat = HeVAL(he_str);
5371             if ( ! sv_dat )
5372                 vFAIL("Reference to nonexistent named group");
5373             return sv_dat;
5374         }
5375         else {
5376             Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5377         }
5378         /* NOT REACHED */
5379     }
5380     return NULL;
5381 }
5382 
5383 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
5384     int rem=(int)(RExC_end - RExC_parse);                       \
5385     int cut;                                                    \
5386     int num;                                                    \
5387     int iscut=0;                                                \
5388     if (rem>10) {                                               \
5389         rem=10;                                                 \
5390         iscut=1;                                                \
5391     }                                                           \
5392     cut=10-rem;                                                 \
5393     if (RExC_lastparse!=RExC_parse)                             \
5394         PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
5395             rem, RExC_parse,                                    \
5396             cut + 4,                                            \
5397             iscut ? "..." : "<"                                 \
5398         );                                                      \
5399     else                                                        \
5400         PerlIO_printf(Perl_debug_log,"%16s","");                \
5401                                                                 \
5402     if (SIZE_ONLY)                                              \
5403        num = RExC_size + 1;                                     \
5404     else                                                        \
5405        num=REG_NODE_NUM(RExC_emit);                             \
5406     if (RExC_lastnum!=num)                                      \
5407        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
5408     else                                                        \
5409        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
5410     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
5411         (int)((depth*2)), "",                                   \
5412         (funcname)                                              \
5413     );                                                          \
5414     RExC_lastnum=num;                                           \
5415     RExC_lastparse=RExC_parse;                                  \
5416 })
5417 
5418 
5419 
5420 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
5421     DEBUG_PARSE_MSG((funcname));                            \
5422     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
5423 })
5424 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
5425     DEBUG_PARSE_MSG((funcname));                            \
5426     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
5427 })
5428 /*
5429  - reg - regular expression, i.e. main body or parenthesized thing
5430  *
5431  * Caller must absorb opening parenthesis.
5432  *
5433  * Combining parenthesis handling with the base level of regular expression
5434  * is a trifle forced, but the need to tie the tails of the branches to what
5435  * follows makes it hard to avoid.
5436  */
5437 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
5438 #ifdef DEBUGGING
5439 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
5440 #else
5441 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
5442 #endif
5443 
5444 STATIC regnode *
5445 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
5446     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
5447 {
5448     dVAR;
5449     register regnode *ret;		/* Will be the head of the group. */
5450     register regnode *br;
5451     register regnode *lastbr;
5452     register regnode *ender = NULL;
5453     register I32 parno = 0;
5454     I32 flags;
5455     U32 oregflags = RExC_flags;
5456     bool have_branch = 0;
5457     bool is_open = 0;
5458     I32 freeze_paren = 0;
5459     I32 after_freeze = 0;
5460 
5461     /* for (?g), (?gc), and (?o) warnings; warning
5462        about (?c) will warn about (?g) -- japhy    */
5463 
5464 #define WASTED_O  0x01
5465 #define WASTED_G  0x02
5466 #define WASTED_C  0x04
5467 #define WASTED_GC (0x02|0x04)
5468     I32 wastedflags = 0x00;
5469 
5470     char * parse_start = RExC_parse; /* MJD */
5471     char * const oregcomp_parse = RExC_parse;
5472 
5473     GET_RE_DEBUG_FLAGS_DECL;
5474 
5475     PERL_ARGS_ASSERT_REG;
5476     DEBUG_PARSE("reg ");
5477 
5478     *flagp = 0;				/* Tentatively. */
5479 
5480 
5481     /* Make an OPEN node, if parenthesized. */
5482     if (paren) {
5483         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
5484 	    char *start_verb = RExC_parse;
5485 	    STRLEN verb_len = 0;
5486 	    char *start_arg = NULL;
5487 	    unsigned char op = 0;
5488 	    int argok = 1;
5489 	    int internal_argval = 0; /* internal_argval is only useful if !argok */
5490 	    while ( *RExC_parse && *RExC_parse != ')' ) {
5491 	        if ( *RExC_parse == ':' ) {
5492 	            start_arg = RExC_parse + 1;
5493 	            break;
5494 	        }
5495 	        RExC_parse++;
5496 	    }
5497 	    ++start_verb;
5498 	    verb_len = RExC_parse - start_verb;
5499 	    if ( start_arg ) {
5500 	        RExC_parse++;
5501 	        while ( *RExC_parse && *RExC_parse != ')' )
5502 	            RExC_parse++;
5503 	        if ( *RExC_parse != ')' )
5504 	            vFAIL("Unterminated verb pattern argument");
5505 	        if ( RExC_parse == start_arg )
5506 	            start_arg = NULL;
5507 	    } else {
5508 	        if ( *RExC_parse != ')' )
5509 	            vFAIL("Unterminated verb pattern");
5510 	    }
5511 
5512 	    switch ( *start_verb ) {
5513             case 'A':  /* (*ACCEPT) */
5514                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
5515 		    op = ACCEPT;
5516 		    internal_argval = RExC_nestroot;
5517 		}
5518 		break;
5519             case 'C':  /* (*COMMIT) */
5520                 if ( memEQs(start_verb,verb_len,"COMMIT") )
5521                     op = COMMIT;
5522                 break;
5523             case 'F':  /* (*FAIL) */
5524                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
5525 		    op = OPFAIL;
5526 		    argok = 0;
5527 		}
5528 		break;
5529             case ':':  /* (*:NAME) */
5530 	    case 'M':  /* (*MARK:NAME) */
5531 	        if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
5532                     op = MARKPOINT;
5533                     argok = -1;
5534                 }
5535                 break;
5536             case 'P':  /* (*PRUNE) */
5537                 if ( memEQs(start_verb,verb_len,"PRUNE") )
5538                     op = PRUNE;
5539                 break;
5540             case 'S':   /* (*SKIP) */
5541                 if ( memEQs(start_verb,verb_len,"SKIP") )
5542                     op = SKIP;
5543                 break;
5544             case 'T':  /* (*THEN) */
5545                 /* [19:06] <TimToady> :: is then */
5546                 if ( memEQs(start_verb,verb_len,"THEN") ) {
5547                     op = CUTGROUP;
5548                     RExC_seen |= REG_SEEN_CUTGROUP;
5549                 }
5550                 break;
5551 	    }
5552 	    if ( ! op ) {
5553 	        RExC_parse++;
5554 	        vFAIL3("Unknown verb pattern '%.*s'",
5555 	            verb_len, start_verb);
5556 	    }
5557 	    if ( argok ) {
5558                 if ( start_arg && internal_argval ) {
5559 	            vFAIL3("Verb pattern '%.*s' may not have an argument",
5560 	                verb_len, start_verb);
5561 	        } else if ( argok < 0 && !start_arg ) {
5562                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
5563 	                verb_len, start_verb);
5564 	        } else {
5565 	            ret = reganode(pRExC_state, op, internal_argval);
5566 	            if ( ! internal_argval && ! SIZE_ONLY ) {
5567                         if (start_arg) {
5568                             SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
5569                             ARG(ret) = add_data( pRExC_state, 1, "S" );
5570                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
5571                             ret->flags = 0;
5572                         } else {
5573                             ret->flags = 1;
5574                         }
5575                     }
5576 	        }
5577 	        if (!internal_argval)
5578 	            RExC_seen |= REG_SEEN_VERBARG;
5579 	    } else if ( start_arg ) {
5580 	        vFAIL3("Verb pattern '%.*s' may not have an argument",
5581 	                verb_len, start_verb);
5582 	    } else {
5583 	        ret = reg_node(pRExC_state, op);
5584 	    }
5585 	    nextchar(pRExC_state);
5586 	    return ret;
5587         } else
5588 	if (*RExC_parse == '?') { /* (?...) */
5589 	    bool is_logical = 0;
5590 	    const char * const seqstart = RExC_parse;
5591 
5592 	    RExC_parse++;
5593 	    paren = *RExC_parse++;
5594 	    ret = NULL;			/* For look-ahead/behind. */
5595 	    switch (paren) {
5596 
5597 	    case 'P':	/* (?P...) variants for those used to PCRE/Python */
5598 	        paren = *RExC_parse++;
5599 		if ( paren == '<')         /* (?P<...>) named capture */
5600 		    goto named_capture;
5601                 else if (paren == '>') {   /* (?P>name) named recursion */
5602                     goto named_recursion;
5603                 }
5604                 else if (paren == '=') {   /* (?P=...)  named backref */
5605                     /* this pretty much dupes the code for \k<NAME> in regatom(), if
5606                        you change this make sure you change that */
5607                     char* name_start = RExC_parse;
5608 		    U32 num = 0;
5609                     SV *sv_dat = reg_scan_name(pRExC_state,
5610                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5611                     if (RExC_parse == name_start || *RExC_parse != ')')
5612                         vFAIL2("Sequence %.3s... not terminated",parse_start);
5613 
5614                     if (!SIZE_ONLY) {
5615                         num = add_data( pRExC_state, 1, "S" );
5616                         RExC_rxi->data->data[num]=(void*)sv_dat;
5617                         SvREFCNT_inc_simple_void(sv_dat);
5618                     }
5619                     RExC_sawback = 1;
5620                     ret = reganode(pRExC_state,
5621                     	   (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
5622                     	   num);
5623                     *flagp |= HASWIDTH;
5624 
5625                     Set_Node_Offset(ret, parse_start+1);
5626                     Set_Node_Cur_Length(ret); /* MJD */
5627 
5628                     nextchar(pRExC_state);
5629                     return ret;
5630                 }
5631                 RExC_parse++;
5632 		vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5633 		/*NOTREACHED*/
5634             case '<':           /* (?<...) */
5635 		if (*RExC_parse == '!')
5636 		    paren = ',';
5637 		else if (*RExC_parse != '=')
5638               named_capture:
5639 		{               /* (?<...>) */
5640 		    char *name_start;
5641 		    SV *svname;
5642 		    paren= '>';
5643             case '\'':          /* (?'...') */
5644     		    name_start= RExC_parse;
5645     		    svname = reg_scan_name(pRExC_state,
5646     		        SIZE_ONLY ?  /* reverse test from the others */
5647     		        REG_RSN_RETURN_NAME :
5648     		        REG_RSN_RETURN_NULL);
5649 		    if (RExC_parse == name_start) {
5650 		        RExC_parse++;
5651 		        vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5652 		        /*NOTREACHED*/
5653                     }
5654 		    if (*RExC_parse != paren)
5655 		        vFAIL2("Sequence (?%c... not terminated",
5656 		            paren=='>' ? '<' : paren);
5657 		    if (SIZE_ONLY) {
5658 			HE *he_str;
5659 			SV *sv_dat = NULL;
5660                         if (!svname) /* shouldnt happen */
5661                             Perl_croak(aTHX_
5662                                 "panic: reg_scan_name returned NULL");
5663                         if (!RExC_paren_names) {
5664                             RExC_paren_names= newHV();
5665                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
5666 #ifdef DEBUGGING
5667                             RExC_paren_name_list= newAV();
5668                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
5669 #endif
5670                         }
5671                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
5672                         if ( he_str )
5673                             sv_dat = HeVAL(he_str);
5674                         if ( ! sv_dat ) {
5675                             /* croak baby croak */
5676                             Perl_croak(aTHX_
5677                                 "panic: paren_name hash element allocation failed");
5678                         } else if ( SvPOK(sv_dat) ) {
5679                             /* (?|...) can mean we have dupes so scan to check
5680                                its already been stored. Maybe a flag indicating
5681                                we are inside such a construct would be useful,
5682                                but the arrays are likely to be quite small, so
5683                                for now we punt -- dmq */
5684                             IV count = SvIV(sv_dat);
5685                             I32 *pv = (I32*)SvPVX(sv_dat);
5686                             IV i;
5687                             for ( i = 0 ; i < count ; i++ ) {
5688                                 if ( pv[i] == RExC_npar ) {
5689                                     count = 0;
5690                                     break;
5691                                 }
5692                             }
5693                             if ( count ) {
5694                                 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
5695                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
5696                                 pv[count] = RExC_npar;
5697                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
5698                             }
5699                         } else {
5700                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
5701                             sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5702                             SvIOK_on(sv_dat);
5703                             SvIV_set(sv_dat, 1);
5704                         }
5705 #ifdef DEBUGGING
5706                         if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5707                             SvREFCNT_dec(svname);
5708 #endif
5709 
5710                         /*sv_dump(sv_dat);*/
5711                     }
5712                     nextchar(pRExC_state);
5713 		    paren = 1;
5714 		    goto capturing_parens;
5715 		}
5716                 RExC_seen |= REG_SEEN_LOOKBEHIND;
5717 		RExC_parse++;
5718 	    case '=':           /* (?=...) */
5719 		RExC_seen_zerolen++;
5720 			break;
5721 	    case '!':           /* (?!...) */
5722 		RExC_seen_zerolen++;
5723 	        if (*RExC_parse == ')') {
5724 	            ret=reg_node(pRExC_state, OPFAIL);
5725 	            nextchar(pRExC_state);
5726 	            return ret;
5727 	        }
5728 	        break;
5729 	    case '|':           /* (?|...) */
5730 	        /* branch reset, behave like a (?:...) except that
5731 	           buffers in alternations share the same numbers */
5732 	        paren = ':';
5733 	        after_freeze = freeze_paren = RExC_npar;
5734 	        break;
5735 	    case ':':           /* (?:...) */
5736 	    case '>':           /* (?>...) */
5737 		break;
5738 	    case '$':           /* (?$...) */
5739 	    case '@':           /* (?@...) */
5740 		vFAIL2("Sequence (?%c...) not implemented", (int)paren);
5741 		break;
5742 	    case '#':           /* (?#...) */
5743 		while (*RExC_parse && *RExC_parse != ')')
5744 		    RExC_parse++;
5745 		if (*RExC_parse != ')')
5746 		    FAIL("Sequence (?#... not terminated");
5747 		nextchar(pRExC_state);
5748 		*flagp = TRYAGAIN;
5749 		return NULL;
5750 	    case '0' :           /* (?0) */
5751 	    case 'R' :           /* (?R) */
5752 		if (*RExC_parse != ')')
5753 		    FAIL("Sequence (?R) not terminated");
5754 		ret = reg_node(pRExC_state, GOSTART);
5755 		*flagp |= POSTPONED;
5756 		nextchar(pRExC_state);
5757 		return ret;
5758 		/*notreached*/
5759             { /* named and numeric backreferences */
5760                 I32 num;
5761             case '&':            /* (?&NAME) */
5762                 parse_start = RExC_parse - 1;
5763               named_recursion:
5764                 {
5765     		    SV *sv_dat = reg_scan_name(pRExC_state,
5766     		        SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5767     		     num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5768                 }
5769                 goto gen_recurse_regop;
5770                 /* NOT REACHED */
5771             case '+':
5772                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5773                     RExC_parse++;
5774                     vFAIL("Illegal pattern");
5775                 }
5776                 goto parse_recursion;
5777                 /* NOT REACHED*/
5778             case '-': /* (?-1) */
5779                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5780                     RExC_parse--; /* rewind to let it be handled later */
5781                     goto parse_flags;
5782                 }
5783                 /*FALLTHROUGH */
5784             case '1': case '2': case '3': case '4': /* (?1) */
5785 	    case '5': case '6': case '7': case '8': case '9':
5786 	        RExC_parse--;
5787               parse_recursion:
5788 		num = atoi(RExC_parse);
5789   	        parse_start = RExC_parse - 1; /* MJD */
5790 	        if (*RExC_parse == '-')
5791 	            RExC_parse++;
5792 		while (isDIGIT(*RExC_parse))
5793 			RExC_parse++;
5794 	        if (*RExC_parse!=')')
5795 	            vFAIL("Expecting close bracket");
5796 
5797               gen_recurse_regop:
5798                 if ( paren == '-' ) {
5799                     /*
5800                     Diagram of capture buffer numbering.
5801                     Top line is the normal capture buffer numbers
5802                     Botton line is the negative indexing as from
5803                     the X (the (?-2))
5804 
5805                     +   1 2    3 4 5 X          6 7
5806                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5807                     -   5 4    3 2 1 X          x x
5808 
5809                     */
5810                     num = RExC_npar + num;
5811                     if (num < 1)  {
5812                         RExC_parse++;
5813                         vFAIL("Reference to nonexistent group");
5814                     }
5815                 } else if ( paren == '+' ) {
5816                     num = RExC_npar + num - 1;
5817                 }
5818 
5819                 ret = reganode(pRExC_state, GOSUB, num);
5820                 if (!SIZE_ONLY) {
5821 		    if (num > (I32)RExC_rx->nparens) {
5822 			RExC_parse++;
5823 			vFAIL("Reference to nonexistent group");
5824 	            }
5825 	            ARG2L_SET( ret, RExC_recurse_count++);
5826                     RExC_emit++;
5827 		    DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5828 			"Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
5829 		} else {
5830 		    RExC_size++;
5831     		}
5832     		RExC_seen |= REG_SEEN_RECURSE;
5833                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
5834 		Set_Node_Offset(ret, parse_start); /* MJD */
5835 
5836                 *flagp |= POSTPONED;
5837                 nextchar(pRExC_state);
5838                 return ret;
5839             } /* named and numeric backreferences */
5840             /* NOT REACHED */
5841 
5842 	    case '?':           /* (??...) */
5843 		is_logical = 1;
5844 		if (*RExC_parse != '{') {
5845 		    RExC_parse++;
5846 		    vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5847 		    /*NOTREACHED*/
5848 		}
5849 		*flagp |= POSTPONED;
5850 		paren = *RExC_parse++;
5851 		/* FALL THROUGH */
5852 	    case '{':           /* (?{...}) */
5853 	    {
5854 		I32 count = 1;
5855 		U32 n = 0;
5856 		char c;
5857 		char *s = RExC_parse;
5858 
5859 		RExC_seen_zerolen++;
5860 		RExC_seen |= REG_SEEN_EVAL;
5861 		while (count && (c = *RExC_parse)) {
5862 		    if (c == '\\') {
5863 			if (RExC_parse[1])
5864 			    RExC_parse++;
5865 		    }
5866 		    else if (c == '{')
5867 			count++;
5868 		    else if (c == '}')
5869 			count--;
5870 		    RExC_parse++;
5871 		}
5872 		if (*RExC_parse != ')') {
5873 		    RExC_parse = s;
5874 		    vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5875 		}
5876 		if (!SIZE_ONLY) {
5877 		    PAD *pad;
5878 		    OP_4tree *sop, *rop;
5879 		    SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
5880 
5881 		    ENTER;
5882 		    Perl_save_re_context(aTHX);
5883 		    rop = sv_compile_2op(sv, &sop, "re", &pad);
5884 		    sop->op_private |= OPpREFCOUNTED;
5885 		    /* re_dup will OpREFCNT_inc */
5886 		    OpREFCNT_set(sop, 1);
5887 		    LEAVE;
5888 
5889 		    n = add_data(pRExC_state, 3, "nop");
5890 		    RExC_rxi->data->data[n] = (void*)rop;
5891 		    RExC_rxi->data->data[n+1] = (void*)sop;
5892 		    RExC_rxi->data->data[n+2] = (void*)pad;
5893 		    SvREFCNT_dec(sv);
5894 		}
5895 		else {						/* First pass */
5896 		    if (PL_reginterp_cnt < ++RExC_seen_evals
5897 			&& IN_PERL_RUNTIME)
5898 			/* No compiled RE interpolated, has runtime
5899 			   components ===> unsafe.  */
5900 			FAIL("Eval-group not allowed at runtime, use re 'eval'");
5901 		    if (PL_tainting && PL_tainted)
5902 			FAIL("Eval-group in insecure regular expression");
5903 #if PERL_VERSION > 8
5904 		    if (IN_PERL_COMPILETIME)
5905 			PL_cv_has_eval = 1;
5906 #endif
5907 		}
5908 
5909 		nextchar(pRExC_state);
5910 		if (is_logical) {
5911 		    ret = reg_node(pRExC_state, LOGICAL);
5912 		    if (!SIZE_ONLY)
5913 			ret->flags = 2;
5914                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
5915                     /* deal with the length of this later - MJD */
5916 		    return ret;
5917 		}
5918 		ret = reganode(pRExC_state, EVAL, n);
5919 		Set_Node_Length(ret, RExC_parse - parse_start + 1);
5920 		Set_Node_Offset(ret, parse_start);
5921 		return ret;
5922 	    }
5923 	    case '(':           /* (?(?{...})...) and (?(?=...)...) */
5924 	    {
5925 	        int is_define= 0;
5926 		if (RExC_parse[0] == '?') {        /* (?(?...)) */
5927 		    if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5928 			|| RExC_parse[1] == '<'
5929 			|| RExC_parse[1] == '{') { /* Lookahead or eval. */
5930 			I32 flag;
5931 
5932 			ret = reg_node(pRExC_state, LOGICAL);
5933 			if (!SIZE_ONLY)
5934 			    ret->flags = 1;
5935                         REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
5936 			goto insert_if;
5937 		    }
5938 		}
5939 		else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
5940 		         || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5941 	        {
5942 	            char ch = RExC_parse[0] == '<' ? '>' : '\'';
5943 	            char *name_start= RExC_parse++;
5944 	            U32 num = 0;
5945 	            SV *sv_dat=reg_scan_name(pRExC_state,
5946 	                SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5947 	            if (RExC_parse == name_start || *RExC_parse != ch)
5948                         vFAIL2("Sequence (?(%c... not terminated",
5949                             (ch == '>' ? '<' : ch));
5950                     RExC_parse++;
5951 	            if (!SIZE_ONLY) {
5952                         num = add_data( pRExC_state, 1, "S" );
5953                         RExC_rxi->data->data[num]=(void*)sv_dat;
5954                         SvREFCNT_inc_simple_void(sv_dat);
5955                     }
5956                     ret = reganode(pRExC_state,NGROUPP,num);
5957                     goto insert_if_check_paren;
5958 		}
5959 		else if (RExC_parse[0] == 'D' &&
5960 		         RExC_parse[1] == 'E' &&
5961 		         RExC_parse[2] == 'F' &&
5962 		         RExC_parse[3] == 'I' &&
5963 		         RExC_parse[4] == 'N' &&
5964 		         RExC_parse[5] == 'E')
5965 		{
5966 		    ret = reganode(pRExC_state,DEFINEP,0);
5967 		    RExC_parse +=6 ;
5968 		    is_define = 1;
5969 		    goto insert_if_check_paren;
5970 		}
5971 		else if (RExC_parse[0] == 'R') {
5972 		    RExC_parse++;
5973 		    parno = 0;
5974 		    if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5975 		        parno = atoi(RExC_parse++);
5976 		        while (isDIGIT(*RExC_parse))
5977 			    RExC_parse++;
5978 		    } else if (RExC_parse[0] == '&') {
5979 		        SV *sv_dat;
5980 		        RExC_parse++;
5981 		        sv_dat = reg_scan_name(pRExC_state,
5982     		            SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5983     		        parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5984 		    }
5985 		    ret = reganode(pRExC_state,INSUBP,parno);
5986 		    goto insert_if_check_paren;
5987 		}
5988 		else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5989                     /* (?(1)...) */
5990 		    char c;
5991 		    parno = atoi(RExC_parse++);
5992 
5993 		    while (isDIGIT(*RExC_parse))
5994 			RExC_parse++;
5995                     ret = reganode(pRExC_state, GROUPP, parno);
5996 
5997                  insert_if_check_paren:
5998 		    if ((c = *nextchar(pRExC_state)) != ')')
5999 			vFAIL("Switch condition not recognized");
6000 		  insert_if:
6001                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
6002                     br = regbranch(pRExC_state, &flags, 1,depth+1);
6003 		    if (br == NULL)
6004 			br = reganode(pRExC_state, LONGJMP, 0);
6005 		    else
6006                         REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
6007 		    c = *nextchar(pRExC_state);
6008 		    if (flags&HASWIDTH)
6009 			*flagp |= HASWIDTH;
6010 		    if (c == '|') {
6011 		        if (is_define)
6012 		            vFAIL("(?(DEFINE)....) does not allow branches");
6013 			lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
6014                         regbranch(pRExC_state, &flags, 1,depth+1);
6015                         REGTAIL(pRExC_state, ret, lastbr);
6016 		 	if (flags&HASWIDTH)
6017 			    *flagp |= HASWIDTH;
6018 			c = *nextchar(pRExC_state);
6019 		    }
6020 		    else
6021 			lastbr = NULL;
6022 		    if (c != ')')
6023 			vFAIL("Switch (?(condition)... contains too many branches");
6024 		    ender = reg_node(pRExC_state, TAIL);
6025                     REGTAIL(pRExC_state, br, ender);
6026 		    if (lastbr) {
6027                         REGTAIL(pRExC_state, lastbr, ender);
6028                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
6029 		    }
6030 		    else
6031                         REGTAIL(pRExC_state, ret, ender);
6032                     RExC_size++; /* XXX WHY do we need this?!!
6033                                     For large programs it seems to be required
6034                                     but I can't figure out why. -- dmq*/
6035 		    return ret;
6036 		}
6037 		else {
6038 		    vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
6039 		}
6040 	    }
6041             case 0:
6042 		RExC_parse--; /* for vFAIL to print correctly */
6043                 vFAIL("Sequence (? incomplete");
6044                 break;
6045 	    default:
6046 	        --RExC_parse;
6047 	        parse_flags:      /* (?i) */
6048 	    {
6049                 U32 posflags = 0, negflags = 0;
6050 	        U32 *flagsp = &posflags;
6051 
6052 		while (*RExC_parse) {
6053 		    /* && strchr("iogcmsx", *RExC_parse) */
6054 		    /* (?g), (?gc) and (?o) are useless here
6055 		       and must be globally applied -- japhy */
6056                     switch (*RExC_parse) {
6057 	            CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
6058                     case ONCE_PAT_MOD: /* 'o' */
6059                     case GLOBAL_PAT_MOD: /* 'g' */
6060 			if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6061 			    const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
6062 			    if (! (wastedflags & wflagbit) ) {
6063 				wastedflags |= wflagbit;
6064 				vWARN5(
6065 				    RExC_parse + 1,
6066 				    "Useless (%s%c) - %suse /%c modifier",
6067 				    flagsp == &negflags ? "?-" : "?",
6068 				    *RExC_parse,
6069 				    flagsp == &negflags ? "don't " : "",
6070 				    *RExC_parse
6071 				);
6072 			    }
6073 			}
6074 			break;
6075 
6076 		    case CONTINUE_PAT_MOD: /* 'c' */
6077 			if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6078 			    if (! (wastedflags & WASTED_C) ) {
6079 				wastedflags |= WASTED_GC;
6080 				vWARN3(
6081 				    RExC_parse + 1,
6082 				    "Useless (%sc) - %suse /gc modifier",
6083 				    flagsp == &negflags ? "?-" : "?",
6084 				    flagsp == &negflags ? "don't " : ""
6085 				);
6086 			    }
6087 			}
6088 			break;
6089 	            case KEEPCOPY_PAT_MOD: /* 'p' */
6090                         if (flagsp == &negflags) {
6091                             if (SIZE_ONLY)
6092                                 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
6093                         } else {
6094                             *flagsp |= RXf_PMf_KEEPCOPY;
6095                         }
6096 	                break;
6097                     case '-':
6098                         if (flagsp == &negflags) {
6099                             RExC_parse++;
6100 		            vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6101 		            /*NOTREACHED*/
6102 		        }
6103 			flagsp = &negflags;
6104 		        wastedflags = 0;  /* reset so (?g-c) warns twice */
6105 		        break;
6106                     case ':':
6107 		        paren = ':';
6108 		        /*FALLTHROUGH*/
6109                     case ')':
6110                         RExC_flags |= posflags;
6111                         RExC_flags &= ~negflags;
6112                         if (paren != ':') {
6113                             oregflags |= posflags;
6114                             oregflags &= ~negflags;
6115                         }
6116                         nextchar(pRExC_state);
6117 		        if (paren != ':') {
6118 		            *flagp = TRYAGAIN;
6119 		            return NULL;
6120 		        } else {
6121                             ret = NULL;
6122 		            goto parse_rest;
6123 		        }
6124 		        /*NOTREACHED*/
6125                     default:
6126 		        RExC_parse++;
6127 		        vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6128 		        /*NOTREACHED*/
6129                     }
6130 		    ++RExC_parse;
6131 		}
6132 	    }} /* one for the default block, one for the switch */
6133 	}
6134 	else {                  /* (...) */
6135 	  capturing_parens:
6136 	    parno = RExC_npar;
6137 	    RExC_npar++;
6138 
6139 	    ret = reganode(pRExC_state, OPEN, parno);
6140 	    if (!SIZE_ONLY ){
6141 	        if (!RExC_nestroot)
6142 	            RExC_nestroot = parno;
6143 	        if (RExC_seen & REG_SEEN_RECURSE
6144 	            && !RExC_open_parens[parno-1])
6145 	        {
6146 		    DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6147 			"Setting open paren #%"IVdf" to %d\n",
6148 			(IV)parno, REG_NODE_NUM(ret)));
6149 	            RExC_open_parens[parno-1]= ret;
6150 	        }
6151 	    }
6152             Set_Node_Length(ret, 1); /* MJD */
6153             Set_Node_Offset(ret, RExC_parse); /* MJD */
6154 	    is_open = 1;
6155 	}
6156     }
6157     else                        /* ! paren */
6158 	ret = NULL;
6159 
6160    parse_rest:
6161     /* Pick up the branches, linking them together. */
6162     parse_start = RExC_parse;   /* MJD */
6163     br = regbranch(pRExC_state, &flags, 1,depth+1);
6164 
6165     if (freeze_paren) {
6166         if (RExC_npar > after_freeze)
6167             after_freeze = RExC_npar;
6168         RExC_npar = freeze_paren;
6169     }
6170 
6171     /*     branch_len = (paren != 0); */
6172 
6173     if (br == NULL)
6174 	return(NULL);
6175     if (*RExC_parse == '|') {
6176 	if (!SIZE_ONLY && RExC_extralen) {
6177 	    reginsert(pRExC_state, BRANCHJ, br, depth+1);
6178 	}
6179 	else {                  /* MJD */
6180 	    reginsert(pRExC_state, BRANCH, br, depth+1);
6181             Set_Node_Length(br, paren != 0);
6182             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
6183         }
6184 	have_branch = 1;
6185 	if (SIZE_ONLY)
6186 	    RExC_extralen += 1;		/* For BRANCHJ-BRANCH. */
6187     }
6188     else if (paren == ':') {
6189 	*flagp |= flags&SIMPLE;
6190     }
6191     if (is_open) {				/* Starts with OPEN. */
6192         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
6193     }
6194     else if (paren != '?')		/* Not Conditional */
6195 	ret = br;
6196     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6197     lastbr = br;
6198     while (*RExC_parse == '|') {
6199 	if (!SIZE_ONLY && RExC_extralen) {
6200 	    ender = reganode(pRExC_state, LONGJMP,0);
6201             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
6202 	}
6203 	if (SIZE_ONLY)
6204 	    RExC_extralen += 2;		/* Account for LONGJMP. */
6205 	nextchar(pRExC_state);
6206 	if (freeze_paren) {
6207 	    if (RExC_npar > after_freeze)
6208 	        after_freeze = RExC_npar;
6209             RExC_npar = freeze_paren;
6210         }
6211         br = regbranch(pRExC_state, &flags, 0, depth+1);
6212 
6213 	if (br == NULL)
6214 	    return(NULL);
6215         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
6216 	lastbr = br;
6217 	*flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6218     }
6219 
6220     if (have_branch || paren != ':') {
6221 	/* Make a closing node, and hook it on the end. */
6222 	switch (paren) {
6223 	case ':':
6224 	    ender = reg_node(pRExC_state, TAIL);
6225 	    break;
6226 	case 1:
6227 	    ender = reganode(pRExC_state, CLOSE, parno);
6228 	    if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
6229 		DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6230 			"Setting close paren #%"IVdf" to %d\n",
6231 			(IV)parno, REG_NODE_NUM(ender)));
6232 	        RExC_close_parens[parno-1]= ender;
6233 	        if (RExC_nestroot == parno)
6234 	            RExC_nestroot = 0;
6235 	    }
6236             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
6237             Set_Node_Length(ender,1); /* MJD */
6238 	    break;
6239 	case '<':
6240 	case ',':
6241 	case '=':
6242 	case '!':
6243 	    *flagp &= ~HASWIDTH;
6244 	    /* FALL THROUGH */
6245 	case '>':
6246 	    ender = reg_node(pRExC_state, SUCCEED);
6247 	    break;
6248 	case 0:
6249 	    ender = reg_node(pRExC_state, END);
6250 	    if (!SIZE_ONLY) {
6251                 assert(!RExC_opend); /* there can only be one! */
6252                 RExC_opend = ender;
6253             }
6254 	    break;
6255 	}
6256         REGTAIL(pRExC_state, lastbr, ender);
6257 
6258 	if (have_branch && !SIZE_ONLY) {
6259 	    if (depth==1)
6260 	        RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6261 
6262 	    /* Hook the tails of the branches to the closing node. */
6263 	    for (br = ret; br; br = regnext(br)) {
6264 		const U8 op = PL_regkind[OP(br)];
6265 		if (op == BRANCH) {
6266                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
6267 		}
6268 		else if (op == BRANCHJ) {
6269                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
6270 		}
6271 	    }
6272 	}
6273     }
6274 
6275     {
6276         const char *p;
6277         static const char parens[] = "=!<,>";
6278 
6279 	if (paren && (p = strchr(parens, paren))) {
6280 	    U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
6281 	    int flag = (p - parens) > 1;
6282 
6283 	    if (paren == '>')
6284 		node = SUSPEND, flag = 0;
6285 	    reginsert(pRExC_state, node,ret, depth+1);
6286 	    Set_Node_Cur_Length(ret);
6287 	    Set_Node_Offset(ret, parse_start + 1);
6288 	    ret->flags = flag;
6289             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
6290 	}
6291     }
6292 
6293     /* Check for proper termination. */
6294     if (paren) {
6295 	RExC_flags = oregflags;
6296 	if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
6297 	    RExC_parse = oregcomp_parse;
6298 	    vFAIL("Unmatched (");
6299 	}
6300     }
6301     else if (!paren && RExC_parse < RExC_end) {
6302 	if (*RExC_parse == ')') {
6303 	    RExC_parse++;
6304 	    vFAIL("Unmatched )");
6305 	}
6306 	else
6307 	    FAIL("Junk on end of regexp");	/* "Can't happen". */
6308 	/* NOTREACHED */
6309     }
6310     if (after_freeze)
6311         RExC_npar = after_freeze;
6312     return(ret);
6313 }
6314 
6315 /*
6316  - regbranch - one alternative of an | operator
6317  *
6318  * Implements the concatenation operator.
6319  */
6320 STATIC regnode *
6321 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
6322 {
6323     dVAR;
6324     register regnode *ret;
6325     register regnode *chain = NULL;
6326     register regnode *latest;
6327     I32 flags = 0, c = 0;
6328     GET_RE_DEBUG_FLAGS_DECL;
6329 
6330     PERL_ARGS_ASSERT_REGBRANCH;
6331 
6332     DEBUG_PARSE("brnc");
6333 
6334     if (first)
6335 	ret = NULL;
6336     else {
6337 	if (!SIZE_ONLY && RExC_extralen)
6338 	    ret = reganode(pRExC_state, BRANCHJ,0);
6339 	else {
6340 	    ret = reg_node(pRExC_state, BRANCH);
6341             Set_Node_Length(ret, 1);
6342         }
6343     }
6344 
6345     if (!first && SIZE_ONLY)
6346 	RExC_extralen += 1;			/* BRANCHJ */
6347 
6348     *flagp = WORST;			/* Tentatively. */
6349 
6350     RExC_parse--;
6351     nextchar(pRExC_state);
6352     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
6353 	flags &= ~TRYAGAIN;
6354         latest = regpiece(pRExC_state, &flags,depth+1);
6355 	if (latest == NULL) {
6356 	    if (flags & TRYAGAIN)
6357 		continue;
6358 	    return(NULL);
6359 	}
6360 	else if (ret == NULL)
6361 	    ret = latest;
6362 	*flagp |= flags&(HASWIDTH|POSTPONED);
6363 	if (chain == NULL) 	/* First piece. */
6364 	    *flagp |= flags&SPSTART;
6365 	else {
6366 	    RExC_naughty++;
6367             REGTAIL(pRExC_state, chain, latest);
6368 	}
6369 	chain = latest;
6370 	c++;
6371     }
6372     if (chain == NULL) {	/* Loop ran zero times. */
6373 	chain = reg_node(pRExC_state, NOTHING);
6374 	if (ret == NULL)
6375 	    ret = chain;
6376     }
6377     if (c == 1) {
6378 	*flagp |= flags&SIMPLE;
6379     }
6380 
6381     return ret;
6382 }
6383 
6384 /*
6385  - regpiece - something followed by possible [*+?]
6386  *
6387  * Note that the branching code sequences used for ? and the general cases
6388  * of * and + are somewhat optimized:  they use the same NOTHING node as
6389  * both the endmarker for their branch list and the body of the last branch.
6390  * It might seem that this node could be dispensed with entirely, but the
6391  * endmarker role is not redundant.
6392  */
6393 STATIC regnode *
6394 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6395 {
6396     dVAR;
6397     register regnode *ret;
6398     register char op;
6399     register char *next;
6400     I32 flags;
6401     const char * const origparse = RExC_parse;
6402     I32 min;
6403     I32 max = REG_INFTY;
6404     char *parse_start;
6405     const char *maxpos = NULL;
6406     GET_RE_DEBUG_FLAGS_DECL;
6407 
6408     PERL_ARGS_ASSERT_REGPIECE;
6409 
6410     DEBUG_PARSE("piec");
6411 
6412     ret = regatom(pRExC_state, &flags,depth+1);
6413     if (ret == NULL) {
6414 	if (flags & TRYAGAIN)
6415 	    *flagp |= TRYAGAIN;
6416 	return(NULL);
6417     }
6418 
6419     op = *RExC_parse;
6420 
6421     if (op == '{' && regcurly(RExC_parse)) {
6422 	maxpos = NULL;
6423         parse_start = RExC_parse; /* MJD */
6424 	next = RExC_parse + 1;
6425 	while (isDIGIT(*next) || *next == ',') {
6426 	    if (*next == ',') {
6427 		if (maxpos)
6428 		    break;
6429 		else
6430 		    maxpos = next;
6431 	    }
6432 	    next++;
6433 	}
6434 	if (*next == '}') {		/* got one */
6435 	    if (!maxpos)
6436 		maxpos = next;
6437 	    RExC_parse++;
6438 	    min = atoi(RExC_parse);
6439 	    if (*maxpos == ',')
6440 		maxpos++;
6441 	    else
6442 		maxpos = RExC_parse;
6443 	    max = atoi(maxpos);
6444 	    if (!max && *maxpos != '0')
6445 		max = REG_INFTY;		/* meaning "infinity" */
6446 	    else if (max >= REG_INFTY)
6447 		vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
6448 	    RExC_parse = next;
6449 	    nextchar(pRExC_state);
6450 
6451 	do_curly:
6452 	    if ((flags&SIMPLE)) {
6453 		RExC_naughty += 2 + RExC_naughty / 2;
6454 		reginsert(pRExC_state, CURLY, ret, depth+1);
6455                 Set_Node_Offset(ret, parse_start+1); /* MJD */
6456                 Set_Node_Cur_Length(ret);
6457 	    }
6458 	    else {
6459 		regnode * const w = reg_node(pRExC_state, WHILEM);
6460 
6461 		w->flags = 0;
6462                 REGTAIL(pRExC_state, ret, w);
6463 		if (!SIZE_ONLY && RExC_extralen) {
6464 		    reginsert(pRExC_state, LONGJMP,ret, depth+1);
6465 		    reginsert(pRExC_state, NOTHING,ret, depth+1);
6466 		    NEXT_OFF(ret) = 3;	/* Go over LONGJMP. */
6467 		}
6468 		reginsert(pRExC_state, CURLYX,ret, depth+1);
6469                                 /* MJD hk */
6470                 Set_Node_Offset(ret, parse_start+1);
6471                 Set_Node_Length(ret,
6472                                 op == '{' ? (RExC_parse - parse_start) : 1);
6473 
6474 		if (!SIZE_ONLY && RExC_extralen)
6475 		    NEXT_OFF(ret) = 3;	/* Go over NOTHING to LONGJMP. */
6476                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
6477 		if (SIZE_ONLY)
6478 		    RExC_whilem_seen++, RExC_extralen += 3;
6479 		RExC_naughty += 4 + RExC_naughty;	/* compound interest */
6480 	    }
6481 	    ret->flags = 0;
6482 
6483 	    if (min > 0)
6484 		*flagp = WORST;
6485 	    if (max > 0)
6486 		*flagp |= HASWIDTH;
6487 	    if (max < min)
6488 		vFAIL("Can't do {n,m} with n > m");
6489 	    if (!SIZE_ONLY) {
6490 		ARG1_SET(ret, (U16)min);
6491 		ARG2_SET(ret, (U16)max);
6492 	    }
6493 
6494 	    goto nest_check;
6495 	}
6496     }
6497 
6498     if (!ISMULT1(op)) {
6499 	*flagp = flags;
6500 	return(ret);
6501     }
6502 
6503 #if 0				/* Now runtime fix should be reliable. */
6504 
6505     /* if this is reinstated, don't forget to put this back into perldiag:
6506 
6507 	    =item Regexp *+ operand could be empty at {#} in regex m/%s/
6508 
6509 	   (F) The part of the regexp subject to either the * or + quantifier
6510            could match an empty string. The {#} shows in the regular
6511            expression about where the problem was discovered.
6512 
6513     */
6514 
6515     if (!(flags&HASWIDTH) && op != '?')
6516       vFAIL("Regexp *+ operand could be empty");
6517 #endif
6518 
6519     parse_start = RExC_parse;
6520     nextchar(pRExC_state);
6521 
6522     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
6523 
6524     if (op == '*' && (flags&SIMPLE)) {
6525 	reginsert(pRExC_state, STAR, ret, depth+1);
6526 	ret->flags = 0;
6527 	RExC_naughty += 4;
6528     }
6529     else if (op == '*') {
6530 	min = 0;
6531 	goto do_curly;
6532     }
6533     else if (op == '+' && (flags&SIMPLE)) {
6534 	reginsert(pRExC_state, PLUS, ret, depth+1);
6535 	ret->flags = 0;
6536 	RExC_naughty += 3;
6537     }
6538     else if (op == '+') {
6539 	min = 1;
6540 	goto do_curly;
6541     }
6542     else if (op == '?') {
6543 	min = 0; max = 1;
6544 	goto do_curly;
6545     }
6546   nest_check:
6547     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
6548 	ckWARN3reg(RExC_parse,
6549 		   "%.*s matches null string many times",
6550 		   (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
6551 		   origparse);
6552     }
6553 
6554     if (RExC_parse < RExC_end && *RExC_parse == '?') {
6555 	nextchar(pRExC_state);
6556 	reginsert(pRExC_state, MINMOD, ret, depth+1);
6557         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
6558     }
6559 #ifndef REG_ALLOW_MINMOD_SUSPEND
6560     else
6561 #endif
6562     if (RExC_parse < RExC_end && *RExC_parse == '+') {
6563         regnode *ender;
6564         nextchar(pRExC_state);
6565         ender = reg_node(pRExC_state, SUCCEED);
6566         REGTAIL(pRExC_state, ret, ender);
6567         reginsert(pRExC_state, SUSPEND, ret, depth+1);
6568         ret->flags = 0;
6569         ender = reg_node(pRExC_state, TAIL);
6570         REGTAIL(pRExC_state, ret, ender);
6571         /*ret= ender;*/
6572     }
6573 
6574     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
6575 	RExC_parse++;
6576 	vFAIL("Nested quantifiers");
6577     }
6578 
6579     return(ret);
6580 }
6581 
6582 
6583 /* reg_namedseq(pRExC_state,UVp)
6584 
6585    This is expected to be called by a parser routine that has
6586    recognized '\N' and needs to handle the rest. RExC_parse is
6587    expected to point at the first char following the N at the time
6588    of the call.
6589 
6590    The \N may be inside (indicated by valuep not being NULL) or outside a
6591    character class.
6592 
6593    \N may begin either a named sequence, or if outside a character class, mean
6594    to match a non-newline.  For non single-quoted regexes, the tokenizer has
6595    attempted to decide which, and in the case of a named sequence converted it
6596    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
6597    where c1... are the characters in the sequence.  For single-quoted regexes,
6598    the tokenizer passes the \N sequence through unchanged; this code will not
6599    attempt to determine this nor expand those.  The net effect is that if the
6600    beginning of the passed-in pattern isn't '{U+' or there is no '}', it
6601    signals that this \N occurrence means to match a non-newline.
6602 
6603    Only the \N{U+...} form should occur in a character class, for the same
6604    reason that '.' inside a character class means to just match a period: it
6605    just doesn't make sense.
6606 
6607    If valuep is non-null then it is assumed that we are parsing inside
6608    of a charclass definition and the first codepoint in the resolved
6609    string is returned via *valuep and the routine will return NULL.
6610    In this mode if a multichar string is returned from the charnames
6611    handler, a warning will be issued, and only the first char in the
6612    sequence will be examined. If the string returned is zero length
6613    then the value of *valuep is undefined and NON-NULL will
6614    be returned to indicate failure. (This will NOT be a valid pointer
6615    to a regnode.)
6616 
6617    If valuep is null then it is assumed that we are parsing normal text and a
6618    new EXACT node is inserted into the program containing the resolved string,
6619    and a pointer to the new node is returned.  But if the string is zero length
6620    a NOTHING node is emitted instead.
6621 
6622    On success RExC_parse is set to the char following the endbrace.
6623    Parsing failures will generate a fatal error via vFAIL(...)
6624  */
6625 STATIC regnode *
6626 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
6627 {
6628     char * endbrace;    /* '}' following the name */
6629     regnode *ret = NULL;
6630 #ifdef DEBUGGING
6631     char* parse_start = RExC_parse - 2;	    /* points to the '\N' */
6632 #endif
6633     char* p;
6634 
6635     GET_RE_DEBUG_FLAGS_DECL;
6636 
6637     PERL_ARGS_ASSERT_REG_NAMEDSEQ;
6638 
6639     GET_RE_DEBUG_FLAGS;
6640 
6641     /* The [^\n] meaning of \N ignores spaces and comments under the /x
6642      * modifier.  The other meaning does not */
6643     p = (RExC_flags & RXf_PMf_EXTENDED)
6644 	? regwhite( pRExC_state, RExC_parse )
6645 	: RExC_parse;
6646 
6647     /* Disambiguate between \N meaning a named character versus \N meaning
6648      * [^\n].  The former is assumed when it can't be the latter. */
6649     if (*p != '{' || regcurly(p)) {
6650 	RExC_parse = p;
6651 	if (valuep) {
6652 	    /* no bare \N in a charclass */
6653 	    vFAIL("\\N in a character class must be a named character: \\N{...}");
6654 	}
6655 	nextchar(pRExC_state);
6656 	ret = reg_node(pRExC_state, REG_ANY);
6657 	*flagp |= HASWIDTH|SIMPLE;
6658 	RExC_naughty++;
6659 	RExC_parse--;
6660         Set_Node_Length(ret, 1); /* MJD */
6661 	return ret;
6662     }
6663 
6664     /* Here, we have decided it should be a named sequence */
6665 
6666     /* The test above made sure that the next real character is a '{', but
6667      * under the /x modifier, it could be separated by space (or a comment and
6668      * \n) and this is not allowed (for consistency with \x{...} and the
6669      * tokenizer handling of \N{NAME}). */
6670     if (*RExC_parse != '{') {
6671 	vFAIL("Missing braces on \\N{}");
6672     }
6673 
6674     RExC_parse++;	/* Skip past the '{' */
6675 
6676     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
6677 	|| ! (endbrace == RExC_parse		/* nothing between the {} */
6678 	      || (endbrace - RExC_parse >= 2	/* U+ (bad hex is checked below */
6679 		  && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
6680     {
6681 	if (endbrace) RExC_parse = endbrace;	/* position msg's '<--HERE' */
6682 	vFAIL("\\N{NAME} must be resolved by the lexer");
6683     }
6684 
6685     if (endbrace == RExC_parse) {   /* empty: \N{} */
6686 	if (! valuep) {
6687 	    RExC_parse = endbrace + 1;
6688 	    return reg_node(pRExC_state,NOTHING);
6689 	}
6690 
6691 	if (SIZE_ONLY) {
6692 	    ckWARNreg(RExC_parse,
6693 		    "Ignoring zero length \\N{} in character class"
6694 	    );
6695 	    RExC_parse = endbrace + 1;
6696 	}
6697 	*valuep = 0;
6698 	return (regnode *) &RExC_parse; /* Invalid regnode pointer */
6699     }
6700 
6701     RExC_utf8 = 1;	/* named sequences imply Unicode semantics */
6702     RExC_parse += 2;	/* Skip past the 'U+' */
6703 
6704     if (valuep) {   /* In a bracketed char class */
6705 	/* We only pay attention to the first char of
6706 	multichar strings being returned. I kinda wonder
6707 	if this makes sense as it does change the behaviour
6708 	from earlier versions, OTOH that behaviour was broken
6709 	as well. XXX Solution is to recharacterize as
6710 	[rest-of-class]|multi1|multi2... */
6711 
6712 	STRLEN length_of_hex;
6713 	I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6714 	    | PERL_SCAN_DISALLOW_PREFIX
6715 	    | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6716 
6717 	char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
6718 	if (endchar < endbrace) {
6719 	    ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
6720 	}
6721 
6722 	length_of_hex = (STRLEN)(endchar - RExC_parse);
6723 	*valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
6724 
6725 	/* The tokenizer should have guaranteed validity, but it's possible to
6726 	 * bypass it by using single quoting, so check */
6727 	if (length_of_hex == 0
6728 	    || length_of_hex != (STRLEN)(endchar - RExC_parse) )
6729 	{
6730 	    RExC_parse += length_of_hex;	/* Includes all the valid */
6731 	    RExC_parse += (RExC_orig_utf8)	/* point to after 1st invalid */
6732 			    ? UTF8SKIP(RExC_parse)
6733 			    : 1;
6734 	    /* Guard against malformed utf8 */
6735 	    if (RExC_parse >= endchar) RExC_parse = endchar;
6736 	    vFAIL("Invalid hexadecimal number in \\N{U+...}");
6737 	}
6738 
6739 	RExC_parse = endbrace + 1;
6740 	if (endchar == endbrace) return NULL;
6741 
6742         ret = (regnode *) &RExC_parse;	/* Invalid regnode pointer */
6743     }
6744     else {	/* Not a char class */
6745 	char *s;	    /* String to put in generated EXACT node */
6746 	STRLEN len = 0;	    /* Its current length */
6747 	char *endchar;	    /* Points to '.' or '}' ending cur char in the input
6748 			       stream */
6749 
6750 	ret = reg_node(pRExC_state,
6751 			(U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6752 	s= STRING(ret);
6753 
6754 	/* Exact nodes can hold only a U8 length's of text = 255.  Loop through
6755 	 * the input which is of the form now 'c1.c2.c3...}' until find the
6756 	 * ending brace or exeed length 255.  The characters that exceed this
6757 	 * limit are dropped.  The limit could be relaxed should it become
6758 	 * desirable by reparsing this as (?:\N{NAME}), so could generate
6759 	 * multiple EXACT nodes, as is done for just regular input.  But this
6760 	 * is primarily a named character, and not intended to be a huge long
6761 	 * string, so 255 bytes should be good enough */
6762 	while (1) {
6763 	    STRLEN length_of_hex;
6764 	    I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
6765 			    | PERL_SCAN_DISALLOW_PREFIX
6766 			    | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6767 	    UV cp;  /* Ord of current character */
6768 
6769 	    /* Code points are separated by dots.  If none, there is only one
6770 	     * code point, and is terminated by the brace */
6771 	    endchar = RExC_parse + strcspn(RExC_parse, ".}");
6772 
6773 	    /* The values are Unicode even on EBCDIC machines */
6774 	    length_of_hex = (STRLEN)(endchar - RExC_parse);
6775 	    cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
6776 	    if ( length_of_hex == 0
6777 		|| length_of_hex != (STRLEN)(endchar - RExC_parse) )
6778 	    {
6779 		RExC_parse += length_of_hex;	    /* Includes all the valid */
6780 		RExC_parse += (RExC_orig_utf8)	/* point to after 1st invalid */
6781 				? UTF8SKIP(RExC_parse)
6782 				: 1;
6783 		/* Guard against malformed utf8 */
6784 		if (RExC_parse >= endchar) RExC_parse = endchar;
6785 		vFAIL("Invalid hexadecimal number in \\N{U+...}");
6786 	    }
6787 
6788 	    if (! FOLD) {	/* Not folding, just append to the string */
6789 		STRLEN unilen;
6790 
6791 		/* Quit before adding this character if would exceed limit */
6792 		if (len + UNISKIP(cp) > U8_MAX) break;
6793 
6794 		unilen = reguni(pRExC_state, cp, s);
6795 		if (unilen > 0) {
6796 		    s   += unilen;
6797 		    len += unilen;
6798 		}
6799 	    } else {	/* Folding, output the folded equivalent */
6800 		STRLEN foldlen,numlen;
6801 		U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
6802 		cp = toFOLD_uni(cp, tmpbuf, &foldlen);
6803 
6804 		/* Quit before exceeding size limit */
6805 		if (len + foldlen > U8_MAX) break;
6806 
6807 		for (foldbuf = tmpbuf;
6808 		    foldlen;
6809 		    foldlen -= numlen)
6810 		{
6811 		    cp = utf8_to_uvchr(foldbuf, &numlen);
6812 		    if (numlen > 0) {
6813 			const STRLEN unilen = reguni(pRExC_state, cp, s);
6814 			s       += unilen;
6815 			len     += unilen;
6816 			/* In EBCDIC the numlen and unilen can differ. */
6817 			foldbuf += numlen;
6818 			if (numlen >= foldlen)
6819 			    break;
6820 		    }
6821 		    else
6822 			break; /* "Can't happen." */
6823 		}
6824 	    }
6825 
6826 	    /* Point to the beginning of the next character in the sequence. */
6827 	    RExC_parse = endchar + 1;
6828 
6829 	    /* Quit if no more characters */
6830 	    if (RExC_parse >= endbrace) break;
6831 	}
6832 
6833 
6834 	if (SIZE_ONLY) {
6835 	    if (RExC_parse < endbrace) {
6836 		ckWARNreg(RExC_parse - 1,
6837 			  "Using just the first characters returned by \\N{}");
6838 	    }
6839 
6840 	    RExC_size += STR_SZ(len);
6841 	} else {
6842 	    STR_LEN(ret) = len;
6843 	    RExC_emit += STR_SZ(len);
6844 	}
6845 
6846 	RExC_parse = endbrace + 1;
6847 
6848 	*flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail
6849 			       with malformed in t/re/pat_advanced.t */
6850 	RExC_parse --;
6851 	Set_Node_Cur_Length(ret); /* MJD */
6852 	nextchar(pRExC_state);
6853     }
6854 
6855     return ret;
6856 }
6857 
6858 
6859 /*
6860  * reg_recode
6861  *
6862  * It returns the code point in utf8 for the value in *encp.
6863  *    value: a code value in the source encoding
6864  *    encp:  a pointer to an Encode object
6865  *
6866  * If the result from Encode is not a single character,
6867  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
6868  */
6869 STATIC UV
6870 S_reg_recode(pTHX_ const char value, SV **encp)
6871 {
6872     STRLEN numlen = 1;
6873     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
6874     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
6875     const STRLEN newlen = SvCUR(sv);
6876     UV uv = UNICODE_REPLACEMENT;
6877 
6878     PERL_ARGS_ASSERT_REG_RECODE;
6879 
6880     if (newlen)
6881 	uv = SvUTF8(sv)
6882 	     ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
6883 	     : *(U8*)s;
6884 
6885     if (!newlen || numlen != newlen) {
6886 	uv = UNICODE_REPLACEMENT;
6887 	*encp = NULL;
6888     }
6889     return uv;
6890 }
6891 
6892 
6893 /*
6894  - regatom - the lowest level
6895 
6896    Try to identify anything special at the start of the pattern. If there
6897    is, then handle it as required. This may involve generating a single regop,
6898    such as for an assertion; or it may involve recursing, such as to
6899    handle a () structure.
6900 
6901    If the string doesn't start with something special then we gobble up
6902    as much literal text as we can.
6903 
6904    Once we have been able to handle whatever type of thing started the
6905    sequence, we return.
6906 
6907    Note: we have to be careful with escapes, as they can be both literal
6908    and special, and in the case of \10 and friends can either, depending
6909    on context. Specifically there are two seperate switches for handling
6910    escape sequences, with the one for handling literal escapes requiring
6911    a dummy entry for all of the special escapes that are actually handled
6912    by the other.
6913 */
6914 
6915 STATIC regnode *
6916 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6917 {
6918     dVAR;
6919     register regnode *ret = NULL;
6920     I32 flags;
6921     char *parse_start = RExC_parse;
6922     GET_RE_DEBUG_FLAGS_DECL;
6923     DEBUG_PARSE("atom");
6924     *flagp = WORST;		/* Tentatively. */
6925 
6926     PERL_ARGS_ASSERT_REGATOM;
6927 
6928 tryagain:
6929     switch ((U8)*RExC_parse) {
6930     case '^':
6931 	RExC_seen_zerolen++;
6932 	nextchar(pRExC_state);
6933 	if (RExC_flags & RXf_PMf_MULTILINE)
6934 	    ret = reg_node(pRExC_state, MBOL);
6935 	else if (RExC_flags & RXf_PMf_SINGLELINE)
6936 	    ret = reg_node(pRExC_state, SBOL);
6937 	else
6938 	    ret = reg_node(pRExC_state, BOL);
6939         Set_Node_Length(ret, 1); /* MJD */
6940 	break;
6941     case '$':
6942 	nextchar(pRExC_state);
6943 	if (*RExC_parse)
6944 	    RExC_seen_zerolen++;
6945 	if (RExC_flags & RXf_PMf_MULTILINE)
6946 	    ret = reg_node(pRExC_state, MEOL);
6947 	else if (RExC_flags & RXf_PMf_SINGLELINE)
6948 	    ret = reg_node(pRExC_state, SEOL);
6949 	else
6950 	    ret = reg_node(pRExC_state, EOL);
6951         Set_Node_Length(ret, 1); /* MJD */
6952 	break;
6953     case '.':
6954 	nextchar(pRExC_state);
6955 	if (RExC_flags & RXf_PMf_SINGLELINE)
6956 	    ret = reg_node(pRExC_state, SANY);
6957 	else
6958 	    ret = reg_node(pRExC_state, REG_ANY);
6959 	*flagp |= HASWIDTH|SIMPLE;
6960 	RExC_naughty++;
6961         Set_Node_Length(ret, 1); /* MJD */
6962 	break;
6963     case '[':
6964     {
6965 	char * const oregcomp_parse = ++RExC_parse;
6966         ret = regclass(pRExC_state,depth+1);
6967 	if (*RExC_parse != ']') {
6968 	    RExC_parse = oregcomp_parse;
6969 	    vFAIL("Unmatched [");
6970 	}
6971 	nextchar(pRExC_state);
6972 	*flagp |= HASWIDTH|SIMPLE;
6973         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
6974 	break;
6975     }
6976     case '(':
6977 	nextchar(pRExC_state);
6978         ret = reg(pRExC_state, 1, &flags,depth+1);
6979 	if (ret == NULL) {
6980 		if (flags & TRYAGAIN) {
6981 		    if (RExC_parse == RExC_end) {
6982 			 /* Make parent create an empty node if needed. */
6983 			*flagp |= TRYAGAIN;
6984 			return(NULL);
6985 		    }
6986 		    goto tryagain;
6987 		}
6988 		return(NULL);
6989 	}
6990 	*flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
6991 	break;
6992     case '|':
6993     case ')':
6994 	if (flags & TRYAGAIN) {
6995 	    *flagp |= TRYAGAIN;
6996 	    return NULL;
6997 	}
6998 	vFAIL("Internal urp");
6999 				/* Supposed to be caught earlier. */
7000 	break;
7001     case '{':
7002 	if (!regcurly(RExC_parse)) {
7003 	    RExC_parse++;
7004 	    goto defchar;
7005 	}
7006 	/* FALL THROUGH */
7007     case '?':
7008     case '+':
7009     case '*':
7010 	RExC_parse++;
7011 	vFAIL("Quantifier follows nothing");
7012 	break;
7013     case 0xDF:
7014     case 0xC3:
7015     case 0xCE:
7016         do_foldchar:
7017         if (!LOC && FOLD) {
7018             U32 len,cp;
7019 	    len=0; /* silence a spurious compiler warning */
7020             if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
7021                 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
7022                 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
7023                 ret = reganode(pRExC_state, FOLDCHAR, cp);
7024                 Set_Node_Length(ret, 1); /* MJD */
7025                 nextchar(pRExC_state); /* kill whitespace under /x */
7026                 return ret;
7027             }
7028         }
7029         goto outer_default;
7030     case '\\':
7031 	/* Special Escapes
7032 
7033 	   This switch handles escape sequences that resolve to some kind
7034 	   of special regop and not to literal text. Escape sequnces that
7035 	   resolve to literal text are handled below in the switch marked
7036 	   "Literal Escapes".
7037 
7038 	   Every entry in this switch *must* have a corresponding entry
7039 	   in the literal escape switch. However, the opposite is not
7040 	   required, as the default for this switch is to jump to the
7041 	   literal text handling code.
7042 	*/
7043 	switch ((U8)*++RExC_parse) {
7044 	case 0xDF:
7045 	case 0xC3:
7046 	case 0xCE:
7047 	           goto do_foldchar;
7048 	/* Special Escapes */
7049 	case 'A':
7050 	    RExC_seen_zerolen++;
7051 	    ret = reg_node(pRExC_state, SBOL);
7052 	    *flagp |= SIMPLE;
7053 	    goto finish_meta_pat;
7054 	case 'G':
7055 	    ret = reg_node(pRExC_state, GPOS);
7056 	    RExC_seen |= REG_SEEN_GPOS;
7057 	    *flagp |= SIMPLE;
7058 	    goto finish_meta_pat;
7059 	case 'K':
7060 	    RExC_seen_zerolen++;
7061 	    ret = reg_node(pRExC_state, KEEPS);
7062 	    *flagp |= SIMPLE;
7063 	    /* XXX:dmq : disabling in-place substitution seems to
7064 	     * be necessary here to avoid cases of memory corruption, as
7065 	     * with: C<$_="x" x 80; s/x\K/y/> -- rgs
7066 	     */
7067 	    RExC_seen |= REG_SEEN_LOOKBEHIND;
7068 	    goto finish_meta_pat;
7069 	case 'Z':
7070 	    ret = reg_node(pRExC_state, SEOL);
7071 	    *flagp |= SIMPLE;
7072 	    RExC_seen_zerolen++;		/* Do not optimize RE away */
7073 	    goto finish_meta_pat;
7074 	case 'z':
7075 	    ret = reg_node(pRExC_state, EOS);
7076 	    *flagp |= SIMPLE;
7077 	    RExC_seen_zerolen++;		/* Do not optimize RE away */
7078 	    goto finish_meta_pat;
7079 	case 'C':
7080 	    ret = reg_node(pRExC_state, CANY);
7081 	    RExC_seen |= REG_SEEN_CANY;
7082 	    *flagp |= HASWIDTH|SIMPLE;
7083 	    goto finish_meta_pat;
7084 	case 'X':
7085 	    ret = reg_node(pRExC_state, CLUMP);
7086 	    *flagp |= HASWIDTH;
7087 	    goto finish_meta_pat;
7088 	case 'w':
7089 	    ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML     : ALNUM));
7090 	    *flagp |= HASWIDTH|SIMPLE;
7091 	    goto finish_meta_pat;
7092 	case 'W':
7093 	    ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML    : NALNUM));
7094 	    *flagp |= HASWIDTH|SIMPLE;
7095 	    goto finish_meta_pat;
7096 	case 'b':
7097 	    RExC_seen_zerolen++;
7098 	    RExC_seen |= REG_SEEN_LOOKBEHIND;
7099 	    ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL     : BOUND));
7100 	    *flagp |= SIMPLE;
7101 	    goto finish_meta_pat;
7102 	case 'B':
7103 	    RExC_seen_zerolen++;
7104 	    RExC_seen |= REG_SEEN_LOOKBEHIND;
7105 	    ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL    : NBOUND));
7106 	    *flagp |= SIMPLE;
7107 	    goto finish_meta_pat;
7108 	case 's':
7109 	    ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL     : SPACE));
7110 	    *flagp |= HASWIDTH|SIMPLE;
7111 	    goto finish_meta_pat;
7112 	case 'S':
7113 	    ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL    : NSPACE));
7114 	    *flagp |= HASWIDTH|SIMPLE;
7115 	    goto finish_meta_pat;
7116 	case 'd':
7117 	    ret = reg_node(pRExC_state, DIGIT);
7118 	    *flagp |= HASWIDTH|SIMPLE;
7119 	    goto finish_meta_pat;
7120 	case 'D':
7121 	    ret = reg_node(pRExC_state, NDIGIT);
7122 	    *flagp |= HASWIDTH|SIMPLE;
7123 	    goto finish_meta_pat;
7124 	case 'R':
7125 	    ret = reg_node(pRExC_state, LNBREAK);
7126 	    *flagp |= HASWIDTH|SIMPLE;
7127 	    goto finish_meta_pat;
7128 	case 'h':
7129 	    ret = reg_node(pRExC_state, HORIZWS);
7130 	    *flagp |= HASWIDTH|SIMPLE;
7131 	    goto finish_meta_pat;
7132 	case 'H':
7133 	    ret = reg_node(pRExC_state, NHORIZWS);
7134 	    *flagp |= HASWIDTH|SIMPLE;
7135 	    goto finish_meta_pat;
7136 	case 'v':
7137 	    ret = reg_node(pRExC_state, VERTWS);
7138 	    *flagp |= HASWIDTH|SIMPLE;
7139 	    goto finish_meta_pat;
7140 	case 'V':
7141 	    ret = reg_node(pRExC_state, NVERTWS);
7142 	    *flagp |= HASWIDTH|SIMPLE;
7143          finish_meta_pat:
7144 	    nextchar(pRExC_state);
7145             Set_Node_Length(ret, 2); /* MJD */
7146 	    break;
7147 	case 'p':
7148 	case 'P':
7149 	    {
7150 		char* const oldregxend = RExC_end;
7151 #ifdef DEBUGGING
7152 		char* parse_start = RExC_parse - 2;
7153 #endif
7154 
7155 		if (RExC_parse[1] == '{') {
7156 		  /* a lovely hack--pretend we saw [\pX] instead */
7157 		    RExC_end = strchr(RExC_parse, '}');
7158 		    if (!RExC_end) {
7159 		        const U8 c = (U8)*RExC_parse;
7160 			RExC_parse += 2;
7161 			RExC_end = oldregxend;
7162 			vFAIL2("Missing right brace on \\%c{}", c);
7163 		    }
7164 		    RExC_end++;
7165 		}
7166 		else {
7167 		    RExC_end = RExC_parse + 2;
7168 		    if (RExC_end > oldregxend)
7169 			RExC_end = oldregxend;
7170 		}
7171 		RExC_parse--;
7172 
7173                 ret = regclass(pRExC_state,depth+1);
7174 
7175 		RExC_end = oldregxend;
7176 		RExC_parse--;
7177 
7178 		Set_Node_Offset(ret, parse_start + 2);
7179 		Set_Node_Cur_Length(ret);
7180 		nextchar(pRExC_state);
7181 		*flagp |= HASWIDTH|SIMPLE;
7182 	    }
7183 	    break;
7184         case 'N':
7185             /* Handle \N and \N{NAME} here and not below because it can be
7186             multicharacter. join_exact() will join them up later on.
7187             Also this makes sure that things like /\N{BLAH}+/ and
7188             \N{BLAH} being multi char Just Happen. dmq*/
7189             ++RExC_parse;
7190             ret= reg_namedseq(pRExC_state, NULL, flagp);
7191             break;
7192 	case 'k':    /* Handle \k<NAME> and \k'NAME' */
7193 	parse_named_seq:
7194         {
7195             char ch= RExC_parse[1];
7196 	    if (ch != '<' && ch != '\'' && ch != '{') {
7197 	        RExC_parse++;
7198 	        vFAIL2("Sequence %.2s... not terminated",parse_start);
7199 	    } else {
7200 	        /* this pretty much dupes the code for (?P=...) in reg(), if
7201                    you change this make sure you change that */
7202 		char* name_start = (RExC_parse += 2);
7203 		U32 num = 0;
7204                 SV *sv_dat = reg_scan_name(pRExC_state,
7205                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7206                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
7207                 if (RExC_parse == name_start || *RExC_parse != ch)
7208                     vFAIL2("Sequence %.3s... not terminated",parse_start);
7209 
7210                 if (!SIZE_ONLY) {
7211                     num = add_data( pRExC_state, 1, "S" );
7212                     RExC_rxi->data->data[num]=(void*)sv_dat;
7213                     SvREFCNT_inc_simple_void(sv_dat);
7214                 }
7215 
7216                 RExC_sawback = 1;
7217                 ret = reganode(pRExC_state,
7218                 	   (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
7219                 	   num);
7220                 *flagp |= HASWIDTH;
7221 
7222                 /* override incorrect value set in reganode MJD */
7223                 Set_Node_Offset(ret, parse_start+1);
7224                 Set_Node_Cur_Length(ret); /* MJD */
7225                 nextchar(pRExC_state);
7226 
7227             }
7228             break;
7229 	}
7230 	case 'g':
7231 	case '1': case '2': case '3': case '4':
7232 	case '5': case '6': case '7': case '8': case '9':
7233 	    {
7234 		I32 num;
7235 		bool isg = *RExC_parse == 'g';
7236 		bool isrel = 0;
7237 		bool hasbrace = 0;
7238 		if (isg) {
7239 		    RExC_parse++;
7240 		    if (*RExC_parse == '{') {
7241 		        RExC_parse++;
7242 		        hasbrace = 1;
7243 		    }
7244 		    if (*RExC_parse == '-') {
7245 		        RExC_parse++;
7246 		        isrel = 1;
7247 		    }
7248 		    if (hasbrace && !isDIGIT(*RExC_parse)) {
7249 		        if (isrel) RExC_parse--;
7250                         RExC_parse -= 2;
7251 		        goto parse_named_seq;
7252 		}   }
7253 		num = atoi(RExC_parse);
7254 		if (isg && num == 0)
7255 		    vFAIL("Reference to invalid group 0");
7256                 if (isrel) {
7257                     num = RExC_npar - num;
7258                     if (num < 1)
7259                         vFAIL("Reference to nonexistent or unclosed group");
7260                 }
7261 		if (!isg && num > 9 && num >= RExC_npar)
7262 		    goto defchar;
7263 		else {
7264 		    char * const parse_start = RExC_parse - 1; /* MJD */
7265 		    while (isDIGIT(*RExC_parse))
7266 			RExC_parse++;
7267 	            if (parse_start == RExC_parse - 1)
7268 	                vFAIL("Unterminated \\g... pattern");
7269                     if (hasbrace) {
7270                         if (*RExC_parse != '}')
7271                             vFAIL("Unterminated \\g{...} pattern");
7272                         RExC_parse++;
7273                     }
7274 		    if (!SIZE_ONLY) {
7275 		        if (num > (I32)RExC_rx->nparens)
7276 			    vFAIL("Reference to nonexistent group");
7277 		    }
7278 		    RExC_sawback = 1;
7279 		    ret = reganode(pRExC_state,
7280 				   (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
7281 				   num);
7282 		    *flagp |= HASWIDTH;
7283 
7284                     /* override incorrect value set in reganode MJD */
7285                     Set_Node_Offset(ret, parse_start+1);
7286                     Set_Node_Cur_Length(ret); /* MJD */
7287 		    RExC_parse--;
7288 		    nextchar(pRExC_state);
7289 		}
7290 	    }
7291 	    break;
7292 	case '\0':
7293 	    if (RExC_parse >= RExC_end)
7294 		FAIL("Trailing \\");
7295 	    /* FALL THROUGH */
7296 	default:
7297 	    /* Do not generate "unrecognized" warnings here, we fall
7298 	       back into the quick-grab loop below */
7299 	    parse_start--;
7300 	    goto defchar;
7301 	}
7302 	break;
7303 
7304     case '#':
7305 	if (RExC_flags & RXf_PMf_EXTENDED) {
7306 	    if ( reg_skipcomment( pRExC_state ) )
7307 		goto tryagain;
7308 	}
7309 	/* FALL THROUGH */
7310 
7311     default:
7312         outer_default:{
7313 	    register STRLEN len;
7314 	    register UV ender;
7315 	    register char *p;
7316 	    char *s;
7317 	    STRLEN foldlen;
7318 	    U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7319 
7320             parse_start = RExC_parse - 1;
7321 
7322 	    RExC_parse++;
7323 
7324 	defchar:
7325 	    ender = 0;
7326 	    ret = reg_node(pRExC_state,
7327 			   (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
7328 	    s = STRING(ret);
7329 	    for (len = 0, p = RExC_parse - 1;
7330 	      len < 127 && p < RExC_end;
7331 	      len++)
7332 	    {
7333 		char * const oldp = p;
7334 
7335 		if (RExC_flags & RXf_PMf_EXTENDED)
7336 		    p = regwhite( pRExC_state, p );
7337 		switch ((U8)*p) {
7338 		case 0xDF:
7339 		case 0xC3:
7340 		case 0xCE:
7341 		           if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7342 		                goto normal_default;
7343 		case '^':
7344 		case '$':
7345 		case '.':
7346 		case '[':
7347 		case '(':
7348 		case ')':
7349 		case '|':
7350 		    goto loopdone;
7351 		case '\\':
7352 		    /* Literal Escapes Switch
7353 
7354 		       This switch is meant to handle escape sequences that
7355 		       resolve to a literal character.
7356 
7357 		       Every escape sequence that represents something
7358 		       else, like an assertion or a char class, is handled
7359 		       in the switch marked 'Special Escapes' above in this
7360 		       routine, but also has an entry here as anything that
7361 		       isn't explicitly mentioned here will be treated as
7362 		       an unescaped equivalent literal.
7363 		    */
7364 
7365 		    switch ((U8)*++p) {
7366 		    /* These are all the special escapes. */
7367     		    case 0xDF:
7368     		    case 0xC3:
7369     		    case 0xCE:
7370     		           if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7371     		                goto normal_default;
7372 		    case 'A':             /* Start assertion */
7373 		    case 'b': case 'B':   /* Word-boundary assertion*/
7374 		    case 'C':             /* Single char !DANGEROUS! */
7375 		    case 'd': case 'D':   /* digit class */
7376 		    case 'g': case 'G':   /* generic-backref, pos assertion */
7377 		    case 'h': case 'H':   /* HORIZWS */
7378 		    case 'k': case 'K':   /* named backref, keep marker */
7379 		    case 'N':             /* named char sequence */
7380 		    case 'p': case 'P':   /* Unicode property */
7381 		              case 'R':   /* LNBREAK */
7382 		    case 's': case 'S':   /* space class */
7383 		    case 'v': case 'V':   /* VERTWS */
7384 		    case 'w': case 'W':   /* word class */
7385 		    case 'X':             /* eXtended Unicode "combining character sequence" */
7386 		    case 'z': case 'Z':   /* End of line/string assertion */
7387 			--p;
7388 			goto loopdone;
7389 
7390 	            /* Anything after here is an escape that resolves to a
7391 	               literal. (Except digits, which may or may not)
7392 	             */
7393 		    case 'n':
7394 			ender = '\n';
7395 			p++;
7396 			break;
7397 		    case 'r':
7398 			ender = '\r';
7399 			p++;
7400 			break;
7401 		    case 't':
7402 			ender = '\t';
7403 			p++;
7404 			break;
7405 		    case 'f':
7406 			ender = '\f';
7407 			p++;
7408 			break;
7409 		    case 'e':
7410 			  ender = ASCII_TO_NATIVE('\033');
7411 			p++;
7412 			break;
7413 		    case 'a':
7414 			  ender = ASCII_TO_NATIVE('\007');
7415 			p++;
7416 			break;
7417 		    case 'x':
7418 			if (*++p == '{') {
7419 			    char* const e = strchr(p, '}');
7420 
7421 			    if (!e) {
7422 				RExC_parse = p + 1;
7423 				vFAIL("Missing right brace on \\x{}");
7424 			    }
7425 			    else {
7426                                 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7427                                     | PERL_SCAN_DISALLOW_PREFIX;
7428                                 STRLEN numlen = e - p - 1;
7429 				ender = grok_hex(p + 1, &numlen, &flags, NULL);
7430 				if (ender > 0xff)
7431 				    RExC_utf8 = 1;
7432 				p = e + 1;
7433 			    }
7434 			}
7435 			else {
7436                             I32 flags = PERL_SCAN_DISALLOW_PREFIX;
7437 			    STRLEN numlen = 2;
7438 			    ender = grok_hex(p, &numlen, &flags, NULL);
7439 			    p += numlen;
7440 			}
7441 			if (PL_encoding && ender < 0x100)
7442 			    goto recode_encoding;
7443 			break;
7444 		    case 'c':
7445 			p++;
7446 			ender = UCHARAT(p++);
7447 			ender = toCTRL(ender);
7448 			break;
7449 		    case '0': case '1': case '2': case '3':case '4':
7450 		    case '5': case '6': case '7': case '8':case '9':
7451 			if (*p == '0' ||
7452 			  (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
7453                             I32 flags = 0;
7454 			    STRLEN numlen = 3;
7455 			    ender = grok_oct(p, &numlen, &flags, NULL);
7456 
7457 			    /* An octal above 0xff is interpreted differently
7458 			     * depending on if the re is in utf8 or not.  If it
7459 			     * is in utf8, the value will be itself, otherwise
7460 			     * it is interpreted as modulo 0x100.  It has been
7461 			     * decided to discourage the use of octal above the
7462 			     * single-byte range.  For now, warn only when
7463 			     * it ends up modulo */
7464 			    if (SIZE_ONLY && ender >= 0x100
7465 				    && ! UTF && ! PL_encoding) {
7466 				ckWARNregdep(p, "Use of octal value above 377 is deprecated");
7467 			    }
7468 			    p += numlen;
7469 			}
7470 			else {
7471 			    --p;
7472 			    goto loopdone;
7473 			}
7474 			if (PL_encoding && ender < 0x100)
7475 			    goto recode_encoding;
7476 			break;
7477 		    recode_encoding:
7478 			{
7479 			    SV* enc = PL_encoding;
7480 			    ender = reg_recode((const char)(U8)ender, &enc);
7481 			    if (!enc && SIZE_ONLY)
7482 				ckWARNreg(p, "Invalid escape in the specified encoding");
7483 			    RExC_utf8 = 1;
7484 			}
7485 			break;
7486 		    case '\0':
7487 			if (p >= RExC_end)
7488 			    FAIL("Trailing \\");
7489 			/* FALL THROUGH */
7490 		    default:
7491 			if (!SIZE_ONLY&& isALPHA(*p))
7492 			    ckWARN2reg(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
7493 			goto normal_default;
7494 		    }
7495 		    break;
7496 		default:
7497 		  normal_default:
7498 		    if (UTF8_IS_START(*p) && UTF) {
7499 			STRLEN numlen;
7500 			ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
7501 					       &numlen, UTF8_ALLOW_DEFAULT);
7502 			p += numlen;
7503 		    }
7504 		    else
7505 			ender = *p++;
7506 		    break;
7507 		}
7508 		if ( RExC_flags & RXf_PMf_EXTENDED)
7509 		    p = regwhite( pRExC_state, p );
7510 		if (UTF && FOLD) {
7511 		    /* Prime the casefolded buffer. */
7512 		    ender = toFOLD_uni(ender, tmpbuf, &foldlen);
7513 		}
7514 		if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
7515 		    if (len)
7516 			p = oldp;
7517 		    else if (UTF) {
7518 			 if (FOLD) {
7519 			      /* Emit all the Unicode characters. */
7520 			      STRLEN numlen;
7521 			      for (foldbuf = tmpbuf;
7522 				   foldlen;
7523 				   foldlen -= numlen) {
7524 				   ender = utf8_to_uvchr(foldbuf, &numlen);
7525 				   if (numlen > 0) {
7526 					const STRLEN unilen = reguni(pRExC_state, ender, s);
7527 					s       += unilen;
7528 					len     += unilen;
7529 					/* In EBCDIC the numlen
7530 					 * and unilen can differ. */
7531 					foldbuf += numlen;
7532 					if (numlen >= foldlen)
7533 					     break;
7534 				   }
7535 				   else
7536 					break; /* "Can't happen." */
7537 			      }
7538 			 }
7539 			 else {
7540 			      const STRLEN unilen = reguni(pRExC_state, ender, s);
7541 			      if (unilen > 0) {
7542 				   s   += unilen;
7543 				   len += unilen;
7544 			      }
7545 			 }
7546 		    }
7547 		    else {
7548 			len++;
7549 			REGC((char)ender, s++);
7550 		    }
7551 		    break;
7552 		}
7553 		if (UTF) {
7554 		     if (FOLD) {
7555 		          /* Emit all the Unicode characters. */
7556 			  STRLEN numlen;
7557 			  for (foldbuf = tmpbuf;
7558 			       foldlen;
7559 			       foldlen -= numlen) {
7560 			       ender = utf8_to_uvchr(foldbuf, &numlen);
7561 			       if (numlen > 0) {
7562 				    const STRLEN unilen = reguni(pRExC_state, ender, s);
7563 				    len     += unilen;
7564 				    s       += unilen;
7565 				    /* In EBCDIC the numlen
7566 				     * and unilen can differ. */
7567 				    foldbuf += numlen;
7568 				    if (numlen >= foldlen)
7569 					 break;
7570 			       }
7571 			       else
7572 				    break;
7573 			  }
7574 		     }
7575 		     else {
7576 			  const STRLEN unilen = reguni(pRExC_state, ender, s);
7577 			  if (unilen > 0) {
7578 			       s   += unilen;
7579 			       len += unilen;
7580 			  }
7581 		     }
7582 		     len--;
7583 		}
7584 		else
7585 		    REGC((char)ender, s++);
7586 	    }
7587 	loopdone:
7588 	    RExC_parse = p - 1;
7589             Set_Node_Cur_Length(ret); /* MJD */
7590 	    nextchar(pRExC_state);
7591 	    {
7592 		/* len is STRLEN which is unsigned, need to copy to signed */
7593 		IV iv = len;
7594 		if (iv < 0)
7595 		    vFAIL("Internal disaster");
7596 	    }
7597 	    if (len > 0)
7598 		*flagp |= HASWIDTH;
7599 	    if (len == 1 && UNI_IS_INVARIANT(ender))
7600 		*flagp |= SIMPLE;
7601 
7602 	    if (SIZE_ONLY)
7603 		RExC_size += STR_SZ(len);
7604 	    else {
7605 		STR_LEN(ret) = len;
7606 		RExC_emit += STR_SZ(len);
7607             }
7608 	}
7609 	break;
7610     }
7611 
7612     return(ret);
7613 }
7614 
7615 STATIC char *
7616 S_regwhite( RExC_state_t *pRExC_state, char *p )
7617 {
7618     const char *e = RExC_end;
7619 
7620     PERL_ARGS_ASSERT_REGWHITE;
7621 
7622     while (p < e) {
7623 	if (isSPACE(*p))
7624 	    ++p;
7625 	else if (*p == '#') {
7626             bool ended = 0;
7627 	    do {
7628 		if (*p++ == '\n') {
7629 		    ended = 1;
7630 		    break;
7631 		}
7632 	    } while (p < e);
7633 	    if (!ended)
7634 	        RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
7635 	}
7636 	else
7637 	    break;
7638     }
7639     return p;
7640 }
7641 
7642 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
7643    Character classes ([:foo:]) can also be negated ([:^foo:]).
7644    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
7645    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
7646    but trigger failures because they are currently unimplemented. */
7647 
7648 #define POSIXCC_DONE(c)   ((c) == ':')
7649 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
7650 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
7651 
7652 STATIC I32
7653 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
7654 {
7655     dVAR;
7656     I32 namedclass = OOB_NAMEDCLASS;
7657 
7658     PERL_ARGS_ASSERT_REGPPOSIXCC;
7659 
7660     if (value == '[' && RExC_parse + 1 < RExC_end &&
7661 	/* I smell either [: or [= or [. -- POSIX has been here, right? */
7662 	POSIXCC(UCHARAT(RExC_parse))) {
7663 	const char c = UCHARAT(RExC_parse);
7664 	char* const s = RExC_parse++;
7665 
7666 	while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
7667 	    RExC_parse++;
7668 	if (RExC_parse == RExC_end)
7669 	    /* Grandfather lone [:, [=, [. */
7670 	    RExC_parse = s;
7671 	else {
7672 	    const char* const t = RExC_parse++; /* skip over the c */
7673 	    assert(*t == c);
7674 
7675   	    if (UCHARAT(RExC_parse) == ']') {
7676 		const char *posixcc = s + 1;
7677   		RExC_parse++; /* skip over the ending ] */
7678 
7679 		if (*s == ':') {
7680 		    const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
7681 		    const I32 skip = t - posixcc;
7682 
7683 		    /* Initially switch on the length of the name.  */
7684 		    switch (skip) {
7685 		    case 4:
7686 			if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
7687 			    namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
7688 			break;
7689 		    case 5:
7690 			/* Names all of length 5.  */
7691 			/* alnum alpha ascii blank cntrl digit graph lower
7692 			   print punct space upper  */
7693 			/* Offset 4 gives the best switch position.  */
7694 			switch (posixcc[4]) {
7695 			case 'a':
7696 			    if (memEQ(posixcc, "alph", 4)) /* alpha */
7697 				namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
7698 			    break;
7699 			case 'e':
7700 			    if (memEQ(posixcc, "spac", 4)) /* space */
7701 				namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
7702 			    break;
7703 			case 'h':
7704 			    if (memEQ(posixcc, "grap", 4)) /* graph */
7705 				namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
7706 			    break;
7707 			case 'i':
7708 			    if (memEQ(posixcc, "asci", 4)) /* ascii */
7709 				namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
7710 			    break;
7711 			case 'k':
7712 			    if (memEQ(posixcc, "blan", 4)) /* blank */
7713 				namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
7714 			    break;
7715 			case 'l':
7716 			    if (memEQ(posixcc, "cntr", 4)) /* cntrl */
7717 				namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
7718 			    break;
7719 			case 'm':
7720 			    if (memEQ(posixcc, "alnu", 4)) /* alnum */
7721 				namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
7722 			    break;
7723 			case 'r':
7724 			    if (memEQ(posixcc, "lowe", 4)) /* lower */
7725 				namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
7726 			    else if (memEQ(posixcc, "uppe", 4)) /* upper */
7727 				namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
7728 			    break;
7729 			case 't':
7730 			    if (memEQ(posixcc, "digi", 4)) /* digit */
7731 				namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
7732 			    else if (memEQ(posixcc, "prin", 4)) /* print */
7733 				namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
7734 			    else if (memEQ(posixcc, "punc", 4)) /* punct */
7735 				namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
7736 			    break;
7737 			}
7738 			break;
7739 		    case 6:
7740 			if (memEQ(posixcc, "xdigit", 6))
7741 			    namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
7742 			break;
7743 		    }
7744 
7745 		    if (namedclass == OOB_NAMEDCLASS)
7746 			Simple_vFAIL3("POSIX class [:%.*s:] unknown",
7747 				      t - s - 1, s + 1);
7748 		    assert (posixcc[skip] == ':');
7749 		    assert (posixcc[skip+1] == ']');
7750 		} else if (!SIZE_ONLY) {
7751 		    /* [[=foo=]] and [[.foo.]] are still future. */
7752 
7753 		    /* adjust RExC_parse so the warning shows after
7754 		       the class closes */
7755 		    while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
7756 			RExC_parse++;
7757 		    Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7758 		}
7759 	    } else {
7760 		/* Maternal grandfather:
7761 		 * "[:" ending in ":" but not in ":]" */
7762 		RExC_parse = s;
7763 	    }
7764 	}
7765     }
7766 
7767     return namedclass;
7768 }
7769 
7770 STATIC void
7771 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
7772 {
7773     dVAR;
7774 
7775     PERL_ARGS_ASSERT_CHECKPOSIXCC;
7776 
7777     if (POSIXCC(UCHARAT(RExC_parse))) {
7778 	const char *s = RExC_parse;
7779 	const char  c = *s++;
7780 
7781 	while (isALNUM(*s))
7782 	    s++;
7783 	if (*s && c == *s && s[1] == ']') {
7784 	    ckWARN3reg(s+2,
7785 		       "POSIX syntax [%c %c] belongs inside character classes",
7786 		       c, c);
7787 
7788 	    /* [[=foo=]] and [[.foo.]] are still future. */
7789 	    if (POSIXCC_NOTYET(c)) {
7790 		/* adjust RExC_parse so the error shows after
7791 		   the class closes */
7792 		while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
7793 		    NOOP;
7794 		Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7795 	    }
7796 	}
7797     }
7798 }
7799 
7800 
7801 #define _C_C_T_(NAME,TEST,WORD)                         \
7802 ANYOF_##NAME:                                           \
7803     if (LOC)                                            \
7804 	ANYOF_CLASS_SET(ret, ANYOF_##NAME);             \
7805     else {                                              \
7806 	for (value = 0; value < 256; value++)           \
7807 	    if (TEST)                                   \
7808 		ANYOF_BITMAP_SET(ret, value);           \
7809     }                                                   \
7810     yesno = '+';                                        \
7811     what = WORD;                                        \
7812     break;                                              \
7813 case ANYOF_N##NAME:                                     \
7814     if (LOC)                                            \
7815 	ANYOF_CLASS_SET(ret, ANYOF_N##NAME);            \
7816     else {                                              \
7817 	for (value = 0; value < 256; value++)           \
7818 	    if (!TEST)                                  \
7819 		ANYOF_BITMAP_SET(ret, value);           \
7820     }                                                   \
7821     yesno = '!';                                        \
7822     what = WORD;                                        \
7823     break
7824 
7825 #define _C_C_T_NOLOC_(NAME,TEST,WORD)                   \
7826 ANYOF_##NAME:                                           \
7827 	for (value = 0; value < 256; value++)           \
7828 	    if (TEST)                                   \
7829 		ANYOF_BITMAP_SET(ret, value);           \
7830     yesno = '+';                                        \
7831     what = WORD;                                        \
7832     break;                                              \
7833 case ANYOF_N##NAME:                                     \
7834 	for (value = 0; value < 256; value++)           \
7835 	    if (!TEST)                                  \
7836 		ANYOF_BITMAP_SET(ret, value);           \
7837     yesno = '!';                                        \
7838     what = WORD;                                        \
7839     break
7840 
7841 /*
7842    We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
7843    so that it is possible to override the option here without having to
7844    rebuild the entire core. as we are required to do if we change regcomp.h
7845    which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
7846 */
7847 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
7848 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
7849 #endif
7850 
7851 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
7852 #define POSIX_CC_UNI_NAME(CCNAME) CCNAME
7853 #else
7854 #define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
7855 #endif
7856 
7857 /*
7858    parse a class specification and produce either an ANYOF node that
7859    matches the pattern or if the pattern matches a single char only and
7860    that char is < 256 and we are case insensitive then we produce an
7861    EXACT node instead.
7862 */
7863 
7864 STATIC regnode *
7865 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
7866 {
7867     dVAR;
7868     register UV nextvalue;
7869     register IV prevvalue = OOB_UNICODE;
7870     register IV range = 0;
7871     UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
7872     register regnode *ret;
7873     STRLEN numlen;
7874     IV namedclass;
7875     char *rangebegin = NULL;
7876     bool need_class = 0;
7877     SV *listsv = NULL;
7878     UV n;
7879     bool optimize_invert   = TRUE;
7880     AV* unicode_alternate  = NULL;
7881 #ifdef EBCDIC
7882     UV literal_endpoint = 0;
7883 #endif
7884     UV stored = 0;  /* number of chars stored in the class */
7885 
7886     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7887         case we need to change the emitted regop to an EXACT. */
7888     const char * orig_parse = RExC_parse;
7889     GET_RE_DEBUG_FLAGS_DECL;
7890 
7891     PERL_ARGS_ASSERT_REGCLASS;
7892 #ifndef DEBUGGING
7893     PERL_UNUSED_ARG(depth);
7894 #endif
7895 
7896     DEBUG_PARSE("clas");
7897 
7898     /* Assume we are going to generate an ANYOF node. */
7899     ret = reganode(pRExC_state, ANYOF, 0);
7900 
7901     if (!SIZE_ONLY)
7902 	ANYOF_FLAGS(ret) = 0;
7903 
7904     if (UCHARAT(RExC_parse) == '^') {	/* Complement of range. */
7905 	RExC_naughty++;
7906 	RExC_parse++;
7907 	if (!SIZE_ONLY)
7908 	    ANYOF_FLAGS(ret) |= ANYOF_INVERT;
7909     }
7910 
7911     if (SIZE_ONLY) {
7912 	RExC_size += ANYOF_SKIP;
7913 	listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
7914     }
7915     else {
7916  	RExC_emit += ANYOF_SKIP;
7917 	if (FOLD)
7918 	    ANYOF_FLAGS(ret) |= ANYOF_FOLD;
7919 	if (LOC)
7920 	    ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
7921 	ANYOF_BITMAP_ZERO(ret);
7922 	listsv = newSVpvs("# comment\n");
7923     }
7924 
7925     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7926 
7927     if (!SIZE_ONLY && POSIXCC(nextvalue))
7928 	checkposixcc(pRExC_state);
7929 
7930     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
7931     if (UCHARAT(RExC_parse) == ']')
7932 	goto charclassloop;
7933 
7934 parseit:
7935     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
7936 
7937     charclassloop:
7938 
7939 	namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
7940 
7941 	if (!range)
7942 	    rangebegin = RExC_parse;
7943 	if (UTF) {
7944 	    value = utf8n_to_uvchr((U8*)RExC_parse,
7945 				   RExC_end - RExC_parse,
7946 				   &numlen, UTF8_ALLOW_DEFAULT);
7947 	    RExC_parse += numlen;
7948 	}
7949 	else
7950 	    value = UCHARAT(RExC_parse++);
7951 
7952 	nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7953 	if (value == '[' && POSIXCC(nextvalue))
7954 	    namedclass = regpposixcc(pRExC_state, value);
7955 	else if (value == '\\') {
7956 	    if (UTF) {
7957 		value = utf8n_to_uvchr((U8*)RExC_parse,
7958 				   RExC_end - RExC_parse,
7959 				   &numlen, UTF8_ALLOW_DEFAULT);
7960 		RExC_parse += numlen;
7961 	    }
7962 	    else
7963 		value = UCHARAT(RExC_parse++);
7964 	    /* Some compilers cannot handle switching on 64-bit integer
7965 	     * values, therefore value cannot be an UV.  Yes, this will
7966 	     * be a problem later if we want switch on Unicode.
7967 	     * A similar issue a little bit later when switching on
7968 	     * namedclass. --jhi */
7969 	    switch ((I32)value) {
7970 	    case 'w':	namedclass = ANYOF_ALNUM;	break;
7971 	    case 'W':	namedclass = ANYOF_NALNUM;	break;
7972 	    case 's':	namedclass = ANYOF_SPACE;	break;
7973 	    case 'S':	namedclass = ANYOF_NSPACE;	break;
7974 	    case 'd':	namedclass = ANYOF_DIGIT;	break;
7975 	    case 'D':	namedclass = ANYOF_NDIGIT;	break;
7976 	    case 'v':	namedclass = ANYOF_VERTWS;	break;
7977 	    case 'V':	namedclass = ANYOF_NVERTWS;	break;
7978 	    case 'h':	namedclass = ANYOF_HORIZWS;	break;
7979 	    case 'H':	namedclass = ANYOF_NHORIZWS;	break;
7980             case 'N':  /* Handle \N{NAME} in class */
7981                 {
7982                     /* We only pay attention to the first char of
7983                     multichar strings being returned. I kinda wonder
7984                     if this makes sense as it does change the behaviour
7985                     from earlier versions, OTOH that behaviour was broken
7986                     as well. */
7987                     UV v; /* value is register so we cant & it /grrr */
7988                     if (reg_namedseq(pRExC_state, &v, NULL)) {
7989                         goto parseit;
7990                     }
7991                     value= v;
7992                 }
7993                 break;
7994 	    case 'p':
7995 	    case 'P':
7996 		{
7997 		char *e;
7998 		if (RExC_parse >= RExC_end)
7999 		    vFAIL2("Empty \\%c{}", (U8)value);
8000 		if (*RExC_parse == '{') {
8001 		    const U8 c = (U8)value;
8002 		    e = strchr(RExC_parse++, '}');
8003                     if (!e)
8004                         vFAIL2("Missing right brace on \\%c{}", c);
8005 		    while (isSPACE(UCHARAT(RExC_parse)))
8006 		        RExC_parse++;
8007                     if (e == RExC_parse)
8008                         vFAIL2("Empty \\%c{}", c);
8009 		    n = e - RExC_parse;
8010 		    while (isSPACE(UCHARAT(RExC_parse + n - 1)))
8011 		        n--;
8012 		}
8013 		else {
8014 		    e = RExC_parse;
8015 		    n = 1;
8016 		}
8017 		if (!SIZE_ONLY) {
8018 		    if (UCHARAT(RExC_parse) == '^') {
8019 			 RExC_parse++;
8020 			 n--;
8021 			 value = value == 'p' ? 'P' : 'p'; /* toggle */
8022 			 while (isSPACE(UCHARAT(RExC_parse))) {
8023 			      RExC_parse++;
8024 			      n--;
8025 			 }
8026 		    }
8027 		    Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
8028 			(value=='p' ? '+' : '!'), (int)n, RExC_parse);
8029 		}
8030 		RExC_parse = e + 1;
8031 		ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8032 		namedclass = ANYOF_MAX;  /* no official name, but it's named */
8033 		}
8034 		break;
8035 	    case 'n':	value = '\n';			break;
8036 	    case 'r':	value = '\r';			break;
8037 	    case 't':	value = '\t';			break;
8038 	    case 'f':	value = '\f';			break;
8039 	    case 'b':	value = '\b';			break;
8040 	    case 'e':	value = ASCII_TO_NATIVE('\033');break;
8041 	    case 'a':	value = ASCII_TO_NATIVE('\007');break;
8042 	    case 'x':
8043 		if (*RExC_parse == '{') {
8044                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8045                         | PERL_SCAN_DISALLOW_PREFIX;
8046 		    char * const e = strchr(RExC_parse++, '}');
8047                     if (!e)
8048                         vFAIL("Missing right brace on \\x{}");
8049 
8050 		    numlen = e - RExC_parse;
8051 		    value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8052 		    RExC_parse = e + 1;
8053 		}
8054 		else {
8055                     I32 flags = PERL_SCAN_DISALLOW_PREFIX;
8056 		    numlen = 2;
8057 		    value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8058 		    RExC_parse += numlen;
8059 		}
8060 		if (PL_encoding && value < 0x100)
8061 		    goto recode_encoding;
8062 		break;
8063 	    case 'c':
8064 		value = UCHARAT(RExC_parse++);
8065 		value = toCTRL(value);
8066 		break;
8067 	    case '0': case '1': case '2': case '3': case '4':
8068 	    case '5': case '6': case '7': case '8': case '9':
8069 		{
8070 		    I32 flags = 0;
8071 		    numlen = 3;
8072 		    value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
8073 		    RExC_parse += numlen;
8074 		    if (PL_encoding && value < 0x100)
8075 			goto recode_encoding;
8076 		    break;
8077 		}
8078 	    recode_encoding:
8079 		{
8080 		    SV* enc = PL_encoding;
8081 		    value = reg_recode((const char)(U8)value, &enc);
8082 		    if (!enc && SIZE_ONLY)
8083 			ckWARNreg(RExC_parse,
8084 				  "Invalid escape in the specified encoding");
8085 		    break;
8086 		}
8087 	    default:
8088 		if (!SIZE_ONLY && isALPHA(value))
8089 		    ckWARN2reg(RExC_parse,
8090 			       "Unrecognized escape \\%c in character class passed through",
8091 			       (int)value);
8092 		break;
8093 	    }
8094 	} /* end of \blah */
8095 #ifdef EBCDIC
8096 	else
8097 	    literal_endpoint++;
8098 #endif
8099 
8100 	if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
8101 
8102 	    if (!SIZE_ONLY && !need_class)
8103 		ANYOF_CLASS_ZERO(ret);
8104 
8105 	    need_class = 1;
8106 
8107 	    /* a bad range like a-\d, a-[:digit:] ? */
8108 	    if (range) {
8109 		if (!SIZE_ONLY) {
8110 		    const int w =
8111 			RExC_parse >= rangebegin ?
8112 			RExC_parse - rangebegin : 0;
8113 		    ckWARN4reg(RExC_parse,
8114 			       "False [] range \"%*.*s\"",
8115 			       w, w, rangebegin);
8116 
8117 		    if (prevvalue < 256) {
8118 			ANYOF_BITMAP_SET(ret, prevvalue);
8119 			ANYOF_BITMAP_SET(ret, '-');
8120 		    }
8121 		    else {
8122 			ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8123 			Perl_sv_catpvf(aTHX_ listsv,
8124 				       "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
8125 		    }
8126 		}
8127 
8128 		range = 0; /* this was not a true range */
8129 	    }
8130 
8131 
8132 
8133 	    if (!SIZE_ONLY) {
8134 		const char *what = NULL;
8135 		char yesno = 0;
8136 
8137 	        if (namedclass > OOB_NAMEDCLASS)
8138 		    optimize_invert = FALSE;
8139 		/* Possible truncation here but in some 64-bit environments
8140 		 * the compiler gets heartburn about switch on 64-bit values.
8141 		 * A similar issue a little earlier when switching on value.
8142 		 * --jhi */
8143 		switch ((I32)namedclass) {
8144 
8145 		case _C_C_T_(ALNUMC, isALNUMC(value), POSIX_CC_UNI_NAME("Alnum"));
8146 		case _C_C_T_(ALPHA, isALPHA(value), POSIX_CC_UNI_NAME("Alpha"));
8147 		case _C_C_T_(BLANK, isBLANK(value), POSIX_CC_UNI_NAME("Blank"));
8148 		case _C_C_T_(CNTRL, isCNTRL(value), POSIX_CC_UNI_NAME("Cntrl"));
8149 		case _C_C_T_(GRAPH, isGRAPH(value), POSIX_CC_UNI_NAME("Graph"));
8150 		case _C_C_T_(LOWER, isLOWER(value), POSIX_CC_UNI_NAME("Lower"));
8151 		case _C_C_T_(PRINT, isPRINT(value), POSIX_CC_UNI_NAME("Print"));
8152 		case _C_C_T_(PSXSPC, isPSXSPC(value), POSIX_CC_UNI_NAME("Space"));
8153 		case _C_C_T_(PUNCT, isPUNCT(value), POSIX_CC_UNI_NAME("Punct"));
8154 		case _C_C_T_(UPPER, isUPPER(value), POSIX_CC_UNI_NAME("Upper"));
8155 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
8156 		case _C_C_T_(ALNUM, isALNUM(value), "Word");
8157 		case _C_C_T_(SPACE, isSPACE(value), "SpacePerl");
8158 #else
8159 		case _C_C_T_(SPACE, isSPACE(value), "PerlSpace");
8160 		case _C_C_T_(ALNUM, isALNUM(value), "PerlWord");
8161 #endif
8162 		case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
8163 		case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
8164 		case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
8165 		case ANYOF_ASCII:
8166 		    if (LOC)
8167 			ANYOF_CLASS_SET(ret, ANYOF_ASCII);
8168 		    else {
8169 #ifndef EBCDIC
8170 			for (value = 0; value < 128; value++)
8171 			    ANYOF_BITMAP_SET(ret, value);
8172 #else  /* EBCDIC */
8173 			for (value = 0; value < 256; value++) {
8174 			    if (isASCII(value))
8175 			        ANYOF_BITMAP_SET(ret, value);
8176 			}
8177 #endif /* EBCDIC */
8178 		    }
8179 		    yesno = '+';
8180 		    what = "ASCII";
8181 		    break;
8182 		case ANYOF_NASCII:
8183 		    if (LOC)
8184 			ANYOF_CLASS_SET(ret, ANYOF_NASCII);
8185 		    else {
8186 #ifndef EBCDIC
8187 			for (value = 128; value < 256; value++)
8188 			    ANYOF_BITMAP_SET(ret, value);
8189 #else  /* EBCDIC */
8190 			for (value = 0; value < 256; value++) {
8191 			    if (!isASCII(value))
8192 			        ANYOF_BITMAP_SET(ret, value);
8193 			}
8194 #endif /* EBCDIC */
8195 		    }
8196 		    yesno = '!';
8197 		    what = "ASCII";
8198 		    break;
8199 		case ANYOF_DIGIT:
8200 		    if (LOC)
8201 			ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
8202 		    else {
8203 			/* consecutive digits assumed */
8204 			for (value = '0'; value <= '9'; value++)
8205 			    ANYOF_BITMAP_SET(ret, value);
8206 		    }
8207 		    yesno = '+';
8208 		    what = POSIX_CC_UNI_NAME("Digit");
8209 		    break;
8210 		case ANYOF_NDIGIT:
8211 		    if (LOC)
8212 			ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
8213 		    else {
8214 			/* consecutive digits assumed */
8215 			for (value = 0; value < '0'; value++)
8216 			    ANYOF_BITMAP_SET(ret, value);
8217 			for (value = '9' + 1; value < 256; value++)
8218 			    ANYOF_BITMAP_SET(ret, value);
8219 		    }
8220 		    yesno = '!';
8221 		    what = POSIX_CC_UNI_NAME("Digit");
8222 		    break;
8223 		case ANYOF_MAX:
8224 		    /* this is to handle \p and \P */
8225 		    break;
8226 		default:
8227 		    vFAIL("Invalid [::] class");
8228 		    break;
8229 		}
8230 		if (what) {
8231 		    /* Strings such as "+utf8::isWord\n" */
8232 		    Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
8233 		}
8234 		if (LOC)
8235 		    ANYOF_FLAGS(ret) |= ANYOF_CLASS;
8236 		continue;
8237 	    }
8238 	} /* end of namedclass \blah */
8239 
8240 	if (range) {
8241 	    if (prevvalue > (IV)value) /* b-a */ {
8242 		const int w = RExC_parse - rangebegin;
8243 		Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
8244 		range = 0; /* not a valid range */
8245 	    }
8246 	}
8247 	else {
8248 	    prevvalue = value; /* save the beginning of the range */
8249 	    if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
8250 		RExC_parse[1] != ']') {
8251 		RExC_parse++;
8252 
8253 		/* a bad range like \w-, [:word:]- ? */
8254 		if (namedclass > OOB_NAMEDCLASS) {
8255 		    if (ckWARN(WARN_REGEXP)) {
8256 			const int w =
8257 			    RExC_parse >= rangebegin ?
8258 			    RExC_parse - rangebegin : 0;
8259 			vWARN4(RExC_parse,
8260 			       "False [] range \"%*.*s\"",
8261 			       w, w, rangebegin);
8262 		    }
8263 		    if (!SIZE_ONLY)
8264 			ANYOF_BITMAP_SET(ret, '-');
8265 		} else
8266 		    range = 1;	/* yeah, it's a range! */
8267 		continue;	/* but do it the next time */
8268 	    }
8269 	}
8270 
8271 	/* now is the next time */
8272         /*stored += (value - prevvalue + 1);*/
8273 	if (!SIZE_ONLY) {
8274 	    if (prevvalue < 256) {
8275 	        const IV ceilvalue = value < 256 ? value : 255;
8276 		IV i;
8277 #ifdef EBCDIC
8278 		/* In EBCDIC [\x89-\x91] should include
8279 		 * the \x8e but [i-j] should not. */
8280 		if (literal_endpoint == 2 &&
8281 		    ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
8282 		     (isUPPER(prevvalue) && isUPPER(ceilvalue))))
8283 		{
8284 		    if (isLOWER(prevvalue)) {
8285 			for (i = prevvalue; i <= ceilvalue; i++)
8286 			    if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8287 				stored++;
8288 				ANYOF_BITMAP_SET(ret, i);
8289 			    }
8290 		    } else {
8291 			for (i = prevvalue; i <= ceilvalue; i++)
8292 			    if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8293 				stored++;
8294 				ANYOF_BITMAP_SET(ret, i);
8295 			    }
8296 		    }
8297 		}
8298 		else
8299 #endif
8300 		      for (i = prevvalue; i <= ceilvalue; i++) {
8301 		        if (!ANYOF_BITMAP_TEST(ret,i)) {
8302 		            stored++;
8303 			    ANYOF_BITMAP_SET(ret, i);
8304 		        }
8305 	              }
8306 	  }
8307 	  if (value > 255 || UTF) {
8308 	        const UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
8309 		const UV natvalue      = NATIVE_TO_UNI(value);
8310                 stored+=2; /* can't optimize this class */
8311 		ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8312 		if (prevnatvalue < natvalue) { /* what about > ? */
8313 		    Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
8314 				   prevnatvalue, natvalue);
8315 		}
8316 		else if (prevnatvalue == natvalue) {
8317 		    Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
8318 		    if (FOLD) {
8319 			 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
8320 			 STRLEN foldlen;
8321 			 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
8322 
8323 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */
8324 			 if (RExC_precomp[0] == ':' &&
8325 			     RExC_precomp[1] == '[' &&
8326 			     (f == 0xDF || f == 0x92)) {
8327 			     f = NATIVE_TO_UNI(f);
8328                         }
8329 #endif
8330 			 /* If folding and foldable and a single
8331 			  * character, insert also the folded version
8332 			  * to the charclass. */
8333 			 if (f != value) {
8334 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
8335 			     if ((RExC_precomp[0] == ':' &&
8336 				  RExC_precomp[1] == '[' &&
8337 				  (f == 0xA2 &&
8338 				   (value == 0xFB05 || value == 0xFB06))) ?
8339 				 foldlen == ((STRLEN)UNISKIP(f) - 1) :
8340 				 foldlen == (STRLEN)UNISKIP(f) )
8341 #else
8342 			      if (foldlen == (STRLEN)UNISKIP(f))
8343 #endif
8344 				  Perl_sv_catpvf(aTHX_ listsv,
8345 						 "%04"UVxf"\n", f);
8346 			      else {
8347 				  /* Any multicharacter foldings
8348 				   * require the following transform:
8349 				   * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
8350 				   * where E folds into "pq" and F folds
8351 				   * into "rst", all other characters
8352 				   * fold to single characters.  We save
8353 				   * away these multicharacter foldings,
8354 				   * to be later saved as part of the
8355 				   * additional "s" data. */
8356 				  SV *sv;
8357 
8358 				  if (!unicode_alternate)
8359 				      unicode_alternate = newAV();
8360 				  sv = newSVpvn_utf8((char*)foldbuf, foldlen,
8361 						     TRUE);
8362 				  av_push(unicode_alternate, sv);
8363 			      }
8364 			 }
8365 
8366 			 /* If folding and the value is one of the Greek
8367 			  * sigmas insert a few more sigmas to make the
8368 			  * folding rules of the sigmas to work right.
8369 			  * Note that not all the possible combinations
8370 			  * are handled here: some of them are handled
8371 			  * by the standard folding rules, and some of
8372 			  * them (literal or EXACTF cases) are handled
8373 			  * during runtime in regexec.c:S_find_byclass(). */
8374 			 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
8375 			      Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8376 					     (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
8377 			      Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8378 					     (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8379 			 }
8380 			 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
8381 			      Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8382 					     (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8383 		    }
8384 		}
8385 	    }
8386 #ifdef EBCDIC
8387 	    literal_endpoint = 0;
8388 #endif
8389         }
8390 
8391 	range = 0; /* this range (if it was one) is done now */
8392     }
8393 
8394     if (need_class) {
8395 	ANYOF_FLAGS(ret) |= ANYOF_LARGE;
8396 	if (SIZE_ONLY)
8397 	    RExC_size += ANYOF_CLASS_ADD_SKIP;
8398 	else
8399 	    RExC_emit += ANYOF_CLASS_ADD_SKIP;
8400     }
8401 
8402 
8403     if (SIZE_ONLY)
8404         return ret;
8405     /****** !SIZE_ONLY AFTER HERE *********/
8406 
8407     if( stored == 1 && (value < 128 || (value < 256 && !UTF))
8408         && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
8409     ) {
8410         /* optimize single char class to an EXACT node
8411            but *only* when its not a UTF/high char  */
8412         const char * cur_parse= RExC_parse;
8413         RExC_emit = (regnode *)orig_emit;
8414         RExC_parse = (char *)orig_parse;
8415         ret = reg_node(pRExC_state,
8416                        (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
8417         RExC_parse = (char *)cur_parse;
8418         *STRING(ret)= (char)value;
8419         STR_LEN(ret)= 1;
8420         RExC_emit += STR_SZ(1);
8421 	SvREFCNT_dec(listsv);
8422         return ret;
8423     }
8424     /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
8425     if ( /* If the only flag is folding (plus possibly inversion). */
8426 	((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
8427        ) {
8428 	for (value = 0; value < 256; ++value) {
8429 	    if (ANYOF_BITMAP_TEST(ret, value)) {
8430 		UV fold = PL_fold[value];
8431 
8432 		if (fold != value)
8433 		    ANYOF_BITMAP_SET(ret, fold);
8434 	    }
8435 	}
8436 	ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
8437     }
8438 
8439     /* optimize inverted simple patterns (e.g. [^a-z]) */
8440     if (optimize_invert &&
8441 	/* If the only flag is inversion. */
8442 	(ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) ==	ANYOF_INVERT) {
8443 	for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
8444 	    ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
8445 	ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
8446     }
8447     {
8448 	AV * const av = newAV();
8449 	SV *rv;
8450 	/* The 0th element stores the character class description
8451 	 * in its textual form: used later (regexec.c:Perl_regclass_swash())
8452 	 * to initialize the appropriate swash (which gets stored in
8453 	 * the 1st element), and also useful for dumping the regnode.
8454 	 * The 2nd element stores the multicharacter foldings,
8455 	 * used later (regexec.c:S_reginclass()). */
8456 	av_store(av, 0, listsv);
8457 	av_store(av, 1, NULL);
8458 	av_store(av, 2, MUTABLE_SV(unicode_alternate));
8459 	rv = newRV_noinc(MUTABLE_SV(av));
8460 	n = add_data(pRExC_state, 1, "s");
8461 	RExC_rxi->data->data[n] = (void*)rv;
8462 	ARG_SET(ret, n);
8463     }
8464     return ret;
8465 }
8466 #undef _C_C_T_
8467 
8468 
8469 /* reg_skipcomment()
8470 
8471    Absorbs an /x style # comments from the input stream.
8472    Returns true if there is more text remaining in the stream.
8473    Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
8474    terminates the pattern without including a newline.
8475 
8476    Note its the callers responsibility to ensure that we are
8477    actually in /x mode
8478 
8479 */
8480 
8481 STATIC bool
8482 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
8483 {
8484     bool ended = 0;
8485 
8486     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
8487 
8488     while (RExC_parse < RExC_end)
8489         if (*RExC_parse++ == '\n') {
8490             ended = 1;
8491             break;
8492         }
8493     if (!ended) {
8494         /* we ran off the end of the pattern without ending
8495            the comment, so we have to add an \n when wrapping */
8496         RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
8497         return 0;
8498     } else
8499         return 1;
8500 }
8501 
8502 /* nextchar()
8503 
8504    Advance that parse position, and optionall absorbs
8505    "whitespace" from the inputstream.
8506 
8507    Without /x "whitespace" means (?#...) style comments only,
8508    with /x this means (?#...) and # comments and whitespace proper.
8509 
8510    Returns the RExC_parse point from BEFORE the scan occurs.
8511 
8512    This is the /x friendly way of saying RExC_parse++.
8513 */
8514 
8515 STATIC char*
8516 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
8517 {
8518     char* const retval = RExC_parse++;
8519 
8520     PERL_ARGS_ASSERT_NEXTCHAR;
8521 
8522     for (;;) {
8523 	if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
8524 		RExC_parse[2] == '#') {
8525 	    while (*RExC_parse != ')') {
8526 		if (RExC_parse == RExC_end)
8527 		    FAIL("Sequence (?#... not terminated");
8528 		RExC_parse++;
8529 	    }
8530 	    RExC_parse++;
8531 	    continue;
8532 	}
8533 	if (RExC_flags & RXf_PMf_EXTENDED) {
8534 	    if (isSPACE(*RExC_parse)) {
8535 		RExC_parse++;
8536 		continue;
8537 	    }
8538 	    else if (*RExC_parse == '#') {
8539 	        if ( reg_skipcomment( pRExC_state ) )
8540 	            continue;
8541 	    }
8542 	}
8543 	return retval;
8544     }
8545 }
8546 
8547 /*
8548 - reg_node - emit a node
8549 */
8550 STATIC regnode *			/* Location. */
8551 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
8552 {
8553     dVAR;
8554     register regnode *ptr;
8555     regnode * const ret = RExC_emit;
8556     GET_RE_DEBUG_FLAGS_DECL;
8557 
8558     PERL_ARGS_ASSERT_REG_NODE;
8559 
8560     if (SIZE_ONLY) {
8561 	SIZE_ALIGN(RExC_size);
8562 	RExC_size += 1;
8563 	return(ret);
8564     }
8565     if (RExC_emit >= RExC_emit_bound)
8566         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8567 
8568     NODE_ALIGN_FILL(ret);
8569     ptr = ret;
8570     FILL_ADVANCE_NODE(ptr, op);
8571 #ifdef RE_TRACK_PATTERN_OFFSETS
8572     if (RExC_offsets) {         /* MJD */
8573 	MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
8574               "reg_node", __LINE__,
8575               PL_reg_name[op],
8576               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
8577 		? "Overwriting end of array!\n" : "OK",
8578               (UV)(RExC_emit - RExC_emit_start),
8579               (UV)(RExC_parse - RExC_start),
8580               (UV)RExC_offsets[0]));
8581 	Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
8582     }
8583 #endif
8584     RExC_emit = ptr;
8585     return(ret);
8586 }
8587 
8588 /*
8589 - reganode - emit a node with an argument
8590 */
8591 STATIC regnode *			/* Location. */
8592 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
8593 {
8594     dVAR;
8595     register regnode *ptr;
8596     regnode * const ret = RExC_emit;
8597     GET_RE_DEBUG_FLAGS_DECL;
8598 
8599     PERL_ARGS_ASSERT_REGANODE;
8600 
8601     if (SIZE_ONLY) {
8602 	SIZE_ALIGN(RExC_size);
8603 	RExC_size += 2;
8604 	/*
8605 	   We can't do this:
8606 
8607 	   assert(2==regarglen[op]+1);
8608 
8609 	   Anything larger than this has to allocate the extra amount.
8610 	   If we changed this to be:
8611 
8612 	   RExC_size += (1 + regarglen[op]);
8613 
8614 	   then it wouldn't matter. Its not clear what side effect
8615 	   might come from that so its not done so far.
8616 	   -- dmq
8617 	*/
8618 	return(ret);
8619     }
8620     if (RExC_emit >= RExC_emit_bound)
8621         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8622 
8623     NODE_ALIGN_FILL(ret);
8624     ptr = ret;
8625     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
8626 #ifdef RE_TRACK_PATTERN_OFFSETS
8627     if (RExC_offsets) {         /* MJD */
8628 	MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
8629               "reganode",
8630 	      __LINE__,
8631 	      PL_reg_name[op],
8632               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
8633               "Overwriting end of array!\n" : "OK",
8634               (UV)(RExC_emit - RExC_emit_start),
8635               (UV)(RExC_parse - RExC_start),
8636               (UV)RExC_offsets[0]));
8637 	Set_Cur_Node_Offset;
8638     }
8639 #endif
8640     RExC_emit = ptr;
8641     return(ret);
8642 }
8643 
8644 /*
8645 - reguni - emit (if appropriate) a Unicode character
8646 */
8647 STATIC STRLEN
8648 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
8649 {
8650     dVAR;
8651 
8652     PERL_ARGS_ASSERT_REGUNI;
8653 
8654     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
8655 }
8656 
8657 /*
8658 - reginsert - insert an operator in front of already-emitted operand
8659 *
8660 * Means relocating the operand.
8661 */
8662 STATIC void
8663 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
8664 {
8665     dVAR;
8666     register regnode *src;
8667     register regnode *dst;
8668     register regnode *place;
8669     const int offset = regarglen[(U8)op];
8670     const int size = NODE_STEP_REGNODE + offset;
8671     GET_RE_DEBUG_FLAGS_DECL;
8672 
8673     PERL_ARGS_ASSERT_REGINSERT;
8674     PERL_UNUSED_ARG(depth);
8675 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
8676     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
8677     if (SIZE_ONLY) {
8678 	RExC_size += size;
8679 	return;
8680     }
8681 
8682     src = RExC_emit;
8683     RExC_emit += size;
8684     dst = RExC_emit;
8685     if (RExC_open_parens) {
8686         int paren;
8687         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
8688         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
8689             if ( RExC_open_parens[paren] >= opnd ) {
8690                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
8691                 RExC_open_parens[paren] += size;
8692             } else {
8693                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
8694             }
8695             if ( RExC_close_parens[paren] >= opnd ) {
8696                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
8697                 RExC_close_parens[paren] += size;
8698             } else {
8699                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
8700             }
8701         }
8702     }
8703 
8704     while (src > opnd) {
8705 	StructCopy(--src, --dst, regnode);
8706 #ifdef RE_TRACK_PATTERN_OFFSETS
8707         if (RExC_offsets) {     /* MJD 20010112 */
8708 	    MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
8709                   "reg_insert",
8710 		  __LINE__,
8711 		  PL_reg_name[op],
8712                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
8713 		    ? "Overwriting end of array!\n" : "OK",
8714                   (UV)(src - RExC_emit_start),
8715                   (UV)(dst - RExC_emit_start),
8716                   (UV)RExC_offsets[0]));
8717 	    Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
8718 	    Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
8719         }
8720 #endif
8721     }
8722 
8723 
8724     place = opnd;		/* Op node, where operand used to be. */
8725 #ifdef RE_TRACK_PATTERN_OFFSETS
8726     if (RExC_offsets) {         /* MJD */
8727 	MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
8728               "reginsert",
8729 	      __LINE__,
8730 	      PL_reg_name[op],
8731               (UV)(place - RExC_emit_start) > RExC_offsets[0]
8732               ? "Overwriting end of array!\n" : "OK",
8733               (UV)(place - RExC_emit_start),
8734               (UV)(RExC_parse - RExC_start),
8735               (UV)RExC_offsets[0]));
8736 	Set_Node_Offset(place, RExC_parse);
8737 	Set_Node_Length(place, 1);
8738     }
8739 #endif
8740     src = NEXTOPER(place);
8741     FILL_ADVANCE_NODE(place, op);
8742     Zero(src, offset, regnode);
8743 }
8744 
8745 /*
8746 - regtail - set the next-pointer at the end of a node chain of p to val.
8747 - SEE ALSO: regtail_study
8748 */
8749 /* TODO: All three parms should be const */
8750 STATIC void
8751 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8752 {
8753     dVAR;
8754     register regnode *scan;
8755     GET_RE_DEBUG_FLAGS_DECL;
8756 
8757     PERL_ARGS_ASSERT_REGTAIL;
8758 #ifndef DEBUGGING
8759     PERL_UNUSED_ARG(depth);
8760 #endif
8761 
8762     if (SIZE_ONLY)
8763 	return;
8764 
8765     /* Find last node. */
8766     scan = p;
8767     for (;;) {
8768 	regnode * const temp = regnext(scan);
8769         DEBUG_PARSE_r({
8770             SV * const mysv=sv_newmortal();
8771             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
8772             regprop(RExC_rx, mysv, scan);
8773             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
8774                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
8775                     (temp == NULL ? "->" : ""),
8776                     (temp == NULL ? PL_reg_name[OP(val)] : "")
8777             );
8778         });
8779         if (temp == NULL)
8780             break;
8781         scan = temp;
8782     }
8783 
8784     if (reg_off_by_arg[OP(scan)]) {
8785         ARG_SET(scan, val - scan);
8786     }
8787     else {
8788         NEXT_OFF(scan) = val - scan;
8789     }
8790 }
8791 
8792 #ifdef DEBUGGING
8793 /*
8794 - regtail_study - set the next-pointer at the end of a node chain of p to val.
8795 - Look for optimizable sequences at the same time.
8796 - currently only looks for EXACT chains.
8797 
8798 This is expermental code. The idea is to use this routine to perform
8799 in place optimizations on branches and groups as they are constructed,
8800 with the long term intention of removing optimization from study_chunk so
8801 that it is purely analytical.
8802 
8803 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
8804 to control which is which.
8805 
8806 */
8807 /* TODO: All four parms should be const */
8808 
8809 STATIC U8
8810 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8811 {
8812     dVAR;
8813     register regnode *scan;
8814     U8 exact = PSEUDO;
8815 #ifdef EXPERIMENTAL_INPLACESCAN
8816     I32 min = 0;
8817 #endif
8818     GET_RE_DEBUG_FLAGS_DECL;
8819 
8820     PERL_ARGS_ASSERT_REGTAIL_STUDY;
8821 
8822 
8823     if (SIZE_ONLY)
8824         return exact;
8825 
8826     /* Find last node. */
8827 
8828     scan = p;
8829     for (;;) {
8830         regnode * const temp = regnext(scan);
8831 #ifdef EXPERIMENTAL_INPLACESCAN
8832         if (PL_regkind[OP(scan)] == EXACT)
8833             if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
8834                 return EXACT;
8835 #endif
8836         if ( exact ) {
8837             switch (OP(scan)) {
8838                 case EXACT:
8839                 case EXACTF:
8840                 case EXACTFL:
8841                         if( exact == PSEUDO )
8842                             exact= OP(scan);
8843                         else if ( exact != OP(scan) )
8844                             exact= 0;
8845                 case NOTHING:
8846                     break;
8847                 default:
8848                     exact= 0;
8849             }
8850         }
8851         DEBUG_PARSE_r({
8852             SV * const mysv=sv_newmortal();
8853             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
8854             regprop(RExC_rx, mysv, scan);
8855             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
8856                 SvPV_nolen_const(mysv),
8857                 REG_NODE_NUM(scan),
8858                 PL_reg_name[exact]);
8859         });
8860 	if (temp == NULL)
8861 	    break;
8862 	scan = temp;
8863     }
8864     DEBUG_PARSE_r({
8865         SV * const mysv_val=sv_newmortal();
8866         DEBUG_PARSE_MSG("");
8867         regprop(RExC_rx, mysv_val, val);
8868         PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
8869 		      SvPV_nolen_const(mysv_val),
8870 		      (IV)REG_NODE_NUM(val),
8871 		      (IV)(val - scan)
8872         );
8873     });
8874     if (reg_off_by_arg[OP(scan)]) {
8875 	ARG_SET(scan, val - scan);
8876     }
8877     else {
8878 	NEXT_OFF(scan) = val - scan;
8879     }
8880 
8881     return exact;
8882 }
8883 #endif
8884 
8885 /*
8886  - regcurly - a little FSA that accepts {\d+,?\d*}
8887  */
8888 #ifndef PERL_IN_XSUB_RE
8889 I32
8890 Perl_regcurly(register const char *s)
8891 {
8892     PERL_ARGS_ASSERT_REGCURLY;
8893 
8894     if (*s++ != '{')
8895 	return FALSE;
8896     if (!isDIGIT(*s))
8897 	return FALSE;
8898     while (isDIGIT(*s))
8899 	s++;
8900     if (*s == ',')
8901 	s++;
8902     while (isDIGIT(*s))
8903 	s++;
8904     if (*s != '}')
8905 	return FALSE;
8906     return TRUE;
8907 }
8908 #endif
8909 
8910 /*
8911  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
8912  */
8913 #ifdef DEBUGGING
8914 static void
8915 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
8916 {
8917     int bit;
8918     int set=0;
8919 
8920     for (bit=0; bit<32; bit++) {
8921         if (flags & (1<<bit)) {
8922             if (!set++ && lead)
8923                 PerlIO_printf(Perl_debug_log, "%s",lead);
8924             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
8925         }
8926     }
8927     if (lead)  {
8928         if (set)
8929             PerlIO_printf(Perl_debug_log, "\n");
8930         else
8931             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
8932     }
8933 }
8934 #endif
8935 
8936 void
8937 Perl_regdump(pTHX_ const regexp *r)
8938 {
8939 #ifdef DEBUGGING
8940     dVAR;
8941     SV * const sv = sv_newmortal();
8942     SV *dsv= sv_newmortal();
8943     RXi_GET_DECL(r,ri);
8944     GET_RE_DEBUG_FLAGS_DECL;
8945 
8946     PERL_ARGS_ASSERT_REGDUMP;
8947 
8948     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
8949 
8950     /* Header fields of interest. */
8951     if (r->anchored_substr) {
8952 	RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
8953 	    RE_SV_DUMPLEN(r->anchored_substr), 30);
8954 	PerlIO_printf(Perl_debug_log,
8955 		      "anchored %s%s at %"IVdf" ",
8956 		      s, RE_SV_TAIL(r->anchored_substr),
8957 		      (IV)r->anchored_offset);
8958     } else if (r->anchored_utf8) {
8959 	RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
8960 	    RE_SV_DUMPLEN(r->anchored_utf8), 30);
8961 	PerlIO_printf(Perl_debug_log,
8962 		      "anchored utf8 %s%s at %"IVdf" ",
8963 		      s, RE_SV_TAIL(r->anchored_utf8),
8964 		      (IV)r->anchored_offset);
8965     }
8966     if (r->float_substr) {
8967 	RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
8968 	    RE_SV_DUMPLEN(r->float_substr), 30);
8969 	PerlIO_printf(Perl_debug_log,
8970 		      "floating %s%s at %"IVdf"..%"UVuf" ",
8971 		      s, RE_SV_TAIL(r->float_substr),
8972 		      (IV)r->float_min_offset, (UV)r->float_max_offset);
8973     } else if (r->float_utf8) {
8974 	RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
8975 	    RE_SV_DUMPLEN(r->float_utf8), 30);
8976 	PerlIO_printf(Perl_debug_log,
8977 		      "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
8978 		      s, RE_SV_TAIL(r->float_utf8),
8979 		      (IV)r->float_min_offset, (UV)r->float_max_offset);
8980     }
8981     if (r->check_substr || r->check_utf8)
8982 	PerlIO_printf(Perl_debug_log,
8983 		      (const char *)
8984 		      (r->check_substr == r->float_substr
8985 		       && r->check_utf8 == r->float_utf8
8986 		       ? "(checking floating" : "(checking anchored"));
8987     if (r->extflags & RXf_NOSCAN)
8988 	PerlIO_printf(Perl_debug_log, " noscan");
8989     if (r->extflags & RXf_CHECK_ALL)
8990 	PerlIO_printf(Perl_debug_log, " isall");
8991     if (r->check_substr || r->check_utf8)
8992 	PerlIO_printf(Perl_debug_log, ") ");
8993 
8994     if (ri->regstclass) {
8995 	regprop(r, sv, ri->regstclass);
8996 	PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
8997     }
8998     if (r->extflags & RXf_ANCH) {
8999 	PerlIO_printf(Perl_debug_log, "anchored");
9000 	if (r->extflags & RXf_ANCH_BOL)
9001 	    PerlIO_printf(Perl_debug_log, "(BOL)");
9002 	if (r->extflags & RXf_ANCH_MBOL)
9003 	    PerlIO_printf(Perl_debug_log, "(MBOL)");
9004 	if (r->extflags & RXf_ANCH_SBOL)
9005 	    PerlIO_printf(Perl_debug_log, "(SBOL)");
9006 	if (r->extflags & RXf_ANCH_GPOS)
9007 	    PerlIO_printf(Perl_debug_log, "(GPOS)");
9008 	PerlIO_putc(Perl_debug_log, ' ');
9009     }
9010     if (r->extflags & RXf_GPOS_SEEN)
9011 	PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
9012     if (r->intflags & PREGf_SKIP)
9013 	PerlIO_printf(Perl_debug_log, "plus ");
9014     if (r->intflags & PREGf_IMPLICIT)
9015 	PerlIO_printf(Perl_debug_log, "implicit ");
9016     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
9017     if (r->extflags & RXf_EVAL_SEEN)
9018 	PerlIO_printf(Perl_debug_log, "with eval ");
9019     PerlIO_printf(Perl_debug_log, "\n");
9020     DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
9021 #else
9022     PERL_ARGS_ASSERT_REGDUMP;
9023     PERL_UNUSED_CONTEXT;
9024     PERL_UNUSED_ARG(r);
9025 #endif	/* DEBUGGING */
9026 }
9027 
9028 /*
9029 - regprop - printable representation of opcode
9030 */
9031 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
9032 STMT_START { \
9033         if (do_sep) {                           \
9034             Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
9035             if (flags & ANYOF_INVERT)           \
9036                 /*make sure the invert info is in each */ \
9037                 sv_catpvs(sv, "^");             \
9038             do_sep = 0;                         \
9039         }                                       \
9040 } STMT_END
9041 
9042 void
9043 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
9044 {
9045 #ifdef DEBUGGING
9046     dVAR;
9047     register int k;
9048     RXi_GET_DECL(prog,progi);
9049     GET_RE_DEBUG_FLAGS_DECL;
9050 
9051     PERL_ARGS_ASSERT_REGPROP;
9052 
9053     sv_setpvs(sv, "");
9054 
9055     if (OP(o) > REGNODE_MAX)		/* regnode.type is unsigned */
9056 	/* It would be nice to FAIL() here, but this may be called from
9057 	   regexec.c, and it would be hard to supply pRExC_state. */
9058 	Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
9059     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
9060 
9061     k = PL_regkind[OP(o)];
9062 
9063     if (k == EXACT) {
9064 	sv_catpvs(sv, " ");
9065 	/* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
9066 	 * is a crude hack but it may be the best for now since
9067 	 * we have no flag "this EXACTish node was UTF-8"
9068 	 * --jhi */
9069 	pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
9070 		  PERL_PV_ESCAPE_UNI_DETECT |
9071 		  PERL_PV_PRETTY_ELLIPSES   |
9072 		  PERL_PV_PRETTY_LTGT       |
9073 		  PERL_PV_PRETTY_NOCLEAR
9074 		  );
9075     } else if (k == TRIE) {
9076 	/* print the details of the trie in dumpuntil instead, as
9077 	 * progi->data isn't available here */
9078         const char op = OP(o);
9079         const U32 n = ARG(o);
9080         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
9081                (reg_ac_data *)progi->data->data[n] :
9082                NULL;
9083         const reg_trie_data * const trie
9084 	    = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
9085 
9086         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
9087         DEBUG_TRIE_COMPILE_r(
9088             Perl_sv_catpvf(aTHX_ sv,
9089                 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
9090                 (UV)trie->startstate,
9091                 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
9092                 (UV)trie->wordcount,
9093                 (UV)trie->minlen,
9094                 (UV)trie->maxlen,
9095                 (UV)TRIE_CHARCOUNT(trie),
9096                 (UV)trie->uniquecharcount
9097             )
9098         );
9099         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
9100             int i;
9101             int rangestart = -1;
9102             U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
9103             sv_catpvs(sv, "[");
9104             for (i = 0; i <= 256; i++) {
9105                 if (i < 256 && BITMAP_TEST(bitmap,i)) {
9106                     if (rangestart == -1)
9107                         rangestart = i;
9108                 } else if (rangestart != -1) {
9109                     if (i <= rangestart + 3)
9110                         for (; rangestart < i; rangestart++)
9111                             put_byte(sv, rangestart);
9112                     else {
9113                         put_byte(sv, rangestart);
9114                         sv_catpvs(sv, "-");
9115                         put_byte(sv, i - 1);
9116                     }
9117                     rangestart = -1;
9118                 }
9119             }
9120             sv_catpvs(sv, "]");
9121         }
9122 
9123     } else if (k == CURLY) {
9124 	if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
9125 	    Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
9126 	Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
9127     }
9128     else if (k == WHILEM && o->flags)			/* Ordinal/of */
9129 	Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9130     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
9131 	Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));	/* Parenth number */
9132 	if ( RXp_PAREN_NAMES(prog) ) {
9133             if ( k != REF || OP(o) < NREF) {
9134 	        AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
9135 	        SV **name= av_fetch(list, ARG(o), 0 );
9136 	        if (name)
9137 	            Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9138             }
9139             else {
9140                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
9141                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
9142                 I32 *nums=(I32*)SvPVX(sv_dat);
9143                 SV **name= av_fetch(list, nums[0], 0 );
9144                 I32 n;
9145                 if (name) {
9146                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
9147                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
9148 			   	    (n ? "," : ""), (IV)nums[n]);
9149                     }
9150                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9151                 }
9152             }
9153         }
9154     } else if (k == GOSUB)
9155 	Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o));	/* Paren and offset */
9156     else if (k == VERB) {
9157         if (!o->flags)
9158             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
9159 			   SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
9160     } else if (k == LOGICAL)
9161 	Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);	/* 2: embedded, otherwise 1 */
9162     else if (k == FOLDCHAR)
9163 	Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
9164     else if (k == ANYOF) {
9165 	int i, rangestart = -1;
9166 	const U8 flags = ANYOF_FLAGS(o);
9167 	int do_sep = 0;
9168 
9169 	/* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
9170 	static const char * const anyofs[] = {
9171 	    "\\w",
9172 	    "\\W",
9173 	    "\\s",
9174 	    "\\S",
9175 	    "\\d",
9176 	    "\\D",
9177 	    "[:alnum:]",
9178 	    "[:^alnum:]",
9179 	    "[:alpha:]",
9180 	    "[:^alpha:]",
9181 	    "[:ascii:]",
9182 	    "[:^ascii:]",
9183 	    "[:cntrl:]",
9184 	    "[:^cntrl:]",
9185 	    "[:graph:]",
9186 	    "[:^graph:]",
9187 	    "[:lower:]",
9188 	    "[:^lower:]",
9189 	    "[:print:]",
9190 	    "[:^print:]",
9191 	    "[:punct:]",
9192 	    "[:^punct:]",
9193 	    "[:upper:]",
9194 	    "[:^upper:]",
9195 	    "[:xdigit:]",
9196 	    "[:^xdigit:]",
9197 	    "[:space:]",
9198 	    "[:^space:]",
9199 	    "[:blank:]",
9200 	    "[:^blank:]"
9201 	};
9202 
9203 	if (flags & ANYOF_LOCALE)
9204 	    sv_catpvs(sv, "{loc}");
9205 	if (flags & ANYOF_FOLD)
9206 	    sv_catpvs(sv, "{i}");
9207 	Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
9208 	if (flags & ANYOF_INVERT)
9209 	    sv_catpvs(sv, "^");
9210 
9211 	/* output what the standard cp 0-255 bitmap matches */
9212 	for (i = 0; i <= 256; i++) {
9213 	    if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
9214 		if (rangestart == -1)
9215 		    rangestart = i;
9216 	    } else if (rangestart != -1) {
9217 		if (i <= rangestart + 3)
9218 		    for (; rangestart < i; rangestart++)
9219 			put_byte(sv, rangestart);
9220 		else {
9221 		    put_byte(sv, rangestart);
9222 		    sv_catpvs(sv, "-");
9223 		    put_byte(sv, i - 1);
9224 		}
9225 		do_sep = 1;
9226 		rangestart = -1;
9227 	    }
9228 	}
9229 
9230         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9231         /* output any special charclass tests (used mostly under use locale) */
9232 	if (o->flags & ANYOF_CLASS)
9233 	    for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
9234 		if (ANYOF_CLASS_TEST(o,i)) {
9235 		    sv_catpv(sv, anyofs[i]);
9236 		    do_sep = 1;
9237 		}
9238 
9239         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9240 
9241         /* output information about the unicode matching */
9242 	if (flags & ANYOF_UNICODE)
9243 	    sv_catpvs(sv, "{unicode}");
9244 	else if (flags & ANYOF_UNICODE_ALL)
9245 	    sv_catpvs(sv, "{unicode_all}");
9246 
9247 	{
9248 	    SV *lv;
9249 	    SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
9250 
9251 	    if (lv) {
9252 		if (sw) {
9253 		    U8 s[UTF8_MAXBYTES_CASE+1];
9254 
9255 		    for (i = 0; i <= 256; i++) { /* just the first 256 */
9256 			uvchr_to_utf8(s, i);
9257 
9258 			if (i < 256 && swash_fetch(sw, s, TRUE)) {
9259 			    if (rangestart == -1)
9260 				rangestart = i;
9261 			} else if (rangestart != -1) {
9262 			    if (i <= rangestart + 3)
9263 				for (; rangestart < i; rangestart++) {
9264 				    const U8 * const e = uvchr_to_utf8(s,rangestart);
9265 				    U8 *p;
9266 				    for(p = s; p < e; p++)
9267 					put_byte(sv, *p);
9268 				}
9269 			    else {
9270 				const U8 *e = uvchr_to_utf8(s,rangestart);
9271 				U8 *p;
9272 				for (p = s; p < e; p++)
9273 				    put_byte(sv, *p);
9274 				sv_catpvs(sv, "-");
9275 				e = uvchr_to_utf8(s, i-1);
9276 				for (p = s; p < e; p++)
9277 				    put_byte(sv, *p);
9278 				}
9279 				rangestart = -1;
9280 			    }
9281 			}
9282 
9283 		    sv_catpvs(sv, "..."); /* et cetera */
9284 		}
9285 
9286 		{
9287 		    char *s = savesvpv(lv);
9288 		    char * const origs = s;
9289 
9290 		    while (*s && *s != '\n')
9291 			s++;
9292 
9293 		    if (*s == '\n') {
9294 			const char * const t = ++s;
9295 
9296 			while (*s) {
9297 			    if (*s == '\n')
9298 				*s = ' ';
9299 			    s++;
9300 			}
9301 			if (s[-1] == ' ')
9302 			    s[-1] = 0;
9303 
9304 			sv_catpv(sv, t);
9305 		    }
9306 
9307 		    Safefree(origs);
9308 		}
9309 	    }
9310 	}
9311 
9312 	Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
9313     }
9314     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
9315 	Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
9316 #else
9317     PERL_UNUSED_CONTEXT;
9318     PERL_UNUSED_ARG(sv);
9319     PERL_UNUSED_ARG(o);
9320     PERL_UNUSED_ARG(prog);
9321 #endif	/* DEBUGGING */
9322 }
9323 
9324 SV *
9325 Perl_re_intuit_string(pTHX_ REGEXP * const r)
9326 {				/* Assume that RE_INTUIT is set */
9327     dVAR;
9328     struct regexp *const prog = (struct regexp *)SvANY(r);
9329     GET_RE_DEBUG_FLAGS_DECL;
9330 
9331     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
9332     PERL_UNUSED_CONTEXT;
9333 
9334     DEBUG_COMPILE_r(
9335 	{
9336 	    const char * const s = SvPV_nolen_const(prog->check_substr
9337 		      ? prog->check_substr : prog->check_utf8);
9338 
9339 	    if (!PL_colorset) reginitcolors();
9340 	    PerlIO_printf(Perl_debug_log,
9341 		      "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
9342 		      PL_colors[4],
9343 		      prog->check_substr ? "" : "utf8 ",
9344 		      PL_colors[5],PL_colors[0],
9345 		      s,
9346 		      PL_colors[1],
9347 		      (strlen(s) > 60 ? "..." : ""));
9348 	} );
9349 
9350     return prog->check_substr ? prog->check_substr : prog->check_utf8;
9351 }
9352 
9353 /*
9354    pregfree()
9355 
9356    handles refcounting and freeing the perl core regexp structure. When
9357    it is necessary to actually free the structure the first thing it
9358    does is call the 'free' method of the regexp_engine associated to to
9359    the regexp, allowing the handling of the void *pprivate; member
9360    first. (This routine is not overridable by extensions, which is why
9361    the extensions free is called first.)
9362 
9363    See regdupe and regdupe_internal if you change anything here.
9364 */
9365 #ifndef PERL_IN_XSUB_RE
9366 void
9367 Perl_pregfree(pTHX_ REGEXP *r)
9368 {
9369     SvREFCNT_dec(r);
9370 }
9371 
9372 void
9373 Perl_pregfree2(pTHX_ REGEXP *rx)
9374 {
9375     dVAR;
9376     struct regexp *const r = (struct regexp *)SvANY(rx);
9377     GET_RE_DEBUG_FLAGS_DECL;
9378 
9379     PERL_ARGS_ASSERT_PREGFREE2;
9380 
9381     if (r->mother_re) {
9382         ReREFCNT_dec(r->mother_re);
9383     } else {
9384         CALLREGFREE_PVT(rx); /* free the private data */
9385         SvREFCNT_dec(RXp_PAREN_NAMES(r));
9386     }
9387     if (r->substrs) {
9388         SvREFCNT_dec(r->anchored_substr);
9389         SvREFCNT_dec(r->anchored_utf8);
9390         SvREFCNT_dec(r->float_substr);
9391         SvREFCNT_dec(r->float_utf8);
9392 	Safefree(r->substrs);
9393     }
9394     RX_MATCH_COPY_FREE(rx);
9395 #ifdef PERL_OLD_COPY_ON_WRITE
9396     SvREFCNT_dec(r->saved_copy);
9397 #endif
9398     Safefree(r->offs);
9399 }
9400 
9401 /*  reg_temp_copy()
9402 
9403     This is a hacky workaround to the structural issue of match results
9404     being stored in the regexp structure which is in turn stored in
9405     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
9406     could be PL_curpm in multiple contexts, and could require multiple
9407     result sets being associated with the pattern simultaneously, such
9408     as when doing a recursive match with (??{$qr})
9409 
9410     The solution is to make a lightweight copy of the regexp structure
9411     when a qr// is returned from the code executed by (??{$qr}) this
9412     lightweight copy doesnt actually own any of its data except for
9413     the starp/end and the actual regexp structure itself.
9414 
9415 */
9416 
9417 
9418 REGEXP *
9419 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
9420 {
9421     struct regexp *ret;
9422     struct regexp *const r = (struct regexp *)SvANY(rx);
9423     register const I32 npar = r->nparens+1;
9424 
9425     PERL_ARGS_ASSERT_REG_TEMP_COPY;
9426 
9427     if (!ret_x)
9428 	ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
9429     ret = (struct regexp *)SvANY(ret_x);
9430 
9431     (void)ReREFCNT_inc(rx);
9432     /* We can take advantage of the existing "copied buffer" mechanism in SVs
9433        by pointing directly at the buffer, but flagging that the allocated
9434        space in the copy is zero. As we've just done a struct copy, it's now
9435        a case of zero-ing that, rather than copying the current length.  */
9436     SvPV_set(ret_x, RX_WRAPPED(rx));
9437     SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
9438     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
9439 	   sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
9440     SvLEN_set(ret_x, 0);
9441     SvSTASH_set(ret_x, NULL);
9442     SvMAGIC_set(ret_x, NULL);
9443     Newx(ret->offs, npar, regexp_paren_pair);
9444     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9445     if (r->substrs) {
9446         Newx(ret->substrs, 1, struct reg_substr_data);
9447 	StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9448 
9449 	SvREFCNT_inc_void(ret->anchored_substr);
9450 	SvREFCNT_inc_void(ret->anchored_utf8);
9451 	SvREFCNT_inc_void(ret->float_substr);
9452 	SvREFCNT_inc_void(ret->float_utf8);
9453 
9454 	/* check_substr and check_utf8, if non-NULL, point to either their
9455 	   anchored or float namesakes, and don't hold a second reference.  */
9456     }
9457     RX_MATCH_COPIED_off(ret_x);
9458 #ifdef PERL_OLD_COPY_ON_WRITE
9459     ret->saved_copy = NULL;
9460 #endif
9461     ret->mother_re = rx;
9462 
9463     return ret_x;
9464 }
9465 #endif
9466 
9467 /* regfree_internal()
9468 
9469    Free the private data in a regexp. This is overloadable by
9470    extensions. Perl takes care of the regexp structure in pregfree(),
9471    this covers the *pprivate pointer which technically perldoesnt
9472    know about, however of course we have to handle the
9473    regexp_internal structure when no extension is in use.
9474 
9475    Note this is called before freeing anything in the regexp
9476    structure.
9477  */
9478 
9479 void
9480 Perl_regfree_internal(pTHX_ REGEXP * const rx)
9481 {
9482     dVAR;
9483     struct regexp *const r = (struct regexp *)SvANY(rx);
9484     RXi_GET_DECL(r,ri);
9485     GET_RE_DEBUG_FLAGS_DECL;
9486 
9487     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
9488 
9489     DEBUG_COMPILE_r({
9490 	if (!PL_colorset)
9491 	    reginitcolors();
9492 	{
9493 	    SV *dsv= sv_newmortal();
9494             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
9495                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
9496             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
9497                 PL_colors[4],PL_colors[5],s);
9498         }
9499     });
9500 #ifdef RE_TRACK_PATTERN_OFFSETS
9501     if (ri->u.offsets)
9502         Safefree(ri->u.offsets);             /* 20010421 MJD */
9503 #endif
9504     if (ri->data) {
9505 	int n = ri->data->count;
9506 	PAD* new_comppad = NULL;
9507 	PAD* old_comppad;
9508 	PADOFFSET refcnt;
9509 
9510 	while (--n >= 0) {
9511           /* If you add a ->what type here, update the comment in regcomp.h */
9512 	    switch (ri->data->what[n]) {
9513 	    case 's':
9514 	    case 'S':
9515 	    case 'u':
9516 		SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
9517 		break;
9518 	    case 'f':
9519 		Safefree(ri->data->data[n]);
9520 		break;
9521 	    case 'p':
9522 		new_comppad = MUTABLE_AV(ri->data->data[n]);
9523 		break;
9524 	    case 'o':
9525 		if (new_comppad == NULL)
9526 		    Perl_croak(aTHX_ "panic: pregfree comppad");
9527 		PAD_SAVE_LOCAL(old_comppad,
9528 		    /* Watch out for global destruction's random ordering. */
9529 		    (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
9530 		);
9531 		OP_REFCNT_LOCK;
9532 		refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
9533 		OP_REFCNT_UNLOCK;
9534 		if (!refcnt)
9535                     op_free((OP_4tree*)ri->data->data[n]);
9536 
9537 		PAD_RESTORE_LOCAL(old_comppad);
9538 		SvREFCNT_dec(MUTABLE_SV(new_comppad));
9539 		new_comppad = NULL;
9540 		break;
9541 	    case 'n':
9542 	        break;
9543             case 'T':
9544                 { /* Aho Corasick add-on structure for a trie node.
9545                      Used in stclass optimization only */
9546                     U32 refcount;
9547                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
9548                     OP_REFCNT_LOCK;
9549                     refcount = --aho->refcount;
9550                     OP_REFCNT_UNLOCK;
9551                     if ( !refcount ) {
9552                         PerlMemShared_free(aho->states);
9553                         PerlMemShared_free(aho->fail);
9554 			 /* do this last!!!! */
9555                         PerlMemShared_free(ri->data->data[n]);
9556                         PerlMemShared_free(ri->regstclass);
9557                     }
9558                 }
9559                 break;
9560 	    case 't':
9561 	        {
9562 	            /* trie structure. */
9563 	            U32 refcount;
9564 	            reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
9565                     OP_REFCNT_LOCK;
9566                     refcount = --trie->refcount;
9567                     OP_REFCNT_UNLOCK;
9568                     if ( !refcount ) {
9569                         PerlMemShared_free(trie->charmap);
9570                         PerlMemShared_free(trie->states);
9571                         PerlMemShared_free(trie->trans);
9572                         if (trie->bitmap)
9573                             PerlMemShared_free(trie->bitmap);
9574                         if (trie->wordlen)
9575                             PerlMemShared_free(trie->wordlen);
9576                         if (trie->jump)
9577                             PerlMemShared_free(trie->jump);
9578                         if (trie->nextword)
9579                             PerlMemShared_free(trie->nextword);
9580                         /* do this last!!!! */
9581                         PerlMemShared_free(ri->data->data[n]);
9582 		    }
9583 		}
9584 		break;
9585 	    default:
9586 		Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
9587 	    }
9588 	}
9589 	Safefree(ri->data->what);
9590 	Safefree(ri->data);
9591     }
9592 
9593     Safefree(ri);
9594 }
9595 
9596 #define sv_dup_inc(s,t)	SvREFCNT_inc(sv_dup(s,t))
9597 #define av_dup_inc(s,t)	MUTABLE_AV(SvREFCNT_inc(sv_dup((const SV *)s,t)))
9598 #define hv_dup_inc(s,t)	MUTABLE_HV(SvREFCNT_inc(sv_dup((const SV *)s,t)))
9599 #define SAVEPVN(p,n)	((p) ? savepvn(p,n) : NULL)
9600 
9601 /*
9602    re_dup - duplicate a regexp.
9603 
9604    This routine is expected to clone a given regexp structure. It is only
9605    compiled under USE_ITHREADS.
9606 
9607    After all of the core data stored in struct regexp is duplicated
9608    the regexp_engine.dupe method is used to copy any private data
9609    stored in the *pprivate pointer. This allows extensions to handle
9610    any duplication it needs to do.
9611 
9612    See pregfree() and regfree_internal() if you change anything here.
9613 */
9614 #if defined(USE_ITHREADS)
9615 #ifndef PERL_IN_XSUB_RE
9616 void
9617 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
9618 {
9619     dVAR;
9620     I32 npar;
9621     const struct regexp *r = (const struct regexp *)SvANY(sstr);
9622     struct regexp *ret = (struct regexp *)SvANY(dstr);
9623 
9624     PERL_ARGS_ASSERT_RE_DUP_GUTS;
9625 
9626     npar = r->nparens+1;
9627     Newx(ret->offs, npar, regexp_paren_pair);
9628     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9629     if(ret->swap) {
9630         /* no need to copy these */
9631         Newx(ret->swap, npar, regexp_paren_pair);
9632     }
9633 
9634     if (ret->substrs) {
9635 	/* Do it this way to avoid reading from *r after the StructCopy().
9636 	   That way, if any of the sv_dup_inc()s dislodge *r from the L1
9637 	   cache, it doesn't matter.  */
9638 	const bool anchored = r->check_substr
9639 	    ? r->check_substr == r->anchored_substr
9640 	    : r->check_utf8 == r->anchored_utf8;
9641         Newx(ret->substrs, 1, struct reg_substr_data);
9642 	StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9643 
9644 	ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
9645 	ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
9646 	ret->float_substr = sv_dup_inc(ret->float_substr, param);
9647 	ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
9648 
9649 	/* check_substr and check_utf8, if non-NULL, point to either their
9650 	   anchored or float namesakes, and don't hold a second reference.  */
9651 
9652 	if (ret->check_substr) {
9653 	    if (anchored) {
9654 		assert(r->check_utf8 == r->anchored_utf8);
9655 		ret->check_substr = ret->anchored_substr;
9656 		ret->check_utf8 = ret->anchored_utf8;
9657 	    } else {
9658 		assert(r->check_substr == r->float_substr);
9659 		assert(r->check_utf8 == r->float_utf8);
9660 		ret->check_substr = ret->float_substr;
9661 		ret->check_utf8 = ret->float_utf8;
9662 	    }
9663 	} else if (ret->check_utf8) {
9664 	    if (anchored) {
9665 		ret->check_utf8 = ret->anchored_utf8;
9666 	    } else {
9667 		ret->check_utf8 = ret->float_utf8;
9668 	    }
9669 	}
9670     }
9671 
9672     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
9673 
9674     if (ret->pprivate)
9675 	RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
9676 
9677     if (RX_MATCH_COPIED(dstr))
9678 	ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
9679     else
9680 	ret->subbeg = NULL;
9681 #ifdef PERL_OLD_COPY_ON_WRITE
9682     ret->saved_copy = NULL;
9683 #endif
9684 
9685     if (ret->mother_re) {
9686 	if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
9687 	    /* Our storage points directly to our mother regexp, but that's
9688 	       1: a buffer in a different thread
9689 	       2: something we no longer hold a reference on
9690 	       so we need to copy it locally.  */
9691 	    /* Note we need to sue SvCUR() on our mother_re, because it, in
9692 	       turn, may well be pointing to its own mother_re.  */
9693 	    SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
9694 				   SvCUR(ret->mother_re)+1));
9695 	    SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
9696 	}
9697 	ret->mother_re      = NULL;
9698     }
9699     ret->gofs = 0;
9700 }
9701 #endif /* PERL_IN_XSUB_RE */
9702 
9703 /*
9704    regdupe_internal()
9705 
9706    This is the internal complement to regdupe() which is used to copy
9707    the structure pointed to by the *pprivate pointer in the regexp.
9708    This is the core version of the extension overridable cloning hook.
9709    The regexp structure being duplicated will be copied by perl prior
9710    to this and will be provided as the regexp *r argument, however
9711    with the /old/ structures pprivate pointer value. Thus this routine
9712    may override any copying normally done by perl.
9713 
9714    It returns a pointer to the new regexp_internal structure.
9715 */
9716 
9717 void *
9718 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
9719 {
9720     dVAR;
9721     struct regexp *const r = (struct regexp *)SvANY(rx);
9722     regexp_internal *reti;
9723     int len, npar;
9724     RXi_GET_DECL(r,ri);
9725 
9726     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
9727 
9728     npar = r->nparens+1;
9729     len = ProgLen(ri);
9730 
9731     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
9732     Copy(ri->program, reti->program, len+1, regnode);
9733 
9734 
9735     reti->regstclass = NULL;
9736 
9737     if (ri->data) {
9738 	struct reg_data *d;
9739         const int count = ri->data->count;
9740 	int i;
9741 
9742 	Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
9743 		char, struct reg_data);
9744 	Newx(d->what, count, U8);
9745 
9746 	d->count = count;
9747 	for (i = 0; i < count; i++) {
9748 	    d->what[i] = ri->data->what[i];
9749 	    switch (d->what[i]) {
9750 	        /* legal options are one of: sSfpontTu
9751 	           see also regcomp.h and pregfree() */
9752 	    case 's':
9753 	    case 'S':
9754 	    case 'p': /* actually an AV, but the dup function is identical.  */
9755 	    case 'u': /* actually an HV, but the dup function is identical.  */
9756 		d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
9757 		break;
9758 	    case 'f':
9759 		/* This is cheating. */
9760 		Newx(d->data[i], 1, struct regnode_charclass_class);
9761 		StructCopy(ri->data->data[i], d->data[i],
9762 			    struct regnode_charclass_class);
9763 		reti->regstclass = (regnode*)d->data[i];
9764 		break;
9765 	    case 'o':
9766 		/* Compiled op trees are readonly and in shared memory,
9767 		   and can thus be shared without duplication. */
9768 		OP_REFCNT_LOCK;
9769 		d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
9770 		OP_REFCNT_UNLOCK;
9771 		break;
9772 	    case 'T':
9773 		/* Trie stclasses are readonly and can thus be shared
9774 		 * without duplication. We free the stclass in pregfree
9775 		 * when the corresponding reg_ac_data struct is freed.
9776 		 */
9777 		reti->regstclass= ri->regstclass;
9778 		/* Fall through */
9779 	    case 't':
9780 		OP_REFCNT_LOCK;
9781 		((reg_trie_data*)ri->data->data[i])->refcount++;
9782 		OP_REFCNT_UNLOCK;
9783 		/* Fall through */
9784 	    case 'n':
9785 		d->data[i] = ri->data->data[i];
9786 		break;
9787             default:
9788 		Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
9789 	    }
9790 	}
9791 
9792 	reti->data = d;
9793     }
9794     else
9795 	reti->data = NULL;
9796 
9797     reti->name_list_idx = ri->name_list_idx;
9798 
9799 #ifdef RE_TRACK_PATTERN_OFFSETS
9800     if (ri->u.offsets) {
9801         Newx(reti->u.offsets, 2*len+1, U32);
9802         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
9803     }
9804 #else
9805     SetProgLen(reti,len);
9806 #endif
9807 
9808     return (void*)reti;
9809 }
9810 
9811 #endif    /* USE_ITHREADS */
9812 
9813 #ifndef PERL_IN_XSUB_RE
9814 
9815 /*
9816  - regnext - dig the "next" pointer out of a node
9817  */
9818 regnode *
9819 Perl_regnext(pTHX_ register regnode *p)
9820 {
9821     dVAR;
9822     register I32 offset;
9823 
9824     if (!p)
9825 	return(NULL);
9826 
9827     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
9828     if (offset == 0)
9829 	return(NULL);
9830 
9831     return(p+offset);
9832 }
9833 #endif
9834 
9835 STATIC void
9836 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
9837 {
9838     va_list args;
9839     STRLEN l1 = strlen(pat1);
9840     STRLEN l2 = strlen(pat2);
9841     char buf[512];
9842     SV *msv;
9843     const char *message;
9844 
9845     PERL_ARGS_ASSERT_RE_CROAK2;
9846 
9847     if (l1 > 510)
9848 	l1 = 510;
9849     if (l1 + l2 > 510)
9850 	l2 = 510 - l1;
9851     Copy(pat1, buf, l1 , char);
9852     Copy(pat2, buf + l1, l2 , char);
9853     buf[l1 + l2] = '\n';
9854     buf[l1 + l2 + 1] = '\0';
9855 #ifdef I_STDARG
9856     /* ANSI variant takes additional second argument */
9857     va_start(args, pat2);
9858 #else
9859     va_start(args);
9860 #endif
9861     msv = vmess(buf, &args);
9862     va_end(args);
9863     message = SvPV_const(msv,l1);
9864     if (l1 > 512)
9865 	l1 = 512;
9866     Copy(message, buf, l1 , char);
9867     buf[l1-1] = '\0';			/* Overwrite \n */
9868     Perl_croak(aTHX_ "%s", buf);
9869 }
9870 
9871 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
9872 
9873 #ifndef PERL_IN_XSUB_RE
9874 void
9875 Perl_save_re_context(pTHX)
9876 {
9877     dVAR;
9878 
9879     struct re_save_state *state;
9880 
9881     SAVEVPTR(PL_curcop);
9882     SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
9883 
9884     state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
9885     PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
9886     SSPUSHINT(SAVEt_RE_STATE);
9887 
9888     Copy(&PL_reg_state, state, 1, struct re_save_state);
9889 
9890     PL_reg_start_tmp = 0;
9891     PL_reg_start_tmpl = 0;
9892     PL_reg_oldsaved = NULL;
9893     PL_reg_oldsavedlen = 0;
9894     PL_reg_maxiter = 0;
9895     PL_reg_leftiter = 0;
9896     PL_reg_poscache = NULL;
9897     PL_reg_poscache_size = 0;
9898 #ifdef PERL_OLD_COPY_ON_WRITE
9899     PL_nrs = NULL;
9900 #endif
9901 
9902     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
9903     if (PL_curpm) {
9904 	const REGEXP * const rx = PM_GETRE(PL_curpm);
9905 	if (rx) {
9906 	    U32 i;
9907 	    for (i = 1; i <= RX_NPARENS(rx); i++) {
9908 		char digits[TYPE_CHARS(long)];
9909 		const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
9910 		GV *const *const gvp
9911 		    = (GV**)hv_fetch(PL_defstash, digits, len, 0);
9912 
9913 		if (gvp) {
9914 		    GV * const gv = *gvp;
9915 		    if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
9916 			save_scalar(gv);
9917 		}
9918 	    }
9919 	}
9920     }
9921 }
9922 #endif
9923 
9924 static void
9925 clear_re(pTHX_ void *r)
9926 {
9927     dVAR;
9928     ReREFCNT_dec((REGEXP *)r);
9929 }
9930 
9931 #ifdef DEBUGGING
9932 
9933 STATIC void
9934 S_put_byte(pTHX_ SV *sv, int c)
9935 {
9936     PERL_ARGS_ASSERT_PUT_BYTE;
9937 
9938     /* Our definition of isPRINT() ignores locales, so only bytes that are
9939        not part of UTF-8 are considered printable. I assume that the same
9940        holds for UTF-EBCDIC.
9941        Also, code point 255 is not printable in either (it's E0 in EBCDIC,
9942        which Wikipedia says:
9943 
9944        EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
9945        ones (binary 1111 1111, hexadecimal FF). It is similar, but not
9946        identical, to the ASCII delete (DEL) or rubout control character.
9947        ) So the old condition can be simplified to !isPRINT(c)  */
9948     if (!isPRINT(c))
9949 	Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
9950     else {
9951 	const char string = c;
9952 	if (c == '-' || c == ']' || c == '\\' || c == '^')
9953 	    sv_catpvs(sv, "\\");
9954 	sv_catpvn(sv, &string, 1);
9955     }
9956 }
9957 
9958 
9959 #define CLEAR_OPTSTART \
9960     if (optstart) STMT_START { \
9961 	    DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
9962 	    optstart=NULL; \
9963     } STMT_END
9964 
9965 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
9966 
9967 STATIC const regnode *
9968 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
9969 	    const regnode *last, const regnode *plast,
9970 	    SV* sv, I32 indent, U32 depth)
9971 {
9972     dVAR;
9973     register U8 op = PSEUDO;	/* Arbitrary non-END op. */
9974     register const regnode *next;
9975     const regnode *optstart= NULL;
9976 
9977     RXi_GET_DECL(r,ri);
9978     GET_RE_DEBUG_FLAGS_DECL;
9979 
9980     PERL_ARGS_ASSERT_DUMPUNTIL;
9981 
9982 #ifdef DEBUG_DUMPUNTIL
9983     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
9984         last ? last-start : 0,plast ? plast-start : 0);
9985 #endif
9986 
9987     if (plast && plast < last)
9988         last= plast;
9989 
9990     while (PL_regkind[op] != END && (!last || node < last)) {
9991 	/* While that wasn't END last time... */
9992 	NODE_ALIGN(node);
9993 	op = OP(node);
9994 	if (op == CLOSE || op == WHILEM)
9995 	    indent--;
9996 	next = regnext((regnode *)node);
9997 
9998 	/* Where, what. */
9999 	if (OP(node) == OPTIMIZED) {
10000 	    if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
10001 	        optstart = node;
10002 	    else
10003 		goto after_print;
10004 	} else
10005 	    CLEAR_OPTSTART;
10006 
10007 	regprop(r, sv, node);
10008 	PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
10009 		      (int)(2*indent + 1), "", SvPVX_const(sv));
10010 
10011         if (OP(node) != OPTIMIZED) {
10012             if (next == NULL)		/* Next ptr. */
10013                 PerlIO_printf(Perl_debug_log, " (0)");
10014             else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
10015                 PerlIO_printf(Perl_debug_log, " (FAIL)");
10016             else
10017                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
10018             (void)PerlIO_putc(Perl_debug_log, '\n');
10019         }
10020 
10021       after_print:
10022 	if (PL_regkind[(U8)op] == BRANCHJ) {
10023 	    assert(next);
10024 	    {
10025                 register const regnode *nnode = (OP(next) == LONGJMP
10026 					     ? regnext((regnode *)next)
10027 					     : next);
10028                 if (last && nnode > last)
10029                     nnode = last;
10030                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
10031 	    }
10032 	}
10033 	else if (PL_regkind[(U8)op] == BRANCH) {
10034 	    assert(next);
10035 	    DUMPUNTIL(NEXTOPER(node), next);
10036 	}
10037 	else if ( PL_regkind[(U8)op]  == TRIE ) {
10038 	    const regnode *this_trie = node;
10039 	    const char op = OP(node);
10040             const U32 n = ARG(node);
10041 	    const reg_ac_data * const ac = op>=AHOCORASICK ?
10042                (reg_ac_data *)ri->data->data[n] :
10043                NULL;
10044 	    const reg_trie_data * const trie =
10045 	        (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
10046 #ifdef DEBUGGING
10047 	    AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
10048 #endif
10049 	    const regnode *nextbranch= NULL;
10050 	    I32 word_idx;
10051             sv_setpvs(sv, "");
10052 	    for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
10053 		SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
10054 
10055                 PerlIO_printf(Perl_debug_log, "%*s%s ",
10056                    (int)(2*(indent+3)), "",
10057                     elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
10058 	                    PL_colors[0], PL_colors[1],
10059 	                    (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
10060 	                    PERL_PV_PRETTY_ELLIPSES    |
10061 	                    PERL_PV_PRETTY_LTGT
10062                             )
10063                             : "???"
10064                 );
10065                 if (trie->jump) {
10066                     U16 dist= trie->jump[word_idx+1];
10067 		    PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
10068 				  (UV)((dist ? this_trie + dist : next) - start));
10069                     if (dist) {
10070                         if (!nextbranch)
10071                             nextbranch= this_trie + trie->jump[0];
10072 			DUMPUNTIL(this_trie + dist, nextbranch);
10073                     }
10074                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
10075                         nextbranch= regnext((regnode *)nextbranch);
10076                 } else {
10077                     PerlIO_printf(Perl_debug_log, "\n");
10078 		}
10079 	    }
10080 	    if (last && next > last)
10081 	        node= last;
10082 	    else
10083 	        node= next;
10084 	}
10085 	else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
10086 	    DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
10087                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
10088 	}
10089 	else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
10090 	    assert(next);
10091 	    DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
10092 	}
10093 	else if ( op == PLUS || op == STAR) {
10094 	    DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
10095 	}
10096 	else if (op == ANYOF) {
10097 	    /* arglen 1 + class block */
10098 	    node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
10099 		    ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
10100 	    node = NEXTOPER(node);
10101 	}
10102 	else if (PL_regkind[(U8)op] == EXACT) {
10103             /* Literal string, where present. */
10104 	    node += NODE_SZ_STR(node) - 1;
10105 	    node = NEXTOPER(node);
10106 	}
10107 	else {
10108 	    node = NEXTOPER(node);
10109 	    node += regarglen[(U8)op];
10110 	}
10111 	if (op == CURLYX || op == OPEN)
10112 	    indent++;
10113     }
10114     CLEAR_OPTSTART;
10115 #ifdef DEBUG_DUMPUNTIL
10116     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
10117 #endif
10118     return node;
10119 }
10120 
10121 #endif	/* DEBUGGING */
10122 
10123 /*
10124  * Local variables:
10125  * c-indentation-style: bsd
10126  * c-basic-offset: 4
10127  * indent-tabs-mode: t
10128  * End:
10129  *
10130  * ex: set ts=8 sts=4 sw=4 noet:
10131  */
10132