xref: /openbsd-src/gnu/usr.bin/perl/regcomp.c (revision 43003dfe3ad45d1698bed8a37f2b0f5b14f20d4f)
1 /*    regcomp.c
2  */
3 
4 /*
5  * "A fair jaw-cracker dwarf-language must be."  --Samwise Gamgee
6  */
7 
8 /* This file contains functions for compiling a regular expression.  See
9  * also regexec.c which funnily enough, contains functions for executing
10  * a regular expression.
11  *
12  * This file is also copied at build time to ext/re/re_comp.c, where
13  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14  * This causes the main functions to be compiled under new names and with
15  * debugging support added, which makes "use re 'debug'" work.
16  */
17 
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19  * confused with the original package (see point 3 below).  Thanks, Henry!
20  */
21 
22 /* Additional note: this code is very heavily munged from Henry's version
23  * in places.  In some spots I've traded clarity for efficiency, so don't
24  * blame Henry for some of the lack of readability.
25  */
26 
27 /* The names of the functions have been changed from regcomp and
28  * regexec to  pregcomp and pregexec in order to avoid conflicts
29  * with the POSIX routines of the same names.
30 */
31 
32 #ifdef PERL_EXT_RE_BUILD
33 #include "re_top.h"
34 #endif
35 
36 /*
37  * pregcomp and pregexec -- regsub and regerror are not used in perl
38  *
39  *	Copyright (c) 1986 by University of Toronto.
40  *	Written by Henry Spencer.  Not derived from licensed software.
41  *
42  *	Permission is granted to anyone to use this software for any
43  *	purpose on any computer system, and to redistribute it freely,
44  *	subject to the following restrictions:
45  *
46  *	1. The author is not responsible for the consequences of use of
47  *		this software, no matter how awful, even if they arise
48  *		from defects in it.
49  *
50  *	2. The origin of this software must not be misrepresented, either
51  *		by explicit claim or by omission.
52  *
53  *	3. Altered versions must be plainly marked as such, and must not
54  *		be misrepresented as being the original software.
55  *
56  *
57  ****    Alterations to Henry's code are...
58  ****
59  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
60  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007 by Larry Wall and others
61  ****
62  ****    You may distribute under the terms of either the GNU General Public
63  ****    License or the Artistic License, as specified in the README file.
64 
65  *
66  * Beware that some of this code is subtly aware of the way operator
67  * precedence is structured in regular expressions.  Serious changes in
68  * regular-expression syntax might require a total rethink.
69  */
70 #include "EXTERN.h"
71 #define PERL_IN_REGCOMP_C
72 #include "perl.h"
73 
74 #ifndef PERL_IN_XSUB_RE
75 #  include "INTERN.h"
76 #endif
77 
78 #define REG_COMP_C
79 #ifdef PERL_IN_XSUB_RE
80 #  include "re_comp.h"
81 #else
82 #  include "regcomp.h"
83 #endif
84 
85 #ifdef op
86 #undef op
87 #endif /* op */
88 
89 #ifdef MSDOS
90 #  if defined(BUGGY_MSC6)
91  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
92 #    pragma optimize("a",off)
93  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
94 #    pragma optimize("w",on )
95 #  endif /* BUGGY_MSC6 */
96 #endif /* MSDOS */
97 
98 #ifndef STATIC
99 #define	STATIC	static
100 #endif
101 
102 typedef struct RExC_state_t {
103     U32		flags;			/* are we folding, multilining? */
104     char	*precomp;		/* uncompiled string. */
105     regexp	*rx;                    /* perl core regexp structure */
106     regexp_internal	*rxi;           /* internal data for regexp object pprivate field */
107     char	*start;			/* Start of input for compile */
108     char	*end;			/* End of input for compile */
109     char	*parse;			/* Input-scan pointer. */
110     I32		whilem_seen;		/* number of WHILEM in this expr */
111     regnode	*emit_start;		/* Start of emitted-code area */
112     regnode	*emit_bound;		/* First regnode outside of the allocated space */
113     regnode	*emit;			/* Code-emit pointer; &regdummy = don't = compiling */
114     I32		naughty;		/* How bad is this pattern? */
115     I32		sawback;		/* Did we see \1, ...? */
116     U32		seen;
117     I32		size;			/* Code size. */
118     I32		npar;			/* Capture buffer count, (OPEN). */
119     I32		cpar;			/* Capture buffer count, (CLOSE). */
120     I32		nestroot;		/* root parens we are in - used by accept */
121     I32		extralen;
122     I32		seen_zerolen;
123     I32		seen_evals;
124     regnode	**open_parens;		/* pointers to open parens */
125     regnode	**close_parens;		/* pointers to close parens */
126     regnode	*opend;			/* END node in program */
127     I32		utf8;		/* whether the pattern is utf8 or not */
128     I32		orig_utf8;	/* whether the pattern was originally in utf8 */
129 				/* XXX use this for future optimisation of case
130 				 * where pattern must be upgraded to utf8. */
131     HV		*charnames;		/* cache of named sequences */
132     HV		*paren_names;		/* Paren names */
133 
134     regnode	**recurse;		/* Recurse regops */
135     I32		recurse_count;		/* Number of recurse regops */
136 #if ADD_TO_REGEXEC
137     char 	*starttry;		/* -Dr: where regtry was called. */
138 #define RExC_starttry	(pRExC_state->starttry)
139 #endif
140 #ifdef DEBUGGING
141     const char  *lastparse;
142     I32         lastnum;
143     AV          *paren_name_list;       /* idx -> name */
144 #define RExC_lastparse	(pRExC_state->lastparse)
145 #define RExC_lastnum	(pRExC_state->lastnum)
146 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
147 #endif
148 } RExC_state_t;
149 
150 #define RExC_flags	(pRExC_state->flags)
151 #define RExC_precomp	(pRExC_state->precomp)
152 #define RExC_rx		(pRExC_state->rx)
153 #define RExC_rxi	(pRExC_state->rxi)
154 #define RExC_start	(pRExC_state->start)
155 #define RExC_end	(pRExC_state->end)
156 #define RExC_parse	(pRExC_state->parse)
157 #define RExC_whilem_seen	(pRExC_state->whilem_seen)
158 #ifdef RE_TRACK_PATTERN_OFFSETS
159 #define RExC_offsets	(pRExC_state->rxi->u.offsets) /* I am not like the others */
160 #endif
161 #define RExC_emit	(pRExC_state->emit)
162 #define RExC_emit_start	(pRExC_state->emit_start)
163 #define RExC_emit_bound	(pRExC_state->emit_bound)
164 #define RExC_naughty	(pRExC_state->naughty)
165 #define RExC_sawback	(pRExC_state->sawback)
166 #define RExC_seen	(pRExC_state->seen)
167 #define RExC_size	(pRExC_state->size)
168 #define RExC_npar	(pRExC_state->npar)
169 #define RExC_nestroot   (pRExC_state->nestroot)
170 #define RExC_extralen	(pRExC_state->extralen)
171 #define RExC_seen_zerolen	(pRExC_state->seen_zerolen)
172 #define RExC_seen_evals	(pRExC_state->seen_evals)
173 #define RExC_utf8	(pRExC_state->utf8)
174 #define RExC_orig_utf8	(pRExC_state->orig_utf8)
175 #define RExC_charnames  (pRExC_state->charnames)
176 #define RExC_open_parens	(pRExC_state->open_parens)
177 #define RExC_close_parens	(pRExC_state->close_parens)
178 #define RExC_opend	(pRExC_state->opend)
179 #define RExC_paren_names	(pRExC_state->paren_names)
180 #define RExC_recurse	(pRExC_state->recurse)
181 #define RExC_recurse_count	(pRExC_state->recurse_count)
182 
183 
184 #define	ISMULT1(c)	((c) == '*' || (c) == '+' || (c) == '?')
185 #define	ISMULT2(s)	((*s) == '*' || (*s) == '+' || (*s) == '?' || \
186 	((*s) == '{' && regcurly(s)))
187 
188 #ifdef SPSTART
189 #undef SPSTART		/* dratted cpp namespace... */
190 #endif
191 /*
192  * Flags to be passed up and down.
193  */
194 #define	WORST		0	/* Worst case. */
195 #define	HASWIDTH	0x01	/* Known to match non-null strings. */
196 #define	SIMPLE		0x02	/* Simple enough to be STAR/PLUS operand. */
197 #define	SPSTART		0x04	/* Starts with * or +. */
198 #define TRYAGAIN	0x08	/* Weeded out a declaration. */
199 #define POSTPONED	0x10    /* (?1),(?&name), (??{...}) or similar */
200 
201 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
202 
203 /* whether trie related optimizations are enabled */
204 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
205 #define TRIE_STUDY_OPT
206 #define FULL_TRIE_STUDY
207 #define TRIE_STCLASS
208 #endif
209 
210 
211 
212 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
213 #define PBITVAL(paren) (1 << ((paren) & 7))
214 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
215 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
216 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
217 
218 
219 /* About scan_data_t.
220 
221   During optimisation we recurse through the regexp program performing
222   various inplace (keyhole style) optimisations. In addition study_chunk
223   and scan_commit populate this data structure with information about
224   what strings MUST appear in the pattern. We look for the longest
225   string that must appear for at a fixed location, and we look for the
226   longest string that may appear at a floating location. So for instance
227   in the pattern:
228 
229     /FOO[xX]A.*B[xX]BAR/
230 
231   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
232   strings (because they follow a .* construct). study_chunk will identify
233   both FOO and BAR as being the longest fixed and floating strings respectively.
234 
235   The strings can be composites, for instance
236 
237      /(f)(o)(o)/
238 
239   will result in a composite fixed substring 'foo'.
240 
241   For each string some basic information is maintained:
242 
243   - offset or min_offset
244     This is the position the string must appear at, or not before.
245     It also implicitly (when combined with minlenp) tells us how many
246     character must match before the string we are searching.
247     Likewise when combined with minlenp and the length of the string
248     tells us how many characters must appear after the string we have
249     found.
250 
251   - max_offset
252     Only used for floating strings. This is the rightmost point that
253     the string can appear at. Ifset to I32 max it indicates that the
254     string can occur infinitely far to the right.
255 
256   - minlenp
257     A pointer to the minimum length of the pattern that the string
258     was found inside. This is important as in the case of positive
259     lookahead or positive lookbehind we can have multiple patterns
260     involved. Consider
261 
262     /(?=FOO).*F/
263 
264     The minimum length of the pattern overall is 3, the minimum length
265     of the lookahead part is 3, but the minimum length of the part that
266     will actually match is 1. So 'FOO's minimum length is 3, but the
267     minimum length for the F is 1. This is important as the minimum length
268     is used to determine offsets in front of and behind the string being
269     looked for.  Since strings can be composites this is the length of the
270     pattern at the time it was commited with a scan_commit. Note that
271     the length is calculated by study_chunk, so that the minimum lengths
272     are not known until the full pattern has been compiled, thus the
273     pointer to the value.
274 
275   - lookbehind
276 
277     In the case of lookbehind the string being searched for can be
278     offset past the start point of the final matching string.
279     If this value was just blithely removed from the min_offset it would
280     invalidate some of the calculations for how many chars must match
281     before or after (as they are derived from min_offset and minlen and
282     the length of the string being searched for).
283     When the final pattern is compiled and the data is moved from the
284     scan_data_t structure into the regexp structure the information
285     about lookbehind is factored in, with the information that would
286     have been lost precalculated in the end_shift field for the
287     associated string.
288 
289   The fields pos_min and pos_delta are used to store the minimum offset
290   and the delta to the maximum offset at the current point in the pattern.
291 
292 */
293 
294 typedef struct scan_data_t {
295     /*I32 len_min;      unused */
296     /*I32 len_delta;    unused */
297     I32 pos_min;
298     I32 pos_delta;
299     SV *last_found;
300     I32 last_end;	    /* min value, <0 unless valid. */
301     I32 last_start_min;
302     I32 last_start_max;
303     SV **longest;	    /* Either &l_fixed, or &l_float. */
304     SV *longest_fixed;      /* longest fixed string found in pattern */
305     I32 offset_fixed;       /* offset where it starts */
306     I32 *minlen_fixed;      /* pointer to the minlen relevent to the string */
307     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
308     SV *longest_float;      /* longest floating string found in pattern */
309     I32 offset_float_min;   /* earliest point in string it can appear */
310     I32 offset_float_max;   /* latest point in string it can appear */
311     I32 *minlen_float;      /* pointer to the minlen relevent to the string */
312     I32 lookbehind_float;   /* is the position of the string modified by LB */
313     I32 flags;
314     I32 whilem_c;
315     I32 *last_closep;
316     struct regnode_charclass_class *start_class;
317 } scan_data_t;
318 
319 /*
320  * Forward declarations for pregcomp()'s friends.
321  */
322 
323 static const scan_data_t zero_scan_data =
324   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
325 
326 #define SF_BEFORE_EOL		(SF_BEFORE_SEOL|SF_BEFORE_MEOL)
327 #define SF_BEFORE_SEOL		0x0001
328 #define SF_BEFORE_MEOL		0x0002
329 #define SF_FIX_BEFORE_EOL	(SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
330 #define SF_FL_BEFORE_EOL	(SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
331 
332 #ifdef NO_UNARY_PLUS
333 #  define SF_FIX_SHIFT_EOL	(0+2)
334 #  define SF_FL_SHIFT_EOL		(0+4)
335 #else
336 #  define SF_FIX_SHIFT_EOL	(+2)
337 #  define SF_FL_SHIFT_EOL		(+4)
338 #endif
339 
340 #define SF_FIX_BEFORE_SEOL	(SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
341 #define SF_FIX_BEFORE_MEOL	(SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
342 
343 #define SF_FL_BEFORE_SEOL	(SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
344 #define SF_FL_BEFORE_MEOL	(SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
345 #define SF_IS_INF		0x0040
346 #define SF_HAS_PAR		0x0080
347 #define SF_IN_PAR		0x0100
348 #define SF_HAS_EVAL		0x0200
349 #define SCF_DO_SUBSTR		0x0400
350 #define SCF_DO_STCLASS_AND	0x0800
351 #define SCF_DO_STCLASS_OR	0x1000
352 #define SCF_DO_STCLASS		(SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
353 #define SCF_WHILEM_VISITED_POS	0x2000
354 
355 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
356 #define SCF_SEEN_ACCEPT         0x8000
357 
358 #define UTF (RExC_utf8 != 0)
359 #define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
360 #define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
361 
362 #define OOB_UNICODE		12345678
363 #define OOB_NAMEDCLASS		-1
364 
365 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
366 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
367 
368 
369 /* length of regex to show in messages that don't mark a position within */
370 #define RegexLengthToShowInErrorMessages 127
371 
372 /*
373  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
374  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
375  * op/pragma/warn/regcomp.
376  */
377 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
378 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
379 
380 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
381 
382 /*
383  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
384  * arg. Show regex, up to a maximum length. If it's too long, chop and add
385  * "...".
386  */
387 #define _FAIL(code) STMT_START {					\
388     const char *ellipses = "";						\
389     IV len = RExC_end - RExC_precomp;					\
390 									\
391     if (!SIZE_ONLY)							\
392 	SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);			\
393     if (len > RegexLengthToShowInErrorMessages) {			\
394 	/* chop 10 shorter than the max, to ensure meaning of "..." */	\
395 	len = RegexLengthToShowInErrorMessages - 10;			\
396 	ellipses = "...";						\
397     }									\
398     code;                                                               \
399 } STMT_END
400 
401 #define	FAIL(msg) _FAIL(			    \
402     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",	    \
403 	    msg, (int)len, RExC_precomp, ellipses))
404 
405 #define	FAIL2(msg,arg) _FAIL(			    \
406     Perl_croak(aTHX_ msg " in regex m/%.*s%s/",	    \
407 	    arg, (int)len, RExC_precomp, ellipses))
408 
409 /*
410  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
411  */
412 #define	Simple_vFAIL(m) STMT_START {					\
413     const IV offset = RExC_parse - RExC_precomp;			\
414     Perl_croak(aTHX_ "%s" REPORT_LOCATION,				\
415 	    m, (int)offset, RExC_precomp, RExC_precomp + offset);	\
416 } STMT_END
417 
418 /*
419  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
420  */
421 #define	vFAIL(m) STMT_START {				\
422     if (!SIZE_ONLY)					\
423 	SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);	\
424     Simple_vFAIL(m);					\
425 } STMT_END
426 
427 /*
428  * Like Simple_vFAIL(), but accepts two arguments.
429  */
430 #define	Simple_vFAIL2(m,a1) STMT_START {			\
431     const IV offset = RExC_parse - RExC_precomp;			\
432     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,			\
433 	    (int)offset, RExC_precomp, RExC_precomp + offset);	\
434 } STMT_END
435 
436 /*
437  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
438  */
439 #define	vFAIL2(m,a1) STMT_START {			\
440     if (!SIZE_ONLY)					\
441 	SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);	\
442     Simple_vFAIL2(m, a1);				\
443 } STMT_END
444 
445 
446 /*
447  * Like Simple_vFAIL(), but accepts three arguments.
448  */
449 #define	Simple_vFAIL3(m, a1, a2) STMT_START {			\
450     const IV offset = RExC_parse - RExC_precomp;		\
451     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,		\
452 	    (int)offset, RExC_precomp, RExC_precomp + offset);	\
453 } STMT_END
454 
455 /*
456  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
457  */
458 #define	vFAIL3(m,a1,a2) STMT_START {			\
459     if (!SIZE_ONLY)					\
460 	SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);	\
461     Simple_vFAIL3(m, a1, a2);				\
462 } STMT_END
463 
464 /*
465  * Like Simple_vFAIL(), but accepts four arguments.
466  */
467 #define	Simple_vFAIL4(m, a1, a2, a3) STMT_START {		\
468     const IV offset = RExC_parse - RExC_precomp;		\
469     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,		\
470 	    (int)offset, RExC_precomp, RExC_precomp + offset);	\
471 } STMT_END
472 
473 #define	vWARN(loc,m) STMT_START {					\
474     const IV offset = loc - RExC_precomp;				\
475     Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,	\
476 	    m, (int)offset, RExC_precomp, RExC_precomp + offset);	\
477 } STMT_END
478 
479 #define	vWARNdep(loc,m) STMT_START {					\
480     const IV offset = loc - RExC_precomp;				\
481     Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),		\
482 	    "%s" REPORT_LOCATION,					\
483 	    m, (int)offset, RExC_precomp, RExC_precomp + offset);	\
484 } STMT_END
485 
486 
487 #define	vWARN2(loc, m, a1) STMT_START {					\
488     const IV offset = loc - RExC_precomp;				\
489     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,		\
490 	    a1, (int)offset, RExC_precomp, RExC_precomp + offset);	\
491 } STMT_END
492 
493 #define	vWARN3(loc, m, a1, a2) STMT_START {				\
494     const IV offset = loc - RExC_precomp;				\
495     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,		\
496 	    a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);	\
497 } STMT_END
498 
499 #define	vWARN4(loc, m, a1, a2, a3) STMT_START {				\
500     const IV offset = loc - RExC_precomp;				\
501     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,		\
502 	    a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
503 } STMT_END
504 
505 #define	vWARN5(loc, m, a1, a2, a3, a4) STMT_START {			\
506     const IV offset = loc - RExC_precomp;				\
507     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,		\
508 	    a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
509 } STMT_END
510 
511 
512 /* Allow for side effects in s */
513 #define REGC(c,s) STMT_START {			\
514     if (!SIZE_ONLY) *(s) = (c); else (void)(s);	\
515 } STMT_END
516 
517 /* Macros for recording node offsets.   20001227 mjd@plover.com
518  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
519  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
520  * Element 0 holds the number n.
521  * Position is 1 indexed.
522  */
523 #ifndef RE_TRACK_PATTERN_OFFSETS
524 #define Set_Node_Offset_To_R(node,byte)
525 #define Set_Node_Offset(node,byte)
526 #define Set_Cur_Node_Offset
527 #define Set_Node_Length_To_R(node,len)
528 #define Set_Node_Length(node,len)
529 #define Set_Node_Cur_Length(node)
530 #define Node_Offset(n)
531 #define Node_Length(n)
532 #define Set_Node_Offset_Length(node,offset,len)
533 #define ProgLen(ri) ri->u.proglen
534 #define SetProgLen(ri,x) ri->u.proglen = x
535 #else
536 #define ProgLen(ri) ri->u.offsets[0]
537 #define SetProgLen(ri,x) ri->u.offsets[0] = x
538 #define Set_Node_Offset_To_R(node,byte) STMT_START {			\
539     if (! SIZE_ONLY) {							\
540 	MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",		\
541 		    __LINE__, (int)(node), (int)(byte)));		\
542 	if((node) < 0) {						\
543 	    Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
544 	} else {							\
545 	    RExC_offsets[2*(node)-1] = (byte);				\
546 	}								\
547     }									\
548 } STMT_END
549 
550 #define Set_Node_Offset(node,byte) \
551     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
552 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
553 
554 #define Set_Node_Length_To_R(node,len) STMT_START {			\
555     if (! SIZE_ONLY) {							\
556 	MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",		\
557 		__LINE__, (int)(node), (int)(len)));			\
558 	if((node) < 0) {						\
559 	    Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
560 	} else {							\
561 	    RExC_offsets[2*(node)] = (len);				\
562 	}								\
563     }									\
564 } STMT_END
565 
566 #define Set_Node_Length(node,len) \
567     Set_Node_Length_To_R((node)-RExC_emit_start, len)
568 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
569 #define Set_Node_Cur_Length(node) \
570     Set_Node_Length(node, RExC_parse - parse_start)
571 
572 /* Get offsets and lengths */
573 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
574 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
575 
576 #define Set_Node_Offset_Length(node,offset,len) STMT_START {	\
577     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));	\
578     Set_Node_Length_To_R((node)-RExC_emit_start, (len));	\
579 } STMT_END
580 #endif
581 
582 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
583 #define EXPERIMENTAL_INPLACESCAN
584 #endif /*RE_TRACK_PATTERN_OFFSETS*/
585 
586 #define DEBUG_STUDYDATA(str,data,depth)                              \
587 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
588     PerlIO_printf(Perl_debug_log,                                    \
589         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
590         " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
591         (int)(depth)*2, "",                                          \
592         (IV)((data)->pos_min),                                       \
593         (IV)((data)->pos_delta),                                     \
594         (UV)((data)->flags),                                         \
595         (IV)((data)->whilem_c),                                      \
596         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
597         is_inf ? "INF " : ""                                         \
598     );                                                               \
599     if ((data)->last_found)                                          \
600         PerlIO_printf(Perl_debug_log,                                \
601             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
602             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
603             SvPVX_const((data)->last_found),                         \
604             (IV)((data)->last_end),                                  \
605             (IV)((data)->last_start_min),                            \
606             (IV)((data)->last_start_max),                            \
607             ((data)->longest &&                                      \
608              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
609             SvPVX_const((data)->longest_fixed),                      \
610             (IV)((data)->offset_fixed),                              \
611             ((data)->longest &&                                      \
612              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
613             SvPVX_const((data)->longest_float),                      \
614             (IV)((data)->offset_float_min),                          \
615             (IV)((data)->offset_float_max)                           \
616         );                                                           \
617     PerlIO_printf(Perl_debug_log,"\n");                              \
618 });
619 
620 static void clear_re(pTHX_ void *r);
621 
622 /* Mark that we cannot extend a found fixed substring at this point.
623    Update the longest found anchored substring and the longest found
624    floating substrings if needed. */
625 
626 STATIC void
627 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
628 {
629     const STRLEN l = CHR_SVLEN(data->last_found);
630     const STRLEN old_l = CHR_SVLEN(*data->longest);
631     GET_RE_DEBUG_FLAGS_DECL;
632 
633     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
634 	SvSetMagicSV(*data->longest, data->last_found);
635 	if (*data->longest == data->longest_fixed) {
636 	    data->offset_fixed = l ? data->last_start_min : data->pos_min;
637 	    if (data->flags & SF_BEFORE_EOL)
638 		data->flags
639 		    |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
640 	    else
641 		data->flags &= ~SF_FIX_BEFORE_EOL;
642 	    data->minlen_fixed=minlenp;
643 	    data->lookbehind_fixed=0;
644 	}
645 	else { /* *data->longest == data->longest_float */
646 	    data->offset_float_min = l ? data->last_start_min : data->pos_min;
647 	    data->offset_float_max = (l
648 				      ? data->last_start_max
649 				      : data->pos_min + data->pos_delta);
650 	    if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
651 		data->offset_float_max = I32_MAX;
652 	    if (data->flags & SF_BEFORE_EOL)
653 		data->flags
654 		    |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
655 	    else
656 		data->flags &= ~SF_FL_BEFORE_EOL;
657             data->minlen_float=minlenp;
658             data->lookbehind_float=0;
659 	}
660     }
661     SvCUR_set(data->last_found, 0);
662     {
663 	SV * const sv = data->last_found;
664 	if (SvUTF8(sv) && SvMAGICAL(sv)) {
665 	    MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
666 	    if (mg)
667 		mg->mg_len = 0;
668 	}
669     }
670     data->last_end = -1;
671     data->flags &= ~SF_BEFORE_EOL;
672     DEBUG_STUDYDATA("commit: ",data,0);
673 }
674 
675 /* Can match anything (initialization) */
676 STATIC void
677 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
678 {
679     ANYOF_CLASS_ZERO(cl);
680     ANYOF_BITMAP_SETALL(cl);
681     cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
682     if (LOC)
683 	cl->flags |= ANYOF_LOCALE;
684 }
685 
686 /* Can match anything (initialization) */
687 STATIC int
688 S_cl_is_anything(const struct regnode_charclass_class *cl)
689 {
690     int value;
691 
692     for (value = 0; value <= ANYOF_MAX; value += 2)
693 	if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
694 	    return 1;
695     if (!(cl->flags & ANYOF_UNICODE_ALL))
696 	return 0;
697     if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
698 	return 0;
699     return 1;
700 }
701 
702 /* Can match anything (initialization) */
703 STATIC void
704 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
705 {
706     Zero(cl, 1, struct regnode_charclass_class);
707     cl->type = ANYOF;
708     cl_anything(pRExC_state, cl);
709 }
710 
711 STATIC void
712 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
713 {
714     Zero(cl, 1, struct regnode_charclass_class);
715     cl->type = ANYOF;
716     cl_anything(pRExC_state, cl);
717     if (LOC)
718 	cl->flags |= ANYOF_LOCALE;
719 }
720 
721 /* 'And' a given class with another one.  Can create false positives */
722 /* We assume that cl is not inverted */
723 STATIC void
724 S_cl_and(struct regnode_charclass_class *cl,
725 	const struct regnode_charclass_class *and_with)
726 {
727 
728     assert(and_with->type == ANYOF);
729     if (!(and_with->flags & ANYOF_CLASS)
730 	&& !(cl->flags & ANYOF_CLASS)
731 	&& (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
732 	&& !(and_with->flags & ANYOF_FOLD)
733 	&& !(cl->flags & ANYOF_FOLD)) {
734 	int i;
735 
736 	if (and_with->flags & ANYOF_INVERT)
737 	    for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
738 		cl->bitmap[i] &= ~and_with->bitmap[i];
739 	else
740 	    for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
741 		cl->bitmap[i] &= and_with->bitmap[i];
742     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
743     if (!(and_with->flags & ANYOF_EOS))
744 	cl->flags &= ~ANYOF_EOS;
745 
746     if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
747 	!(and_with->flags & ANYOF_INVERT)) {
748 	cl->flags &= ~ANYOF_UNICODE_ALL;
749 	cl->flags |= ANYOF_UNICODE;
750 	ARG_SET(cl, ARG(and_with));
751     }
752     if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
753 	!(and_with->flags & ANYOF_INVERT))
754 	cl->flags &= ~ANYOF_UNICODE_ALL;
755     if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
756 	!(and_with->flags & ANYOF_INVERT))
757 	cl->flags &= ~ANYOF_UNICODE;
758 }
759 
760 /* 'OR' a given class with another one.  Can create false positives */
761 /* We assume that cl is not inverted */
762 STATIC void
763 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
764 {
765     if (or_with->flags & ANYOF_INVERT) {
766 	/* We do not use
767 	 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
768 	 *   <= (B1 | !B2) | (CL1 | !CL2)
769 	 * which is wasteful if CL2 is small, but we ignore CL2:
770 	 *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
771 	 * XXXX Can we handle case-fold?  Unclear:
772 	 *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
773 	 *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
774 	 */
775 	if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
776 	     && !(or_with->flags & ANYOF_FOLD)
777 	     && !(cl->flags & ANYOF_FOLD) ) {
778 	    int i;
779 
780 	    for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
781 		cl->bitmap[i] |= ~or_with->bitmap[i];
782 	} /* XXXX: logic is complicated otherwise */
783 	else {
784 	    cl_anything(pRExC_state, cl);
785 	}
786     } else {
787 	/* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
788 	if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
789 	     && (!(or_with->flags & ANYOF_FOLD)
790 		 || (cl->flags & ANYOF_FOLD)) ) {
791 	    int i;
792 
793 	    /* OR char bitmap and class bitmap separately */
794 	    for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
795 		cl->bitmap[i] |= or_with->bitmap[i];
796 	    if (or_with->flags & ANYOF_CLASS) {
797 		for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
798 		    cl->classflags[i] |= or_with->classflags[i];
799 		cl->flags |= ANYOF_CLASS;
800 	    }
801 	}
802 	else { /* XXXX: logic is complicated, leave it along for a moment. */
803 	    cl_anything(pRExC_state, cl);
804 	}
805     }
806     if (or_with->flags & ANYOF_EOS)
807 	cl->flags |= ANYOF_EOS;
808 
809     if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
810 	ARG(cl) != ARG(or_with)) {
811 	cl->flags |= ANYOF_UNICODE_ALL;
812 	cl->flags &= ~ANYOF_UNICODE;
813     }
814     if (or_with->flags & ANYOF_UNICODE_ALL) {
815 	cl->flags |= ANYOF_UNICODE_ALL;
816 	cl->flags &= ~ANYOF_UNICODE;
817     }
818 }
819 
820 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
821 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
822 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
823 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
824 
825 
826 #ifdef DEBUGGING
827 /*
828    dump_trie(trie,widecharmap,revcharmap)
829    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
830    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
831 
832    These routines dump out a trie in a somewhat readable format.
833    The _interim_ variants are used for debugging the interim
834    tables that are used to generate the final compressed
835    representation which is what dump_trie expects.
836 
837    Part of the reason for their existance is to provide a form
838    of documentation as to how the different representations function.
839 
840 */
841 
842 /*
843   Dumps the final compressed table form of the trie to Perl_debug_log.
844   Used for debugging make_trie().
845 */
846 
847 STATIC void
848 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
849 	    AV *revcharmap, U32 depth)
850 {
851     U32 state;
852     SV *sv=sv_newmortal();
853     int colwidth= widecharmap ? 6 : 4;
854     GET_RE_DEBUG_FLAGS_DECL;
855 
856 
857     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
858         (int)depth * 2 + 2,"",
859         "Match","Base","Ofs" );
860 
861     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
862 	SV ** const tmp = av_fetch( revcharmap, state, 0);
863         if ( tmp ) {
864             PerlIO_printf( Perl_debug_log, "%*s",
865                 colwidth,
866                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
867 	                    PL_colors[0], PL_colors[1],
868 	                    (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
869 	                    PERL_PV_ESCAPE_FIRSTCHAR
870                 )
871             );
872         }
873     }
874     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
875         (int)depth * 2 + 2,"");
876 
877     for( state = 0 ; state < trie->uniquecharcount ; state++ )
878         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
879     PerlIO_printf( Perl_debug_log, "\n");
880 
881     for( state = 1 ; state < trie->statecount ; state++ ) {
882 	const U32 base = trie->states[ state ].trans.base;
883 
884         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
885 
886         if ( trie->states[ state ].wordnum ) {
887             PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
888         } else {
889             PerlIO_printf( Perl_debug_log, "%6s", "" );
890         }
891 
892         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
893 
894         if ( base ) {
895             U32 ofs = 0;
896 
897             while( ( base + ofs  < trie->uniquecharcount ) ||
898                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
899                      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
900                     ofs++;
901 
902             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
903 
904             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
905                 if ( ( base + ofs >= trie->uniquecharcount ) &&
906                      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
907                      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
908                 {
909                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
910                     colwidth,
911                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
912                 } else {
913                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
914                 }
915             }
916 
917             PerlIO_printf( Perl_debug_log, "]");
918 
919         }
920         PerlIO_printf( Perl_debug_log, "\n" );
921     }
922 }
923 /*
924   Dumps a fully constructed but uncompressed trie in list form.
925   List tries normally only are used for construction when the number of
926   possible chars (trie->uniquecharcount) is very high.
927   Used for debugging make_trie().
928 */
929 STATIC void
930 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
931 			 HV *widecharmap, AV *revcharmap, U32 next_alloc,
932 			 U32 depth)
933 {
934     U32 state;
935     SV *sv=sv_newmortal();
936     int colwidth= widecharmap ? 6 : 4;
937     GET_RE_DEBUG_FLAGS_DECL;
938     /* print out the table precompression.  */
939     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
940         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
941         "------:-----+-----------------\n" );
942 
943     for( state=1 ; state < next_alloc ; state ++ ) {
944         U16 charid;
945 
946         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
947             (int)depth * 2 + 2,"", (UV)state  );
948         if ( ! trie->states[ state ].wordnum ) {
949             PerlIO_printf( Perl_debug_log, "%5s| ","");
950         } else {
951             PerlIO_printf( Perl_debug_log, "W%4x| ",
952                 trie->states[ state ].wordnum
953             );
954         }
955         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
956 	    SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
957 	    if ( tmp ) {
958                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
959                     colwidth,
960                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
961 	                    PL_colors[0], PL_colors[1],
962 	                    (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
963 	                    PERL_PV_ESCAPE_FIRSTCHAR
964                     ) ,
965                     TRIE_LIST_ITEM(state,charid).forid,
966                     (UV)TRIE_LIST_ITEM(state,charid).newstate
967                 );
968                 if (!(charid % 10))
969                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
970                         (int)((depth * 2) + 14), "");
971             }
972         }
973         PerlIO_printf( Perl_debug_log, "\n");
974     }
975 }
976 
977 /*
978   Dumps a fully constructed but uncompressed trie in table form.
979   This is the normal DFA style state transition table, with a few
980   twists to facilitate compression later.
981   Used for debugging make_trie().
982 */
983 STATIC void
984 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
985 			  HV *widecharmap, AV *revcharmap, U32 next_alloc,
986 			  U32 depth)
987 {
988     U32 state;
989     U16 charid;
990     SV *sv=sv_newmortal();
991     int colwidth= widecharmap ? 6 : 4;
992     GET_RE_DEBUG_FLAGS_DECL;
993 
994     /*
995        print out the table precompression so that we can do a visual check
996        that they are identical.
997      */
998 
999     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1000 
1001     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1002 	SV ** const tmp = av_fetch( revcharmap, charid, 0);
1003         if ( tmp ) {
1004             PerlIO_printf( Perl_debug_log, "%*s",
1005                 colwidth,
1006                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1007 	                    PL_colors[0], PL_colors[1],
1008 	                    (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1009 	                    PERL_PV_ESCAPE_FIRSTCHAR
1010                 )
1011             );
1012         }
1013     }
1014 
1015     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1016 
1017     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1018         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1019     }
1020 
1021     PerlIO_printf( Perl_debug_log, "\n" );
1022 
1023     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1024 
1025         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1026             (int)depth * 2 + 2,"",
1027             (UV)TRIE_NODENUM( state ) );
1028 
1029         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1030             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1031             if (v)
1032                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1033             else
1034                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1035         }
1036         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1037             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1038         } else {
1039             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1040             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1041         }
1042     }
1043 }
1044 
1045 #endif
1046 
1047 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1048   startbranch: the first branch in the whole branch sequence
1049   first      : start branch of sequence of branch-exact nodes.
1050 	       May be the same as startbranch
1051   last       : Thing following the last branch.
1052 	       May be the same as tail.
1053   tail       : item following the branch sequence
1054   count      : words in the sequence
1055   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1056   depth      : indent depth
1057 
1058 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1059 
1060 A trie is an N'ary tree where the branches are determined by digital
1061 decomposition of the key. IE, at the root node you look up the 1st character and
1062 follow that branch repeat until you find the end of the branches. Nodes can be
1063 marked as "accepting" meaning they represent a complete word. Eg:
1064 
1065   /he|she|his|hers/
1066 
1067 would convert into the following structure. Numbers represent states, letters
1068 following numbers represent valid transitions on the letter from that state, if
1069 the number is in square brackets it represents an accepting state, otherwise it
1070 will be in parenthesis.
1071 
1072       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1073       |    |
1074       |   (2)
1075       |    |
1076      (1)   +-i->(6)-+-s->[7]
1077       |
1078       +-s->(3)-+-h->(4)-+-e->[5]
1079 
1080       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1081 
1082 This shows that when matching against the string 'hers' we will begin at state 1
1083 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1084 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1085 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1086 single traverse. We store a mapping from accepting to state to which word was
1087 matched, and then when we have multiple possibilities we try to complete the
1088 rest of the regex in the order in which they occured in the alternation.
1089 
1090 The only prior NFA like behaviour that would be changed by the TRIE support is
1091 the silent ignoring of duplicate alternations which are of the form:
1092 
1093  / (DUPE|DUPE) X? (?{ ... }) Y /x
1094 
1095 Thus EVAL blocks follwing a trie may be called a different number of times with
1096 and without the optimisation. With the optimisations dupes will be silently
1097 ignored. This inconsistant behaviour of EVAL type nodes is well established as
1098 the following demonstrates:
1099 
1100  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1101 
1102 which prints out 'word' three times, but
1103 
1104  'words'=~/(word|word|word)(?{ print $1 })S/
1105 
1106 which doesnt print it out at all. This is due to other optimisations kicking in.
1107 
1108 Example of what happens on a structural level:
1109 
1110 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1111 
1112    1: CURLYM[1] {1,32767}(18)
1113    5:   BRANCH(8)
1114    6:     EXACT <ac>(16)
1115    8:   BRANCH(11)
1116    9:     EXACT <ad>(16)
1117   11:   BRANCH(14)
1118   12:     EXACT <ab>(16)
1119   16:   SUCCEED(0)
1120   17:   NOTHING(18)
1121   18: END(0)
1122 
1123 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1124 and should turn into:
1125 
1126    1: CURLYM[1] {1,32767}(18)
1127    5:   TRIE(16)
1128 	[Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1129 	  <ac>
1130 	  <ad>
1131 	  <ab>
1132   16:   SUCCEED(0)
1133   17:   NOTHING(18)
1134   18: END(0)
1135 
1136 Cases where tail != last would be like /(?foo|bar)baz/:
1137 
1138    1: BRANCH(4)
1139    2:   EXACT <foo>(8)
1140    4: BRANCH(7)
1141    5:   EXACT <bar>(8)
1142    7: TAIL(8)
1143    8: EXACT <baz>(10)
1144   10: END(0)
1145 
1146 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1147 and would end up looking like:
1148 
1149     1: TRIE(8)
1150       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1151 	<foo>
1152 	<bar>
1153    7: TAIL(8)
1154    8: EXACT <baz>(10)
1155   10: END(0)
1156 
1157     d = uvuni_to_utf8_flags(d, uv, 0);
1158 
1159 is the recommended Unicode-aware way of saying
1160 
1161     *(d++) = uv;
1162 */
1163 
1164 #define TRIE_STORE_REVCHAR                                                 \
1165     STMT_START {                                                           \
1166 	if (UTF) {							   \
1167 	    SV *zlopp = newSV(2);					   \
1168 	    unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);	   \
1169 	    unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1170 	    SvCUR_set(zlopp, kapow - flrbbbbb);				   \
1171 	    SvPOK_on(zlopp);						   \
1172 	    SvUTF8_on(zlopp);						   \
1173 	    av_push(revcharmap, zlopp);					   \
1174 	} else {							   \
1175 	    char ooooff = (char)uvc;					   	   \
1176 	    av_push(revcharmap, newSVpvn(&ooooff, 1));			   \
1177 	}								   \
1178         } STMT_END
1179 
1180 #define TRIE_READ_CHAR STMT_START {                                           \
1181     wordlen++;                                                                \
1182     if ( UTF ) {                                                              \
1183 	if ( folder ) {                                                       \
1184 	    if ( foldlen > 0 ) {                                              \
1185 	       uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags );     \
1186 	       foldlen -= len;                                                \
1187 	       scan += len;                                                   \
1188 	       len = 0;                                                       \
1189 	    } else {                                                          \
1190 		uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1191 		uvc = to_uni_fold( uvc, foldbuf, &foldlen );                  \
1192 		foldlen -= UNISKIP( uvc );                                    \
1193 		scan = foldbuf + UNISKIP( uvc );                              \
1194 	    }                                                                 \
1195 	} else {                                                              \
1196 	    uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1197 	}                                                                     \
1198     } else {                                                                  \
1199 	uvc = (U32)*uc;                                                       \
1200 	len = 1;                                                              \
1201     }                                                                         \
1202 } STMT_END
1203 
1204 
1205 
1206 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1207     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1208 	U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1209 	Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1210     }                                                           \
1211     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1212     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1213     TRIE_LIST_CUR( state )++;                                   \
1214 } STMT_END
1215 
1216 #define TRIE_LIST_NEW(state) STMT_START {                       \
1217     Newxz( trie->states[ state ].trans.list,               \
1218 	4, reg_trie_trans_le );                                 \
1219      TRIE_LIST_CUR( state ) = 1;                                \
1220      TRIE_LIST_LEN( state ) = 4;                                \
1221 } STMT_END
1222 
1223 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1224     U16 dupe= trie->states[ state ].wordnum;                    \
1225     regnode * const noper_next = regnext( noper );              \
1226                                                                 \
1227     if (trie->wordlen)                                          \
1228         trie->wordlen[ curword ] = wordlen;                     \
1229     DEBUG_r({                                                   \
1230         /* store the word for dumping */                        \
1231         SV* tmp;                                                \
1232         if (OP(noper) != NOTHING)                               \
1233             tmp = newSVpvn(STRING(noper), STR_LEN(noper));      \
1234         else                                                    \
1235             tmp = newSVpvn( "", 0 );                            \
1236         if ( UTF ) SvUTF8_on( tmp );                            \
1237         av_push( trie_words, tmp );                             \
1238     });                                                         \
1239                                                                 \
1240     curword++;                                                  \
1241                                                                 \
1242     if ( noper_next < tail ) {                                  \
1243         if (!trie->jump)                                        \
1244             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1245         trie->jump[curword] = (U16)(noper_next - convert);      \
1246         if (!jumper)                                            \
1247             jumper = noper_next;                                \
1248         if (!nextbranch)                                        \
1249             nextbranch= regnext(cur);                           \
1250     }                                                           \
1251                                                                 \
1252     if ( dupe ) {                                               \
1253         /* So it's a dupe. This means we need to maintain a   */\
1254         /* linked-list from the first to the next.            */\
1255         /* we only allocate the nextword buffer when there    */\
1256         /* a dupe, so first time we have to do the allocation */\
1257         if (!trie->nextword)                                    \
1258             trie->nextword = (U16 *)					\
1259 		PerlMemShared_calloc( word_count + 1, sizeof(U16));	\
1260         while ( trie->nextword[dupe] )                          \
1261             dupe= trie->nextword[dupe];                         \
1262         trie->nextword[dupe]= curword;                          \
1263     } else {                                                    \
1264         /* we haven't inserted this word yet.                */ \
1265         trie->states[ state ].wordnum = curword;                \
1266     }                                                           \
1267 } STMT_END
1268 
1269 
1270 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)		\
1271      ( ( base + charid >=  ucharcount					\
1272          && base + charid < ubound					\
1273          && state == trie->trans[ base - ucharcount + charid ].check	\
1274          && trie->trans[ base - ucharcount + charid ].next )		\
1275            ? trie->trans[ base - ucharcount + charid ].next		\
1276            : ( state==1 ? special : 0 )					\
1277       )
1278 
1279 #define MADE_TRIE       1
1280 #define MADE_JUMP_TRIE  2
1281 #define MADE_EXACT_TRIE 4
1282 
1283 STATIC I32
1284 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1285 {
1286     dVAR;
1287     /* first pass, loop through and scan words */
1288     reg_trie_data *trie;
1289     HV *widecharmap = NULL;
1290     AV *revcharmap = newAV();
1291     regnode *cur;
1292     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1293     STRLEN len = 0;
1294     UV uvc = 0;
1295     U16 curword = 0;
1296     U32 next_alloc = 0;
1297     regnode *jumper = NULL;
1298     regnode *nextbranch = NULL;
1299     regnode *convert = NULL;
1300     /* we just use folder as a flag in utf8 */
1301     const U8 * const folder = ( flags == EXACTF
1302                        ? PL_fold
1303                        : ( flags == EXACTFL
1304                            ? PL_fold_locale
1305                            : NULL
1306                          )
1307                      );
1308 
1309 #ifdef DEBUGGING
1310     const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1311     AV *trie_words = NULL;
1312     /* along with revcharmap, this only used during construction but both are
1313      * useful during debugging so we store them in the struct when debugging.
1314      */
1315 #else
1316     const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1317     STRLEN trie_charcount=0;
1318 #endif
1319     SV *re_trie_maxbuff;
1320     GET_RE_DEBUG_FLAGS_DECL;
1321 #ifndef DEBUGGING
1322     PERL_UNUSED_ARG(depth);
1323 #endif
1324 
1325     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1326     trie->refcount = 1;
1327     trie->startstate = 1;
1328     trie->wordcount = word_count;
1329     RExC_rxi->data->data[ data_slot ] = (void*)trie;
1330     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1331     if (!(UTF && folder))
1332 	trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1333     DEBUG_r({
1334         trie_words = newAV();
1335     });
1336 
1337     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1338     if (!SvIOK(re_trie_maxbuff)) {
1339         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1340     }
1341     DEBUG_OPTIMISE_r({
1342                 PerlIO_printf( Perl_debug_log,
1343                   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1344                   (int)depth * 2 + 2, "",
1345                   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1346                   REG_NODE_NUM(last), REG_NODE_NUM(tail),
1347                   (int)depth);
1348     });
1349 
1350    /* Find the node we are going to overwrite */
1351     if ( first == startbranch && OP( last ) != BRANCH ) {
1352         /* whole branch chain */
1353         convert = first;
1354     } else {
1355         /* branch sub-chain */
1356         convert = NEXTOPER( first );
1357     }
1358 
1359     /*  -- First loop and Setup --
1360 
1361        We first traverse the branches and scan each word to determine if it
1362        contains widechars, and how many unique chars there are, this is
1363        important as we have to build a table with at least as many columns as we
1364        have unique chars.
1365 
1366        We use an array of integers to represent the character codes 0..255
1367        (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1368        native representation of the character value as the key and IV's for the
1369        coded index.
1370 
1371        *TODO* If we keep track of how many times each character is used we can
1372        remap the columns so that the table compression later on is more
1373        efficient in terms of memory by ensuring most common value is in the
1374        middle and the least common are on the outside.  IMO this would be better
1375        than a most to least common mapping as theres a decent chance the most
1376        common letter will share a node with the least common, meaning the node
1377        will not be compressable. With a middle is most common approach the worst
1378        case is when we have the least common nodes twice.
1379 
1380      */
1381 
1382     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1383         regnode * const noper = NEXTOPER( cur );
1384         const U8 *uc = (U8*)STRING( noper );
1385         const U8 * const e  = uc + STR_LEN( noper );
1386         STRLEN foldlen = 0;
1387         U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1388         const U8 *scan = (U8*)NULL;
1389         U32 wordlen      = 0;         /* required init */
1390         STRLEN chars = 0;
1391         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1392 
1393         if (OP(noper) == NOTHING) {
1394             trie->minlen= 0;
1395             continue;
1396         }
1397         if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1398             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1399                                           regardless of encoding */
1400 
1401         for ( ; uc < e ; uc += len ) {
1402             TRIE_CHARCOUNT(trie)++;
1403             TRIE_READ_CHAR;
1404             chars++;
1405             if ( uvc < 256 ) {
1406                 if ( !trie->charmap[ uvc ] ) {
1407                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1408                     if ( folder )
1409                         trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1410                     TRIE_STORE_REVCHAR;
1411                 }
1412                 if ( set_bit ) {
1413                     /* store the codepoint in the bitmap, and if its ascii
1414                        also store its folded equivelent. */
1415                     TRIE_BITMAP_SET(trie,uvc);
1416 
1417 		    /* store the folded codepoint */
1418 		    if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1419 
1420 		    if ( !UTF ) {
1421 			/* store first byte of utf8 representation of
1422 			   codepoints in the 127 < uvc < 256 range */
1423 			if (127 < uvc && uvc < 192) {
1424 			    TRIE_BITMAP_SET(trie,194);
1425 			} else if (191 < uvc ) {
1426 			    TRIE_BITMAP_SET(trie,195);
1427 			/* && uvc < 256 -- we know uvc is < 256 already */
1428 			}
1429 		    }
1430                     set_bit = 0; /* We've done our bit :-) */
1431                 }
1432             } else {
1433                 SV** svpp;
1434                 if ( !widecharmap )
1435                     widecharmap = newHV();
1436 
1437                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1438 
1439                 if ( !svpp )
1440                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1441 
1442                 if ( !SvTRUE( *svpp ) ) {
1443                     sv_setiv( *svpp, ++trie->uniquecharcount );
1444                     TRIE_STORE_REVCHAR;
1445                 }
1446             }
1447         }
1448         if( cur == first ) {
1449             trie->minlen=chars;
1450             trie->maxlen=chars;
1451         } else if (chars < trie->minlen) {
1452             trie->minlen=chars;
1453         } else if (chars > trie->maxlen) {
1454             trie->maxlen=chars;
1455         }
1456 
1457     } /* end first pass */
1458     DEBUG_TRIE_COMPILE_r(
1459         PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1460                 (int)depth * 2 + 2,"",
1461                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1462 		(int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1463 		(int)trie->minlen, (int)trie->maxlen )
1464     );
1465     trie->wordlen = (U32 *) PerlMemShared_calloc( word_count, sizeof(U32) );
1466 
1467     /*
1468         We now know what we are dealing with in terms of unique chars and
1469         string sizes so we can calculate how much memory a naive
1470         representation using a flat table  will take. If it's over a reasonable
1471         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1472         conservative but potentially much slower representation using an array
1473         of lists.
1474 
1475         At the end we convert both representations into the same compressed
1476         form that will be used in regexec.c for matching with. The latter
1477         is a form that cannot be used to construct with but has memory
1478         properties similar to the list form and access properties similar
1479         to the table form making it both suitable for fast searches and
1480         small enough that its feasable to store for the duration of a program.
1481 
1482         See the comment in the code where the compressed table is produced
1483         inplace from the flat tabe representation for an explanation of how
1484         the compression works.
1485 
1486     */
1487 
1488 
1489     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1490         /*
1491             Second Pass -- Array Of Lists Representation
1492 
1493             Each state will be represented by a list of charid:state records
1494             (reg_trie_trans_le) the first such element holds the CUR and LEN
1495             points of the allocated array. (See defines above).
1496 
1497             We build the initial structure using the lists, and then convert
1498             it into the compressed table form which allows faster lookups
1499             (but cant be modified once converted).
1500         */
1501 
1502         STRLEN transcount = 1;
1503 
1504         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1505             "%*sCompiling trie using list compiler\n",
1506             (int)depth * 2 + 2, ""));
1507 
1508 	trie->states = (reg_trie_state *)
1509 	    PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1510 				  sizeof(reg_trie_state) );
1511         TRIE_LIST_NEW(1);
1512         next_alloc = 2;
1513 
1514         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1515 
1516 	    regnode * const noper = NEXTOPER( cur );
1517 	    U8 *uc           = (U8*)STRING( noper );
1518 	    const U8 * const e = uc + STR_LEN( noper );
1519 	    U32 state        = 1;         /* required init */
1520 	    U16 charid       = 0;         /* sanity init */
1521 	    U8 *scan         = (U8*)NULL; /* sanity init */
1522 	    STRLEN foldlen   = 0;         /* required init */
1523             U32 wordlen      = 0;         /* required init */
1524 	    U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1525 
1526             if (OP(noper) != NOTHING) {
1527                 for ( ; uc < e ; uc += len ) {
1528 
1529                     TRIE_READ_CHAR;
1530 
1531                     if ( uvc < 256 ) {
1532                         charid = trie->charmap[ uvc ];
1533 		    } else {
1534                         SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1535                         if ( !svpp ) {
1536                             charid = 0;
1537                         } else {
1538                             charid=(U16)SvIV( *svpp );
1539                         }
1540 		    }
1541                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1542                     if ( charid ) {
1543 
1544                         U16 check;
1545                         U32 newstate = 0;
1546 
1547                         charid--;
1548                         if ( !trie->states[ state ].trans.list ) {
1549                             TRIE_LIST_NEW( state );
1550 			}
1551                         for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1552                             if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1553                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1554                                 break;
1555                             }
1556                         }
1557                         if ( ! newstate ) {
1558                             newstate = next_alloc++;
1559                             TRIE_LIST_PUSH( state, charid, newstate );
1560                             transcount++;
1561                         }
1562                         state = newstate;
1563                     } else {
1564                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1565 		    }
1566 		}
1567 	    }
1568             TRIE_HANDLE_WORD(state);
1569 
1570         } /* end second pass */
1571 
1572         /* next alloc is the NEXT state to be allocated */
1573         trie->statecount = next_alloc;
1574         trie->states = (reg_trie_state *)
1575 	    PerlMemShared_realloc( trie->states,
1576 				   next_alloc
1577 				   * sizeof(reg_trie_state) );
1578 
1579         /* and now dump it out before we compress it */
1580         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1581 							 revcharmap, next_alloc,
1582 							 depth+1)
1583         );
1584 
1585         trie->trans = (reg_trie_trans *)
1586 	    PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1587         {
1588             U32 state;
1589             U32 tp = 0;
1590             U32 zp = 0;
1591 
1592 
1593             for( state=1 ; state < next_alloc ; state ++ ) {
1594                 U32 base=0;
1595 
1596                 /*
1597                 DEBUG_TRIE_COMPILE_MORE_r(
1598                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1599                 );
1600                 */
1601 
1602                 if (trie->states[state].trans.list) {
1603                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1604                     U16 maxid=minid;
1605 		    U16 idx;
1606 
1607                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1608 			const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1609 			if ( forid < minid ) {
1610 			    minid=forid;
1611 			} else if ( forid > maxid ) {
1612 			    maxid=forid;
1613 			}
1614                     }
1615                     if ( transcount < tp + maxid - minid + 1) {
1616                         transcount *= 2;
1617 			trie->trans = (reg_trie_trans *)
1618 			    PerlMemShared_realloc( trie->trans,
1619 						     transcount
1620 						     * sizeof(reg_trie_trans) );
1621                         Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1622                     }
1623                     base = trie->uniquecharcount + tp - minid;
1624                     if ( maxid == minid ) {
1625                         U32 set = 0;
1626                         for ( ; zp < tp ; zp++ ) {
1627                             if ( ! trie->trans[ zp ].next ) {
1628                                 base = trie->uniquecharcount + zp - minid;
1629                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1630                                 trie->trans[ zp ].check = state;
1631                                 set = 1;
1632                                 break;
1633                             }
1634                         }
1635                         if ( !set ) {
1636                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1637                             trie->trans[ tp ].check = state;
1638                             tp++;
1639                             zp = tp;
1640                         }
1641                     } else {
1642                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1643                             const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1644                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1645                             trie->trans[ tid ].check = state;
1646                         }
1647                         tp += ( maxid - minid + 1 );
1648                     }
1649                     Safefree(trie->states[ state ].trans.list);
1650                 }
1651                 /*
1652                 DEBUG_TRIE_COMPILE_MORE_r(
1653                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1654                 );
1655                 */
1656                 trie->states[ state ].trans.base=base;
1657             }
1658             trie->lasttrans = tp + 1;
1659         }
1660     } else {
1661         /*
1662            Second Pass -- Flat Table Representation.
1663 
1664            we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1665            We know that we will need Charcount+1 trans at most to store the data
1666            (one row per char at worst case) So we preallocate both structures
1667            assuming worst case.
1668 
1669            We then construct the trie using only the .next slots of the entry
1670            structs.
1671 
1672            We use the .check field of the first entry of the node  temporarily to
1673            make compression both faster and easier by keeping track of how many non
1674            zero fields are in the node.
1675 
1676            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1677            transition.
1678 
1679            There are two terms at use here: state as a TRIE_NODEIDX() which is a
1680            number representing the first entry of the node, and state as a
1681            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1682            TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1683            are 2 entrys per node. eg:
1684 
1685              A B       A B
1686           1. 2 4    1. 3 7
1687           2. 0 3    3. 0 5
1688           3. 0 0    5. 0 0
1689           4. 0 0    7. 0 0
1690 
1691            The table is internally in the right hand, idx form. However as we also
1692            have to deal with the states array which is indexed by nodenum we have to
1693            use TRIE_NODENUM() to convert.
1694 
1695         */
1696         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1697             "%*sCompiling trie using table compiler\n",
1698             (int)depth * 2 + 2, ""));
1699 
1700 	trie->trans = (reg_trie_trans *)
1701 	    PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1702 				  * trie->uniquecharcount + 1,
1703 				  sizeof(reg_trie_trans) );
1704         trie->states = (reg_trie_state *)
1705 	    PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1706 				  sizeof(reg_trie_state) );
1707         next_alloc = trie->uniquecharcount + 1;
1708 
1709 
1710         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1711 
1712 	    regnode * const noper   = NEXTOPER( cur );
1713 	    const U8 *uc     = (U8*)STRING( noper );
1714 	    const U8 * const e = uc + STR_LEN( noper );
1715 
1716             U32 state        = 1;         /* required init */
1717 
1718             U16 charid       = 0;         /* sanity init */
1719             U32 accept_state = 0;         /* sanity init */
1720             U8 *scan         = (U8*)NULL; /* sanity init */
1721 
1722             STRLEN foldlen   = 0;         /* required init */
1723             U32 wordlen      = 0;         /* required init */
1724             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1725 
1726             if ( OP(noper) != NOTHING ) {
1727                 for ( ; uc < e ; uc += len ) {
1728 
1729                     TRIE_READ_CHAR;
1730 
1731                     if ( uvc < 256 ) {
1732                         charid = trie->charmap[ uvc ];
1733                     } else {
1734                         SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1735                         charid = svpp ? (U16)SvIV(*svpp) : 0;
1736                     }
1737                     if ( charid ) {
1738                         charid--;
1739                         if ( !trie->trans[ state + charid ].next ) {
1740                             trie->trans[ state + charid ].next = next_alloc;
1741                             trie->trans[ state ].check++;
1742                             next_alloc += trie->uniquecharcount;
1743                         }
1744                         state = trie->trans[ state + charid ].next;
1745                     } else {
1746                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1747                     }
1748                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1749                 }
1750             }
1751             accept_state = TRIE_NODENUM( state );
1752             TRIE_HANDLE_WORD(accept_state);
1753 
1754         } /* end second pass */
1755 
1756         /* and now dump it out before we compress it */
1757         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1758 							  revcharmap,
1759 							  next_alloc, depth+1));
1760 
1761         {
1762         /*
1763            * Inplace compress the table.*
1764 
1765            For sparse data sets the table constructed by the trie algorithm will
1766            be mostly 0/FAIL transitions or to put it another way mostly empty.
1767            (Note that leaf nodes will not contain any transitions.)
1768 
1769            This algorithm compresses the tables by eliminating most such
1770            transitions, at the cost of a modest bit of extra work during lookup:
1771 
1772            - Each states[] entry contains a .base field which indicates the
1773            index in the state[] array wheres its transition data is stored.
1774 
1775            - If .base is 0 there are no  valid transitions from that node.
1776 
1777            - If .base is nonzero then charid is added to it to find an entry in
1778            the trans array.
1779 
1780            -If trans[states[state].base+charid].check!=state then the
1781            transition is taken to be a 0/Fail transition. Thus if there are fail
1782            transitions at the front of the node then the .base offset will point
1783            somewhere inside the previous nodes data (or maybe even into a node
1784            even earlier), but the .check field determines if the transition is
1785            valid.
1786 
1787            XXX - wrong maybe?
1788            The following process inplace converts the table to the compressed
1789            table: We first do not compress the root node 1,and mark its all its
1790            .check pointers as 1 and set its .base pointer as 1 as well. This
1791            allows to do a DFA construction from the compressed table later, and
1792            ensures that any .base pointers we calculate later are greater than
1793            0.
1794 
1795            - We set 'pos' to indicate the first entry of the second node.
1796 
1797            - We then iterate over the columns of the node, finding the first and
1798            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1799            and set the .check pointers accordingly, and advance pos
1800            appropriately and repreat for the next node. Note that when we copy
1801            the next pointers we have to convert them from the original
1802            NODEIDX form to NODENUM form as the former is not valid post
1803            compression.
1804 
1805            - If a node has no transitions used we mark its base as 0 and do not
1806            advance the pos pointer.
1807 
1808            - If a node only has one transition we use a second pointer into the
1809            structure to fill in allocated fail transitions from other states.
1810            This pointer is independent of the main pointer and scans forward
1811            looking for null transitions that are allocated to a state. When it
1812            finds one it writes the single transition into the "hole".  If the
1813            pointer doesnt find one the single transition is appended as normal.
1814 
1815            - Once compressed we can Renew/realloc the structures to release the
1816            excess space.
1817 
1818            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1819            specifically Fig 3.47 and the associated pseudocode.
1820 
1821            demq
1822         */
1823         const U32 laststate = TRIE_NODENUM( next_alloc );
1824 	U32 state, charid;
1825         U32 pos = 0, zp=0;
1826         trie->statecount = laststate;
1827 
1828         for ( state = 1 ; state < laststate ; state++ ) {
1829             U8 flag = 0;
1830 	    const U32 stateidx = TRIE_NODEIDX( state );
1831 	    const U32 o_used = trie->trans[ stateidx ].check;
1832 	    U32 used = trie->trans[ stateidx ].check;
1833             trie->trans[ stateidx ].check = 0;
1834 
1835             for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1836                 if ( flag || trie->trans[ stateidx + charid ].next ) {
1837                     if ( trie->trans[ stateidx + charid ].next ) {
1838                         if (o_used == 1) {
1839                             for ( ; zp < pos ; zp++ ) {
1840                                 if ( ! trie->trans[ zp ].next ) {
1841                                     break;
1842                                 }
1843                             }
1844                             trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1845                             trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1846                             trie->trans[ zp ].check = state;
1847                             if ( ++zp > pos ) pos = zp;
1848                             break;
1849                         }
1850                         used--;
1851                     }
1852                     if ( !flag ) {
1853                         flag = 1;
1854                         trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1855                     }
1856                     trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1857                     trie->trans[ pos ].check = state;
1858                     pos++;
1859                 }
1860             }
1861         }
1862         trie->lasttrans = pos + 1;
1863         trie->states = (reg_trie_state *)
1864 	    PerlMemShared_realloc( trie->states, laststate
1865 				   * sizeof(reg_trie_state) );
1866         DEBUG_TRIE_COMPILE_MORE_r(
1867                 PerlIO_printf( Perl_debug_log,
1868 		    "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1869 		    (int)depth * 2 + 2,"",
1870                     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1871 		    (IV)next_alloc,
1872 		    (IV)pos,
1873                     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1874             );
1875 
1876         } /* end table compress */
1877     }
1878     DEBUG_TRIE_COMPILE_MORE_r(
1879             PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1880                 (int)depth * 2 + 2, "",
1881                 (UV)trie->statecount,
1882                 (UV)trie->lasttrans)
1883     );
1884     /* resize the trans array to remove unused space */
1885     trie->trans = (reg_trie_trans *)
1886 	PerlMemShared_realloc( trie->trans, trie->lasttrans
1887 			       * sizeof(reg_trie_trans) );
1888 
1889     /* and now dump out the compressed format */
1890     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
1891 
1892     {   /* Modify the program and insert the new TRIE node*/
1893         U8 nodetype =(U8)(flags & 0xFF);
1894         char *str=NULL;
1895 
1896 #ifdef DEBUGGING
1897         regnode *optimize = NULL;
1898 #ifdef RE_TRACK_PATTERN_OFFSETS
1899 
1900         U32 mjd_offset = 0;
1901         U32 mjd_nodelen = 0;
1902 #endif /* RE_TRACK_PATTERN_OFFSETS */
1903 #endif /* DEBUGGING */
1904         /*
1905            This means we convert either the first branch or the first Exact,
1906            depending on whether the thing following (in 'last') is a branch
1907            or not and whther first is the startbranch (ie is it a sub part of
1908            the alternation or is it the whole thing.)
1909            Assuming its a sub part we conver the EXACT otherwise we convert
1910            the whole branch sequence, including the first.
1911          */
1912         /* Find the node we are going to overwrite */
1913         if ( first != startbranch || OP( last ) == BRANCH ) {
1914             /* branch sub-chain */
1915             NEXT_OFF( first ) = (U16)(last - first);
1916 #ifdef RE_TRACK_PATTERN_OFFSETS
1917             DEBUG_r({
1918                 mjd_offset= Node_Offset((convert));
1919                 mjd_nodelen= Node_Length((convert));
1920             });
1921 #endif
1922             /* whole branch chain */
1923         }
1924 #ifdef RE_TRACK_PATTERN_OFFSETS
1925         else {
1926             DEBUG_r({
1927                 const  regnode *nop = NEXTOPER( convert );
1928                 mjd_offset= Node_Offset((nop));
1929                 mjd_nodelen= Node_Length((nop));
1930             });
1931         }
1932         DEBUG_OPTIMISE_r(
1933             PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1934                 (int)depth * 2 + 2, "",
1935                 (UV)mjd_offset, (UV)mjd_nodelen)
1936         );
1937 #endif
1938         /* But first we check to see if there is a common prefix we can
1939            split out as an EXACT and put in front of the TRIE node.  */
1940         trie->startstate= 1;
1941         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
1942             U32 state;
1943             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
1944                 U32 ofs = 0;
1945                 I32 idx = -1;
1946                 U32 count = 0;
1947                 const U32 base = trie->states[ state ].trans.base;
1948 
1949                 if ( trie->states[state].wordnum )
1950                         count = 1;
1951 
1952                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1953                     if ( ( base + ofs >= trie->uniquecharcount ) &&
1954                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1955                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1956                     {
1957                         if ( ++count > 1 ) {
1958                             SV **tmp = av_fetch( revcharmap, ofs, 0);
1959 			    const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
1960                             if ( state == 1 ) break;
1961                             if ( count == 2 ) {
1962                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1963                                 DEBUG_OPTIMISE_r(
1964                                     PerlIO_printf(Perl_debug_log,
1965 					"%*sNew Start State=%"UVuf" Class: [",
1966                                         (int)depth * 2 + 2, "",
1967                                         (UV)state));
1968 				if (idx >= 0) {
1969 				    SV ** const tmp = av_fetch( revcharmap, idx, 0);
1970 				    const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
1971 
1972                                     TRIE_BITMAP_SET(trie,*ch);
1973                                     if ( folder )
1974                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
1975                                     DEBUG_OPTIMISE_r(
1976                                         PerlIO_printf(Perl_debug_log, (char*)ch)
1977                                     );
1978 				}
1979 			    }
1980 			    TRIE_BITMAP_SET(trie,*ch);
1981 			    if ( folder )
1982 				TRIE_BITMAP_SET(trie,folder[ *ch ]);
1983 			    DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1984 			}
1985                         idx = ofs;
1986 		    }
1987                 }
1988                 if ( count == 1 ) {
1989                     SV **tmp = av_fetch( revcharmap, idx, 0);
1990                     STRLEN len;
1991                     char *ch = SvPV( *tmp, len );
1992                     DEBUG_OPTIMISE_r({
1993                         SV *sv=sv_newmortal();
1994                         PerlIO_printf( Perl_debug_log,
1995 			    "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1996                             (int)depth * 2 + 2, "",
1997                             (UV)state, (UV)idx,
1998                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
1999 	                        PL_colors[0], PL_colors[1],
2000 	                        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2001 	                        PERL_PV_ESCAPE_FIRSTCHAR
2002                             )
2003                         );
2004                     });
2005                     if ( state==1 ) {
2006                         OP( convert ) = nodetype;
2007                         str=STRING(convert);
2008                         STR_LEN(convert)=0;
2009                     }
2010                     STR_LEN(convert) += len;
2011                     while (len--)
2012                         *str++ = *ch++;
2013 		} else {
2014 #ifdef DEBUGGING
2015 		    if (state>1)
2016 			DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2017 #endif
2018 		    break;
2019 		}
2020 	    }
2021             if (str) {
2022                 regnode *n = convert+NODE_SZ_STR(convert);
2023                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2024                 trie->startstate = state;
2025                 trie->minlen -= (state - 1);
2026                 trie->maxlen -= (state - 1);
2027 #ifdef DEBUGGING
2028                /* At least the UNICOS C compiler choked on this
2029                 * being argument to DEBUG_r(), so let's just have
2030                 * it right here. */
2031                if (
2032 #ifdef PERL_EXT_RE_BUILD
2033                    1
2034 #else
2035                    DEBUG_r_TEST
2036 #endif
2037                    ) {
2038                    regnode *fix = convert;
2039                    U32 word = trie->wordcount;
2040                    mjd_nodelen++;
2041                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2042                    while( ++fix < n ) {
2043                        Set_Node_Offset_Length(fix, 0, 0);
2044                    }
2045                    while (word--) {
2046                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2047                        if (tmp) {
2048                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2049                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2050                            else
2051                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2052                        }
2053                    }
2054                }
2055 #endif
2056                 if (trie->maxlen) {
2057                     convert = n;
2058 		} else {
2059                     NEXT_OFF(convert) = (U16)(tail - convert);
2060                     DEBUG_r(optimize= n);
2061                 }
2062             }
2063         }
2064         if (!jumper)
2065             jumper = last;
2066         if ( trie->maxlen ) {
2067 	    NEXT_OFF( convert ) = (U16)(tail - convert);
2068 	    ARG_SET( convert, data_slot );
2069 	    /* Store the offset to the first unabsorbed branch in
2070 	       jump[0], which is otherwise unused by the jump logic.
2071 	       We use this when dumping a trie and during optimisation. */
2072 	    if (trie->jump)
2073 	        trie->jump[0] = (U16)(nextbranch - convert);
2074 
2075             /* XXXX */
2076             if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
2077                  ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2078             {
2079                 OP( convert ) = TRIEC;
2080                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2081                 PerlMemShared_free(trie->bitmap);
2082                 trie->bitmap= NULL;
2083             } else
2084                 OP( convert ) = TRIE;
2085 
2086             /* store the type in the flags */
2087             convert->flags = nodetype;
2088             DEBUG_r({
2089             optimize = convert
2090                       + NODE_STEP_REGNODE
2091                       + regarglen[ OP( convert ) ];
2092             });
2093             /* XXX We really should free up the resource in trie now,
2094                    as we won't use them - (which resources?) dmq */
2095         }
2096         /* needed for dumping*/
2097         DEBUG_r(if (optimize) {
2098             regnode *opt = convert;
2099 
2100             while ( ++opt < optimize) {
2101                 Set_Node_Offset_Length(opt,0,0);
2102             }
2103             /*
2104                 Try to clean up some of the debris left after the
2105                 optimisation.
2106              */
2107             while( optimize < jumper ) {
2108                 mjd_nodelen += Node_Length((optimize));
2109                 OP( optimize ) = OPTIMIZED;
2110                 Set_Node_Offset_Length(optimize,0,0);
2111                 optimize++;
2112             }
2113             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2114         });
2115     } /* end node insert */
2116     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2117 #ifdef DEBUGGING
2118     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2119     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2120 #else
2121     SvREFCNT_dec(revcharmap);
2122 #endif
2123     return trie->jump
2124            ? MADE_JUMP_TRIE
2125            : trie->startstate>1
2126              ? MADE_EXACT_TRIE
2127              : MADE_TRIE;
2128 }
2129 
2130 STATIC void
2131 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2132 {
2133 /* The Trie is constructed and compressed now so we can build a fail array now if its needed
2134 
2135    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2136    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2137    ISBN 0-201-10088-6
2138 
2139    We find the fail state for each state in the trie, this state is the longest proper
2140    suffix of the current states 'word' that is also a proper prefix of another word in our
2141    trie. State 1 represents the word '' and is the thus the default fail state. This allows
2142    the DFA not to have to restart after its tried and failed a word at a given point, it
2143    simply continues as though it had been matching the other word in the first place.
2144    Consider
2145       'abcdgu'=~/abcdefg|cdgu/
2146    When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2147    fail, which would bring use to the state representing 'd' in the second word where we would
2148    try 'g' and succeed, prodceding to match 'cdgu'.
2149  */
2150  /* add a fail transition */
2151     const U32 trie_offset = ARG(source);
2152     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2153     U32 *q;
2154     const U32 ucharcount = trie->uniquecharcount;
2155     const U32 numstates = trie->statecount;
2156     const U32 ubound = trie->lasttrans + ucharcount;
2157     U32 q_read = 0;
2158     U32 q_write = 0;
2159     U32 charid;
2160     U32 base = trie->states[ 1 ].trans.base;
2161     U32 *fail;
2162     reg_ac_data *aho;
2163     const U32 data_slot = add_data( pRExC_state, 1, "T" );
2164     GET_RE_DEBUG_FLAGS_DECL;
2165 #ifndef DEBUGGING
2166     PERL_UNUSED_ARG(depth);
2167 #endif
2168 
2169 
2170     ARG_SET( stclass, data_slot );
2171     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2172     RExC_rxi->data->data[ data_slot ] = (void*)aho;
2173     aho->trie=trie_offset;
2174     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2175     Copy( trie->states, aho->states, numstates, reg_trie_state );
2176     Newxz( q, numstates, U32);
2177     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2178     aho->refcount = 1;
2179     fail = aho->fail;
2180     /* initialize fail[0..1] to be 1 so that we always have
2181        a valid final fail state */
2182     fail[ 0 ] = fail[ 1 ] = 1;
2183 
2184     for ( charid = 0; charid < ucharcount ; charid++ ) {
2185 	const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2186 	if ( newstate ) {
2187             q[ q_write ] = newstate;
2188             /* set to point at the root */
2189             fail[ q[ q_write++ ] ]=1;
2190         }
2191     }
2192     while ( q_read < q_write) {
2193 	const U32 cur = q[ q_read++ % numstates ];
2194         base = trie->states[ cur ].trans.base;
2195 
2196         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2197 	    const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2198 	    if (ch_state) {
2199                 U32 fail_state = cur;
2200                 U32 fail_base;
2201                 do {
2202                     fail_state = fail[ fail_state ];
2203                     fail_base = aho->states[ fail_state ].trans.base;
2204                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2205 
2206                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2207                 fail[ ch_state ] = fail_state;
2208                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2209                 {
2210                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2211                 }
2212                 q[ q_write++ % numstates] = ch_state;
2213             }
2214         }
2215     }
2216     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2217        when we fail in state 1, this allows us to use the
2218        charclass scan to find a valid start char. This is based on the principle
2219        that theres a good chance the string being searched contains lots of stuff
2220        that cant be a start char.
2221      */
2222     fail[ 0 ] = fail[ 1 ] = 0;
2223     DEBUG_TRIE_COMPILE_r({
2224         PerlIO_printf(Perl_debug_log,
2225 		      "%*sStclass Failtable (%"UVuf" states): 0",
2226 		      (int)(depth * 2), "", (UV)numstates
2227         );
2228         for( q_read=1; q_read<numstates; q_read++ ) {
2229             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2230         }
2231         PerlIO_printf(Perl_debug_log, "\n");
2232     });
2233     Safefree(q);
2234     /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2235 }
2236 
2237 
2238 /*
2239  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2240  * These need to be revisited when a newer toolchain becomes available.
2241  */
2242 #if defined(__sparc64__) && defined(__GNUC__)
2243 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2244 #       undef  SPARC64_GCC_WORKAROUND
2245 #       define SPARC64_GCC_WORKAROUND 1
2246 #   endif
2247 #endif
2248 
2249 #define DEBUG_PEEP(str,scan,depth) \
2250     DEBUG_OPTIMISE_r({if (scan){ \
2251        SV * const mysv=sv_newmortal(); \
2252        regnode *Next = regnext(scan); \
2253        regprop(RExC_rx, mysv, scan); \
2254        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2255        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2256        Next ? (REG_NODE_NUM(Next)) : 0 ); \
2257    }});
2258 
2259 
2260 
2261 
2262 
2263 #define JOIN_EXACT(scan,min,flags) \
2264     if (PL_regkind[OP(scan)] == EXACT) \
2265         join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2266 
2267 STATIC U32
2268 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2269     /* Merge several consecutive EXACTish nodes into one. */
2270     regnode *n = regnext(scan);
2271     U32 stringok = 1;
2272     regnode *next = scan + NODE_SZ_STR(scan);
2273     U32 merged = 0;
2274     U32 stopnow = 0;
2275 #ifdef DEBUGGING
2276     regnode *stop = scan;
2277     GET_RE_DEBUG_FLAGS_DECL;
2278 #else
2279     PERL_UNUSED_ARG(depth);
2280 #endif
2281 #ifndef EXPERIMENTAL_INPLACESCAN
2282     PERL_UNUSED_ARG(flags);
2283     PERL_UNUSED_ARG(val);
2284 #endif
2285     DEBUG_PEEP("join",scan,depth);
2286 
2287     /* Skip NOTHING, merge EXACT*. */
2288     while (n &&
2289            ( PL_regkind[OP(n)] == NOTHING ||
2290              (stringok && (OP(n) == OP(scan))))
2291            && NEXT_OFF(n)
2292            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2293 
2294         if (OP(n) == TAIL || n > next)
2295             stringok = 0;
2296         if (PL_regkind[OP(n)] == NOTHING) {
2297             DEBUG_PEEP("skip:",n,depth);
2298             NEXT_OFF(scan) += NEXT_OFF(n);
2299             next = n + NODE_STEP_REGNODE;
2300 #ifdef DEBUGGING
2301             if (stringok)
2302                 stop = n;
2303 #endif
2304             n = regnext(n);
2305         }
2306         else if (stringok) {
2307             const unsigned int oldl = STR_LEN(scan);
2308             regnode * const nnext = regnext(n);
2309 
2310             DEBUG_PEEP("merg",n,depth);
2311 
2312             merged++;
2313             if (oldl + STR_LEN(n) > U8_MAX)
2314                 break;
2315             NEXT_OFF(scan) += NEXT_OFF(n);
2316             STR_LEN(scan) += STR_LEN(n);
2317             next = n + NODE_SZ_STR(n);
2318             /* Now we can overwrite *n : */
2319             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2320 #ifdef DEBUGGING
2321             stop = next - 1;
2322 #endif
2323             n = nnext;
2324             if (stopnow) break;
2325         }
2326 
2327 #ifdef EXPERIMENTAL_INPLACESCAN
2328 	if (flags && !NEXT_OFF(n)) {
2329 	    DEBUG_PEEP("atch", val, depth);
2330 	    if (reg_off_by_arg[OP(n)]) {
2331 		ARG_SET(n, val - n);
2332 	    }
2333 	    else {
2334 		NEXT_OFF(n) = val - n;
2335 	    }
2336 	    stopnow = 1;
2337 	}
2338 #endif
2339     }
2340 
2341     if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2342     /*
2343     Two problematic code points in Unicode casefolding of EXACT nodes:
2344 
2345     U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2346     U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2347 
2348     which casefold to
2349 
2350     Unicode                      UTF-8
2351 
2352     U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
2353     U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
2354 
2355     This means that in case-insensitive matching (or "loose matching",
2356     as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2357     length of the above casefolded versions) can match a target string
2358     of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2359     This would rather mess up the minimum length computation.
2360 
2361     What we'll do is to look for the tail four bytes, and then peek
2362     at the preceding two bytes to see whether we need to decrease
2363     the minimum length by four (six minus two).
2364 
2365     Thanks to the design of UTF-8, there cannot be false matches:
2366     A sequence of valid UTF-8 bytes cannot be a subsequence of
2367     another valid sequence of UTF-8 bytes.
2368 
2369     */
2370          char * const s0 = STRING(scan), *s, *t;
2371          char * const s1 = s0 + STR_LEN(scan) - 1;
2372          char * const s2 = s1 - 4;
2373 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2374 	 const char t0[] = "\xaf\x49\xaf\x42";
2375 #else
2376          const char t0[] = "\xcc\x88\xcc\x81";
2377 #endif
2378          const char * const t1 = t0 + 3;
2379 
2380          for (s = s0 + 2;
2381               s < s2 && (t = ninstr(s, s1, t0, t1));
2382               s = t + 4) {
2383 #ifdef EBCDIC
2384 	      if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2385 		  ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2386 #else
2387               if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2388                   ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2389 #endif
2390                    *min -= 4;
2391          }
2392     }
2393 
2394 #ifdef DEBUGGING
2395     /* Allow dumping */
2396     n = scan + NODE_SZ_STR(scan);
2397     while (n <= stop) {
2398         if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2399             OP(n) = OPTIMIZED;
2400             NEXT_OFF(n) = 0;
2401         }
2402         n++;
2403     }
2404 #endif
2405     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2406     return stopnow;
2407 }
2408 
2409 /* REx optimizer.  Converts nodes into quickier variants "in place".
2410    Finds fixed substrings.  */
2411 
2412 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2413    to the position after last scanned or to NULL. */
2414 
2415 #define INIT_AND_WITHP \
2416     assert(!and_withp); \
2417     Newx(and_withp,1,struct regnode_charclass_class); \
2418     SAVEFREEPV(and_withp)
2419 
2420 /* this is a chain of data about sub patterns we are processing that
2421    need to be handled seperately/specially in study_chunk. Its so
2422    we can simulate recursion without losing state.  */
2423 struct scan_frame;
2424 typedef struct scan_frame {
2425     regnode *last;  /* last node to process in this frame */
2426     regnode *next;  /* next node to process when last is reached */
2427     struct scan_frame *prev; /*previous frame*/
2428     I32 stop; /* what stopparen do we use */
2429 } scan_frame;
2430 
2431 
2432 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2433 
2434 #define CASE_SYNST_FNC(nAmE)                                       \
2435 case nAmE:                                                         \
2436     if (flags & SCF_DO_STCLASS_AND) {                              \
2437 	    for (value = 0; value < 256; value++)                  \
2438 		if (!is_ ## nAmE ## _cp(value))                       \
2439 		    ANYOF_BITMAP_CLEAR(data->start_class, value);  \
2440     }                                                              \
2441     else {                                                         \
2442 	    for (value = 0; value < 256; value++)                  \
2443 		if (is_ ## nAmE ## _cp(value))                        \
2444 		    ANYOF_BITMAP_SET(data->start_class, value);	   \
2445     }                                                              \
2446     break;                                                         \
2447 case N ## nAmE:                                                    \
2448     if (flags & SCF_DO_STCLASS_AND) {                              \
2449 	    for (value = 0; value < 256; value++)                   \
2450 		if (is_ ## nAmE ## _cp(value))                         \
2451 		    ANYOF_BITMAP_CLEAR(data->start_class, value);   \
2452     }                                                               \
2453     else {                                                          \
2454 	    for (value = 0; value < 256; value++)                   \
2455 		if (!is_ ## nAmE ## _cp(value))                        \
2456 		    ANYOF_BITMAP_SET(data->start_class, value);	    \
2457     }                                                               \
2458     break
2459 
2460 
2461 
2462 STATIC I32
2463 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2464                         I32 *minlenp, I32 *deltap,
2465 			regnode *last,
2466 			scan_data_t *data,
2467 			I32 stopparen,
2468 			U8* recursed,
2469 			struct regnode_charclass_class *and_withp,
2470 			U32 flags, U32 depth)
2471 			/* scanp: Start here (read-write). */
2472 			/* deltap: Write maxlen-minlen here. */
2473 			/* last: Stop before this one. */
2474 			/* data: string data about the pattern */
2475 			/* stopparen: treat close N as END */
2476 			/* recursed: which subroutines have we recursed into */
2477 			/* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2478 {
2479     dVAR;
2480     I32 min = 0, pars = 0, code;
2481     regnode *scan = *scanp, *next;
2482     I32 delta = 0;
2483     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2484     int is_inf_internal = 0;		/* The studied chunk is infinite */
2485     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2486     scan_data_t data_fake;
2487     SV *re_trie_maxbuff = NULL;
2488     regnode *first_non_open = scan;
2489     I32 stopmin = I32_MAX;
2490     scan_frame *frame = NULL;
2491 
2492     GET_RE_DEBUG_FLAGS_DECL;
2493 
2494 #ifdef DEBUGGING
2495     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2496 #endif
2497 
2498     if ( depth == 0 ) {
2499         while (first_non_open && OP(first_non_open) == OPEN)
2500             first_non_open=regnext(first_non_open);
2501     }
2502 
2503 
2504   fake_study_recurse:
2505     while ( scan && OP(scan) != END && scan < last ){
2506 	/* Peephole optimizer: */
2507 	DEBUG_STUDYDATA("Peep:", data,depth);
2508 	DEBUG_PEEP("Peep",scan,depth);
2509         JOIN_EXACT(scan,&min,0);
2510 
2511 	/* Follow the next-chain of the current node and optimize
2512 	   away all the NOTHINGs from it.  */
2513 	if (OP(scan) != CURLYX) {
2514 	    const int max = (reg_off_by_arg[OP(scan)]
2515 		       ? I32_MAX
2516 		       /* I32 may be smaller than U16 on CRAYs! */
2517 		       : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2518 	    int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2519 	    int noff;
2520 	    regnode *n = scan;
2521 
2522 	    /* Skip NOTHING and LONGJMP. */
2523 	    while ((n = regnext(n))
2524 		   && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2525 		       || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2526 		   && off + noff < max)
2527 		off += noff;
2528 	    if (reg_off_by_arg[OP(scan)])
2529 		ARG(scan) = off;
2530 	    else
2531 		NEXT_OFF(scan) = off;
2532 	}
2533 
2534 
2535 
2536 	/* The principal pseudo-switch.  Cannot be a switch, since we
2537 	   look into several different things.  */
2538 	if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2539 		   || OP(scan) == IFTHEN) {
2540 	    next = regnext(scan);
2541 	    code = OP(scan);
2542 	    /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2543 
2544 	    if (OP(next) == code || code == IFTHEN) {
2545 	        /* NOTE - There is similar code to this block below for handling
2546 	           TRIE nodes on a re-study.  If you change stuff here check there
2547 	           too. */
2548 		I32 max1 = 0, min1 = I32_MAX, num = 0;
2549 		struct regnode_charclass_class accum;
2550 		regnode * const startbranch=scan;
2551 
2552 		if (flags & SCF_DO_SUBSTR)
2553 		    SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2554 		if (flags & SCF_DO_STCLASS)
2555 		    cl_init_zero(pRExC_state, &accum);
2556 
2557 		while (OP(scan) == code) {
2558 		    I32 deltanext, minnext, f = 0, fake;
2559 		    struct regnode_charclass_class this_class;
2560 
2561 		    num++;
2562 		    data_fake.flags = 0;
2563 		    if (data) {
2564 			data_fake.whilem_c = data->whilem_c;
2565 			data_fake.last_closep = data->last_closep;
2566 		    }
2567 		    else
2568 			data_fake.last_closep = &fake;
2569 
2570 		    data_fake.pos_delta = delta;
2571 		    next = regnext(scan);
2572 		    scan = NEXTOPER(scan);
2573 		    if (code != BRANCH)
2574 			scan = NEXTOPER(scan);
2575 		    if (flags & SCF_DO_STCLASS) {
2576 			cl_init(pRExC_state, &this_class);
2577 			data_fake.start_class = &this_class;
2578 			f = SCF_DO_STCLASS_AND;
2579 		    }
2580 		    if (flags & SCF_WHILEM_VISITED_POS)
2581 			f |= SCF_WHILEM_VISITED_POS;
2582 
2583 		    /* we suppose the run is continuous, last=next...*/
2584 		    minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2585 					  next, &data_fake,
2586 					  stopparen, recursed, NULL, f,depth+1);
2587 		    if (min1 > minnext)
2588 			min1 = minnext;
2589 		    if (max1 < minnext + deltanext)
2590 			max1 = minnext + deltanext;
2591 		    if (deltanext == I32_MAX)
2592 			is_inf = is_inf_internal = 1;
2593 		    scan = next;
2594 		    if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2595 			pars++;
2596 	            if (data_fake.flags & SCF_SEEN_ACCEPT) {
2597 	                if ( stopmin > minnext)
2598 	                    stopmin = min + min1;
2599 	                flags &= ~SCF_DO_SUBSTR;
2600 	                if (data)
2601 	                    data->flags |= SCF_SEEN_ACCEPT;
2602 	            }
2603 		    if (data) {
2604 			if (data_fake.flags & SF_HAS_EVAL)
2605 			    data->flags |= SF_HAS_EVAL;
2606 			data->whilem_c = data_fake.whilem_c;
2607 		    }
2608 		    if (flags & SCF_DO_STCLASS)
2609 			cl_or(pRExC_state, &accum, &this_class);
2610 		}
2611 		if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2612 		    min1 = 0;
2613 		if (flags & SCF_DO_SUBSTR) {
2614 		    data->pos_min += min1;
2615 		    data->pos_delta += max1 - min1;
2616 		    if (max1 != min1 || is_inf)
2617 			data->longest = &(data->longest_float);
2618 		}
2619 		min += min1;
2620 		delta += max1 - min1;
2621 		if (flags & SCF_DO_STCLASS_OR) {
2622 		    cl_or(pRExC_state, data->start_class, &accum);
2623 		    if (min1) {
2624 			cl_and(data->start_class, and_withp);
2625 			flags &= ~SCF_DO_STCLASS;
2626 		    }
2627 		}
2628 		else if (flags & SCF_DO_STCLASS_AND) {
2629 		    if (min1) {
2630 			cl_and(data->start_class, &accum);
2631 			flags &= ~SCF_DO_STCLASS;
2632 		    }
2633 		    else {
2634 			/* Switch to OR mode: cache the old value of
2635 			 * data->start_class */
2636 			INIT_AND_WITHP;
2637 			StructCopy(data->start_class, and_withp,
2638 				   struct regnode_charclass_class);
2639 			flags &= ~SCF_DO_STCLASS_AND;
2640 			StructCopy(&accum, data->start_class,
2641 				   struct regnode_charclass_class);
2642 			flags |= SCF_DO_STCLASS_OR;
2643 			data->start_class->flags |= ANYOF_EOS;
2644 		    }
2645 		}
2646 
2647                 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2648 		/* demq.
2649 
2650 		   Assuming this was/is a branch we are dealing with: 'scan' now
2651 		   points at the item that follows the branch sequence, whatever
2652 		   it is. We now start at the beginning of the sequence and look
2653 		   for subsequences of
2654 
2655 		   BRANCH->EXACT=>x1
2656 		   BRANCH->EXACT=>x2
2657 		   tail
2658 
2659 		   which would be constructed from a pattern like /A|LIST|OF|WORDS/
2660 
2661 		   If we can find such a subseqence we need to turn the first
2662 		   element into a trie and then add the subsequent branch exact
2663 		   strings to the trie.
2664 
2665 		   We have two cases
2666 
2667 		     1. patterns where the whole set of branch can be converted.
2668 
2669 		     2. patterns where only a subset can be converted.
2670 
2671 		   In case 1 we can replace the whole set with a single regop
2672 		   for the trie. In case 2 we need to keep the start and end
2673 		   branchs so
2674 
2675 		     'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2676 		     becomes BRANCH TRIE; BRANCH X;
2677 
2678 		  There is an additional case, that being where there is a
2679 		  common prefix, which gets split out into an EXACT like node
2680 		  preceding the TRIE node.
2681 
2682 		  If x(1..n)==tail then we can do a simple trie, if not we make
2683 		  a "jump" trie, such that when we match the appropriate word
2684 		  we "jump" to the appopriate tail node. Essentailly we turn
2685 		  a nested if into a case structure of sorts.
2686 
2687 		*/
2688 
2689 		    int made=0;
2690 		    if (!re_trie_maxbuff) {
2691 			re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2692 			if (!SvIOK(re_trie_maxbuff))
2693 			    sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2694 		    }
2695                     if ( SvIV(re_trie_maxbuff)>=0  ) {
2696                         regnode *cur;
2697                         regnode *first = (regnode *)NULL;
2698                         regnode *last = (regnode *)NULL;
2699                         regnode *tail = scan;
2700                         U8 optype = 0;
2701                         U32 count=0;
2702 
2703 #ifdef DEBUGGING
2704                         SV * const mysv = sv_newmortal();       /* for dumping */
2705 #endif
2706                         /* var tail is used because there may be a TAIL
2707                            regop in the way. Ie, the exacts will point to the
2708                            thing following the TAIL, but the last branch will
2709                            point at the TAIL. So we advance tail. If we
2710                            have nested (?:) we may have to move through several
2711                            tails.
2712                          */
2713 
2714                         while ( OP( tail ) == TAIL ) {
2715                             /* this is the TAIL generated by (?:) */
2716                             tail = regnext( tail );
2717                         }
2718 
2719 
2720                         DEBUG_OPTIMISE_r({
2721                             regprop(RExC_rx, mysv, tail );
2722                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2723                                 (int)depth * 2 + 2, "",
2724                                 "Looking for TRIE'able sequences. Tail node is: ",
2725                                 SvPV_nolen_const( mysv )
2726                             );
2727                         });
2728 
2729                         /*
2730 
2731                            step through the branches, cur represents each
2732                            branch, noper is the first thing to be matched
2733                            as part of that branch and noper_next is the
2734                            regnext() of that node. if noper is an EXACT
2735                            and noper_next is the same as scan (our current
2736                            position in the regex) then the EXACT branch is
2737                            a possible optimization target. Once we have
2738                            two or more consequetive such branches we can
2739                            create a trie of the EXACT's contents and stich
2740                            it in place. If the sequence represents all of
2741                            the branches we eliminate the whole thing and
2742                            replace it with a single TRIE. If it is a
2743                            subsequence then we need to stitch it in. This
2744                            means the first branch has to remain, and needs
2745                            to be repointed at the item on the branch chain
2746                            following the last branch optimized. This could
2747                            be either a BRANCH, in which case the
2748                            subsequence is internal, or it could be the
2749                            item following the branch sequence in which
2750                            case the subsequence is at the end.
2751 
2752                         */
2753 
2754                         /* dont use tail as the end marker for this traverse */
2755                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2756                             regnode * const noper = NEXTOPER( cur );
2757 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2758                             regnode * const noper_next = regnext( noper );
2759 #endif
2760 
2761                             DEBUG_OPTIMISE_r({
2762                                 regprop(RExC_rx, mysv, cur);
2763                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2764                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2765 
2766                                 regprop(RExC_rx, mysv, noper);
2767                                 PerlIO_printf( Perl_debug_log, " -> %s",
2768                                     SvPV_nolen_const(mysv));
2769 
2770                                 if ( noper_next ) {
2771                                   regprop(RExC_rx, mysv, noper_next );
2772                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2773                                     SvPV_nolen_const(mysv));
2774                                 }
2775                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2776                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2777                             });
2778                             if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2779                                          : PL_regkind[ OP( noper ) ] == EXACT )
2780                                   || OP(noper) == NOTHING )
2781 #ifdef NOJUMPTRIE
2782                                   && noper_next == tail
2783 #endif
2784                                   && count < U16_MAX)
2785                             {
2786                                 count++;
2787                                 if ( !first || optype == NOTHING ) {
2788                                     if (!first) first = cur;
2789                                     optype = OP( noper );
2790                                 } else {
2791                                     last = cur;
2792                                 }
2793                             } else {
2794 /*
2795     Currently we assume that the trie can handle unicode and ascii
2796     matches fold cased matches. If this proves true then the following
2797     define will prevent tries in this situation.
2798 
2799     #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2800 */
2801 #define TRIE_TYPE_IS_SAFE 1
2802                                 if ( last && TRIE_TYPE_IS_SAFE ) {
2803                                     make_trie( pRExC_state,
2804                                             startbranch, first, cur, tail, count,
2805                                             optype, depth+1 );
2806                                 }
2807                                 if ( PL_regkind[ OP( noper ) ] == EXACT
2808 #ifdef NOJUMPTRIE
2809                                      && noper_next == tail
2810 #endif
2811                                 ){
2812                                     count = 1;
2813                                     first = cur;
2814                                     optype = OP( noper );
2815                                 } else {
2816                                     count = 0;
2817                                     first = NULL;
2818                                     optype = 0;
2819                                 }
2820                                 last = NULL;
2821                             }
2822                         }
2823                         DEBUG_OPTIMISE_r({
2824                             regprop(RExC_rx, mysv, cur);
2825                             PerlIO_printf( Perl_debug_log,
2826                               "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2827                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2828 
2829                         });
2830 
2831                         if ( last && TRIE_TYPE_IS_SAFE ) {
2832                             made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2833 #ifdef TRIE_STUDY_OPT
2834                             if ( ((made == MADE_EXACT_TRIE &&
2835                                  startbranch == first)
2836                                  || ( first_non_open == first )) &&
2837                                  depth==0 ) {
2838                                 flags |= SCF_TRIE_RESTUDY;
2839                                 if ( startbranch == first
2840                                      && scan == tail )
2841                                 {
2842                                     RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2843                                 }
2844                             }
2845 #endif
2846                         }
2847                     }
2848 
2849                 } /* do trie */
2850 
2851 	    }
2852 	    else if ( code == BRANCHJ ) {  /* single branch is optimized. */
2853 		scan = NEXTOPER(NEXTOPER(scan));
2854 	    } else			/* single branch is optimized. */
2855 		scan = NEXTOPER(scan);
2856 	    continue;
2857 	} else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2858 	    scan_frame *newframe = NULL;
2859 	    I32 paren;
2860 	    regnode *start;
2861 	    regnode *end;
2862 
2863 	    if (OP(scan) != SUSPEND) {
2864 	    /* set the pointer */
2865 	        if (OP(scan) == GOSUB) {
2866 	            paren = ARG(scan);
2867 	            RExC_recurse[ARG2L(scan)] = scan;
2868                     start = RExC_open_parens[paren-1];
2869                     end   = RExC_close_parens[paren-1];
2870                 } else {
2871                     paren = 0;
2872                     start = RExC_rxi->program + 1;
2873                     end   = RExC_opend;
2874                 }
2875                 if (!recursed) {
2876                     Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2877                     SAVEFREEPV(recursed);
2878                 }
2879                 if (!PAREN_TEST(recursed,paren+1)) {
2880 		    PAREN_SET(recursed,paren+1);
2881                     Newx(newframe,1,scan_frame);
2882                 } else {
2883                     if (flags & SCF_DO_SUBSTR) {
2884                         SCAN_COMMIT(pRExC_state,data,minlenp);
2885                         data->longest = &(data->longest_float);
2886                     }
2887                     is_inf = is_inf_internal = 1;
2888                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2889                         cl_anything(pRExC_state, data->start_class);
2890                     flags &= ~SCF_DO_STCLASS;
2891 	        }
2892             } else {
2893 	        Newx(newframe,1,scan_frame);
2894 	        paren = stopparen;
2895 	        start = scan+2;
2896 	        end = regnext(scan);
2897 	    }
2898 	    if (newframe) {
2899                 assert(start);
2900                 assert(end);
2901 	        SAVEFREEPV(newframe);
2902 	        newframe->next = regnext(scan);
2903 	        newframe->last = last;
2904 	        newframe->stop = stopparen;
2905 	        newframe->prev = frame;
2906 
2907 	        frame = newframe;
2908 	        scan =  start;
2909 	        stopparen = paren;
2910 	        last = end;
2911 
2912 	        continue;
2913 	    }
2914 	}
2915 	else if (OP(scan) == EXACT) {
2916 	    I32 l = STR_LEN(scan);
2917 	    UV uc;
2918 	    if (UTF) {
2919 		const U8 * const s = (U8*)STRING(scan);
2920 		l = utf8_length(s, s + l);
2921 		uc = utf8_to_uvchr(s, NULL);
2922 	    } else {
2923 		uc = *((U8*)STRING(scan));
2924 	    }
2925 	    min += l;
2926 	    if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2927 		/* The code below prefers earlier match for fixed
2928 		   offset, later match for variable offset.  */
2929 		if (data->last_end == -1) { /* Update the start info. */
2930 		    data->last_start_min = data->pos_min;
2931  		    data->last_start_max = is_inf
2932  			? I32_MAX : data->pos_min + data->pos_delta;
2933 		}
2934 		sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2935 		if (UTF)
2936 		    SvUTF8_on(data->last_found);
2937 		{
2938 		    SV * const sv = data->last_found;
2939 		    MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2940 			mg_find(sv, PERL_MAGIC_utf8) : NULL;
2941 		    if (mg && mg->mg_len >= 0)
2942 			mg->mg_len += utf8_length((U8*)STRING(scan),
2943 						  (U8*)STRING(scan)+STR_LEN(scan));
2944 		}
2945 		data->last_end = data->pos_min + l;
2946 		data->pos_min += l; /* As in the first entry. */
2947 		data->flags &= ~SF_BEFORE_EOL;
2948 	    }
2949 	    if (flags & SCF_DO_STCLASS_AND) {
2950 		/* Check whether it is compatible with what we know already! */
2951 		int compat = 1;
2952 
2953 		if (uc >= 0x100 ||
2954 		    (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2955 		    && !ANYOF_BITMAP_TEST(data->start_class, uc)
2956 		    && (!(data->start_class->flags & ANYOF_FOLD)
2957 			|| !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2958                     )
2959 		    compat = 0;
2960 		ANYOF_CLASS_ZERO(data->start_class);
2961 		ANYOF_BITMAP_ZERO(data->start_class);
2962 		if (compat)
2963 		    ANYOF_BITMAP_SET(data->start_class, uc);
2964 		data->start_class->flags &= ~ANYOF_EOS;
2965 		if (uc < 0x100)
2966 		  data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2967 	    }
2968 	    else if (flags & SCF_DO_STCLASS_OR) {
2969 		/* false positive possible if the class is case-folded */
2970 		if (uc < 0x100)
2971 		    ANYOF_BITMAP_SET(data->start_class, uc);
2972 		else
2973 		    data->start_class->flags |= ANYOF_UNICODE_ALL;
2974 		data->start_class->flags &= ~ANYOF_EOS;
2975 		cl_and(data->start_class, and_withp);
2976 	    }
2977 	    flags &= ~SCF_DO_STCLASS;
2978 	}
2979 	else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2980 	    I32 l = STR_LEN(scan);
2981 	    UV uc = *((U8*)STRING(scan));
2982 
2983 	    /* Search for fixed substrings supports EXACT only. */
2984 	    if (flags & SCF_DO_SUBSTR) {
2985 		assert(data);
2986 		SCAN_COMMIT(pRExC_state, data, minlenp);
2987 	    }
2988 	    if (UTF) {
2989 		const U8 * const s = (U8 *)STRING(scan);
2990 		l = utf8_length(s, s + l);
2991 		uc = utf8_to_uvchr(s, NULL);
2992 	    }
2993 	    min += l;
2994 	    if (flags & SCF_DO_SUBSTR)
2995 		data->pos_min += l;
2996 	    if (flags & SCF_DO_STCLASS_AND) {
2997 		/* Check whether it is compatible with what we know already! */
2998 		int compat = 1;
2999 
3000 		if (uc >= 0x100 ||
3001 		    (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3002 		    && !ANYOF_BITMAP_TEST(data->start_class, uc)
3003 		     && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3004 		    compat = 0;
3005 		ANYOF_CLASS_ZERO(data->start_class);
3006 		ANYOF_BITMAP_ZERO(data->start_class);
3007 		if (compat) {
3008 		    ANYOF_BITMAP_SET(data->start_class, uc);
3009 		    data->start_class->flags &= ~ANYOF_EOS;
3010 		    data->start_class->flags |= ANYOF_FOLD;
3011 		    if (OP(scan) == EXACTFL)
3012 			data->start_class->flags |= ANYOF_LOCALE;
3013 		}
3014 	    }
3015 	    else if (flags & SCF_DO_STCLASS_OR) {
3016 		if (data->start_class->flags & ANYOF_FOLD) {
3017 		    /* false positive possible if the class is case-folded.
3018 		       Assume that the locale settings are the same... */
3019 		    if (uc < 0x100)
3020 			ANYOF_BITMAP_SET(data->start_class, uc);
3021 		    data->start_class->flags &= ~ANYOF_EOS;
3022 		}
3023 		cl_and(data->start_class, and_withp);
3024 	    }
3025 	    flags &= ~SCF_DO_STCLASS;
3026 	}
3027 	else if (strchr((const char*)PL_varies,OP(scan))) {
3028 	    I32 mincount, maxcount, minnext, deltanext, fl = 0;
3029 	    I32 f = flags, pos_before = 0;
3030 	    regnode * const oscan = scan;
3031 	    struct regnode_charclass_class this_class;
3032 	    struct regnode_charclass_class *oclass = NULL;
3033 	    I32 next_is_eval = 0;
3034 
3035 	    switch (PL_regkind[OP(scan)]) {
3036 	    case WHILEM:		/* End of (?:...)* . */
3037 		scan = NEXTOPER(scan);
3038 		goto finish;
3039 	    case PLUS:
3040 		if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3041 		    next = NEXTOPER(scan);
3042 		    if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3043 			mincount = 1;
3044 			maxcount = REG_INFTY;
3045 			next = regnext(scan);
3046 			scan = NEXTOPER(scan);
3047 			goto do_curly;
3048 		    }
3049 		}
3050 		if (flags & SCF_DO_SUBSTR)
3051 		    data->pos_min++;
3052 		min++;
3053 		/* Fall through. */
3054 	    case STAR:
3055 		if (flags & SCF_DO_STCLASS) {
3056 		    mincount = 0;
3057 		    maxcount = REG_INFTY;
3058 		    next = regnext(scan);
3059 		    scan = NEXTOPER(scan);
3060 		    goto do_curly;
3061 		}
3062 		is_inf = is_inf_internal = 1;
3063 		scan = regnext(scan);
3064 		if (flags & SCF_DO_SUBSTR) {
3065 		    SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3066 		    data->longest = &(data->longest_float);
3067 		}
3068 		goto optimize_curly_tail;
3069 	    case CURLY:
3070 	        if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3071 	            && (scan->flags == stopparen))
3072 		{
3073 		    mincount = 1;
3074 		    maxcount = 1;
3075 		} else {
3076 		    mincount = ARG1(scan);
3077 		    maxcount = ARG2(scan);
3078 		}
3079 		next = regnext(scan);
3080 		if (OP(scan) == CURLYX) {
3081 		    I32 lp = (data ? *(data->last_closep) : 0);
3082 		    scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3083 		}
3084 		scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3085 		next_is_eval = (OP(scan) == EVAL);
3086 	      do_curly:
3087 		if (flags & SCF_DO_SUBSTR) {
3088 		    if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3089 		    pos_before = data->pos_min;
3090 		}
3091 		if (data) {
3092 		    fl = data->flags;
3093 		    data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3094 		    if (is_inf)
3095 			data->flags |= SF_IS_INF;
3096 		}
3097 		if (flags & SCF_DO_STCLASS) {
3098 		    cl_init(pRExC_state, &this_class);
3099 		    oclass = data->start_class;
3100 		    data->start_class = &this_class;
3101 		    f |= SCF_DO_STCLASS_AND;
3102 		    f &= ~SCF_DO_STCLASS_OR;
3103 		}
3104 		/* These are the cases when once a subexpression
3105 		   fails at a particular position, it cannot succeed
3106 		   even after backtracking at the enclosing scope.
3107 
3108 		   XXXX what if minimal match and we are at the
3109 		        initial run of {n,m}? */
3110 		if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3111 		    f &= ~SCF_WHILEM_VISITED_POS;
3112 
3113 		/* This will finish on WHILEM, setting scan, or on NULL: */
3114 		minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3115 		                      last, data, stopparen, recursed, NULL,
3116 				      (mincount == 0
3117 					? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3118 
3119 		if (flags & SCF_DO_STCLASS)
3120 		    data->start_class = oclass;
3121 		if (mincount == 0 || minnext == 0) {
3122 		    if (flags & SCF_DO_STCLASS_OR) {
3123 			cl_or(pRExC_state, data->start_class, &this_class);
3124 		    }
3125 		    else if (flags & SCF_DO_STCLASS_AND) {
3126 			/* Switch to OR mode: cache the old value of
3127 			 * data->start_class */
3128 			INIT_AND_WITHP;
3129 			StructCopy(data->start_class, and_withp,
3130 				   struct regnode_charclass_class);
3131 			flags &= ~SCF_DO_STCLASS_AND;
3132 			StructCopy(&this_class, data->start_class,
3133 				   struct regnode_charclass_class);
3134 			flags |= SCF_DO_STCLASS_OR;
3135 			data->start_class->flags |= ANYOF_EOS;
3136 		    }
3137 		} else {		/* Non-zero len */
3138 		    if (flags & SCF_DO_STCLASS_OR) {
3139 			cl_or(pRExC_state, data->start_class, &this_class);
3140 			cl_and(data->start_class, and_withp);
3141 		    }
3142 		    else if (flags & SCF_DO_STCLASS_AND)
3143 			cl_and(data->start_class, &this_class);
3144 		    flags &= ~SCF_DO_STCLASS;
3145 		}
3146 		if (!scan) 		/* It was not CURLYX, but CURLY. */
3147 		    scan = next;
3148 		if ( /* ? quantifier ok, except for (?{ ... }) */
3149 		    (next_is_eval || !(mincount == 0 && maxcount == 1))
3150 		    && (minnext == 0) && (deltanext == 0)
3151 		    && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3152 		    && maxcount <= REG_INFTY/3 /* Complement check for big count */
3153 		    && ckWARN(WARN_REGEXP))
3154 		{
3155 		    vWARN(RExC_parse,
3156 			  "Quantifier unexpected on zero-length expression");
3157 		}
3158 
3159 		min += minnext * mincount;
3160 		is_inf_internal |= ((maxcount == REG_INFTY
3161 				     && (minnext + deltanext) > 0)
3162 				    || deltanext == I32_MAX);
3163 		is_inf |= is_inf_internal;
3164 		delta += (minnext + deltanext) * maxcount - minnext * mincount;
3165 
3166 		/* Try powerful optimization CURLYX => CURLYN. */
3167 		if (  OP(oscan) == CURLYX && data
3168 		      && data->flags & SF_IN_PAR
3169 		      && !(data->flags & SF_HAS_EVAL)
3170 		      && !deltanext && minnext == 1 ) {
3171 		    /* Try to optimize to CURLYN.  */
3172 		    regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3173 		    regnode * const nxt1 = nxt;
3174 #ifdef DEBUGGING
3175 		    regnode *nxt2;
3176 #endif
3177 
3178 		    /* Skip open. */
3179 		    nxt = regnext(nxt);
3180 		    if (!strchr((const char*)PL_simple,OP(nxt))
3181 			&& !(PL_regkind[OP(nxt)] == EXACT
3182 			     && STR_LEN(nxt) == 1))
3183 			goto nogo;
3184 #ifdef DEBUGGING
3185 		    nxt2 = nxt;
3186 #endif
3187 		    nxt = regnext(nxt);
3188 		    if (OP(nxt) != CLOSE)
3189 			goto nogo;
3190 		    if (RExC_open_parens) {
3191 			RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3192 			RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3193 		    }
3194 		    /* Now we know that nxt2 is the only contents: */
3195 		    oscan->flags = (U8)ARG(nxt);
3196 		    OP(oscan) = CURLYN;
3197 		    OP(nxt1) = NOTHING;	/* was OPEN. */
3198 
3199 #ifdef DEBUGGING
3200 		    OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3201 		    NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3202 		    NEXT_OFF(nxt2) = 0;	/* just for consistancy with CURLY. */
3203 		    OP(nxt) = OPTIMIZED;	/* was CLOSE. */
3204 		    OP(nxt + 1) = OPTIMIZED; /* was count. */
3205 		    NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
3206 #endif
3207 		}
3208 	      nogo:
3209 
3210 		/* Try optimization CURLYX => CURLYM. */
3211 		if (  OP(oscan) == CURLYX && data
3212 		      && !(data->flags & SF_HAS_PAR)
3213 		      && !(data->flags & SF_HAS_EVAL)
3214 		      && !deltanext	/* atom is fixed width */
3215 		      && minnext != 0	/* CURLYM can't handle zero width */
3216 		) {
3217 		    /* XXXX How to optimize if data == 0? */
3218 		    /* Optimize to a simpler form.  */
3219 		    regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3220 		    regnode *nxt2;
3221 
3222 		    OP(oscan) = CURLYM;
3223 		    while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3224 			    && (OP(nxt2) != WHILEM))
3225 			nxt = nxt2;
3226 		    OP(nxt2)  = SUCCEED; /* Whas WHILEM */
3227 		    /* Need to optimize away parenths. */
3228 		    if (data->flags & SF_IN_PAR) {
3229 			/* Set the parenth number.  */
3230 			regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3231 
3232 			if (OP(nxt) != CLOSE)
3233 			    FAIL("Panic opt close");
3234 			oscan->flags = (U8)ARG(nxt);
3235 			if (RExC_open_parens) {
3236 			    RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3237 			    RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3238 			}
3239 			OP(nxt1) = OPTIMIZED;	/* was OPEN. */
3240 			OP(nxt) = OPTIMIZED;	/* was CLOSE. */
3241 
3242 #ifdef DEBUGGING
3243 			OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3244 			OP(nxt + 1) = OPTIMIZED; /* was count. */
3245 			NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3246 			NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
3247 #endif
3248 #if 0
3249 			while ( nxt1 && (OP(nxt1) != WHILEM)) {
3250 			    regnode *nnxt = regnext(nxt1);
3251 
3252 			    if (nnxt == nxt) {
3253 				if (reg_off_by_arg[OP(nxt1)])
3254 				    ARG_SET(nxt1, nxt2 - nxt1);
3255 				else if (nxt2 - nxt1 < U16_MAX)
3256 				    NEXT_OFF(nxt1) = nxt2 - nxt1;
3257 				else
3258 				    OP(nxt) = NOTHING;	/* Cannot beautify */
3259 			    }
3260 			    nxt1 = nnxt;
3261 			}
3262 #endif
3263 			/* Optimize again: */
3264 			study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3265 				    NULL, stopparen, recursed, NULL, 0,depth+1);
3266 		    }
3267 		    else
3268 			oscan->flags = 0;
3269 		}
3270 		else if ((OP(oscan) == CURLYX)
3271 			 && (flags & SCF_WHILEM_VISITED_POS)
3272 			 /* See the comment on a similar expression above.
3273 			    However, this time it not a subexpression
3274 			    we care about, but the expression itself. */
3275 			 && (maxcount == REG_INFTY)
3276 			 && data && ++data->whilem_c < 16) {
3277 		    /* This stays as CURLYX, we can put the count/of pair. */
3278 		    /* Find WHILEM (as in regexec.c) */
3279 		    regnode *nxt = oscan + NEXT_OFF(oscan);
3280 
3281 		    if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3282 			nxt += ARG(nxt);
3283 		    PREVOPER(nxt)->flags = (U8)(data->whilem_c
3284 			| (RExC_whilem_seen << 4)); /* On WHILEM */
3285 		}
3286 		if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3287 		    pars++;
3288 		if (flags & SCF_DO_SUBSTR) {
3289 		    SV *last_str = NULL;
3290 		    int counted = mincount != 0;
3291 
3292 		    if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3293 #if defined(SPARC64_GCC_WORKAROUND)
3294 			I32 b = 0;
3295 			STRLEN l = 0;
3296 			const char *s = NULL;
3297 			I32 old = 0;
3298 
3299 			if (pos_before >= data->last_start_min)
3300 			    b = pos_before;
3301 			else
3302 			    b = data->last_start_min;
3303 
3304 			l = 0;
3305 			s = SvPV_const(data->last_found, l);
3306 			old = b - data->last_start_min;
3307 
3308 #else
3309 			I32 b = pos_before >= data->last_start_min
3310 			    ? pos_before : data->last_start_min;
3311 			STRLEN l;
3312 			const char * const s = SvPV_const(data->last_found, l);
3313 			I32 old = b - data->last_start_min;
3314 #endif
3315 
3316 			if (UTF)
3317 			    old = utf8_hop((U8*)s, old) - (U8*)s;
3318 
3319 			l -= old;
3320 			/* Get the added string: */
3321 			last_str = newSVpvn(s  + old, l);
3322 			if (UTF)
3323 			    SvUTF8_on(last_str);
3324 			if (deltanext == 0 && pos_before == b) {
3325 			    /* What was added is a constant string */
3326 			    if (mincount > 1) {
3327 				SvGROW(last_str, (mincount * l) + 1);
3328 				repeatcpy(SvPVX(last_str) + l,
3329 					  SvPVX_const(last_str), l, mincount - 1);
3330 				SvCUR_set(last_str, SvCUR(last_str) * mincount);
3331 				/* Add additional parts. */
3332 				SvCUR_set(data->last_found,
3333 					  SvCUR(data->last_found) - l);
3334 				sv_catsv(data->last_found, last_str);
3335 				{
3336 				    SV * sv = data->last_found;
3337 				    MAGIC *mg =
3338 					SvUTF8(sv) && SvMAGICAL(sv) ?
3339 					mg_find(sv, PERL_MAGIC_utf8) : NULL;
3340 				    if (mg && mg->mg_len >= 0)
3341 					mg->mg_len += CHR_SVLEN(last_str) - l;
3342 				}
3343 				data->last_end += l * (mincount - 1);
3344 			    }
3345 			} else {
3346 			    /* start offset must point into the last copy */
3347 			    data->last_start_min += minnext * (mincount - 1);
3348 			    data->last_start_max += is_inf ? I32_MAX
3349 				: (maxcount - 1) * (minnext + data->pos_delta);
3350 			}
3351 		    }
3352 		    /* It is counted once already... */
3353 		    data->pos_min += minnext * (mincount - counted);
3354 		    data->pos_delta += - counted * deltanext +
3355 			(minnext + deltanext) * maxcount - minnext * mincount;
3356 		    if (mincount != maxcount) {
3357 			 /* Cannot extend fixed substrings found inside
3358 			    the group.  */
3359 			SCAN_COMMIT(pRExC_state,data,minlenp);
3360 			if (mincount && last_str) {
3361 			    SV * const sv = data->last_found;
3362 			    MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3363 				mg_find(sv, PERL_MAGIC_utf8) : NULL;
3364 
3365 			    if (mg)
3366 				mg->mg_len = -1;
3367 			    sv_setsv(sv, last_str);
3368 			    data->last_end = data->pos_min;
3369 			    data->last_start_min =
3370 				data->pos_min - CHR_SVLEN(last_str);
3371 			    data->last_start_max = is_inf
3372 				? I32_MAX
3373 				: data->pos_min + data->pos_delta
3374 				- CHR_SVLEN(last_str);
3375 			}
3376 			data->longest = &(data->longest_float);
3377 		    }
3378 		    SvREFCNT_dec(last_str);
3379 		}
3380 		if (data && (fl & SF_HAS_EVAL))
3381 		    data->flags |= SF_HAS_EVAL;
3382 	      optimize_curly_tail:
3383 		if (OP(oscan) != CURLYX) {
3384 		    while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3385 			   && NEXT_OFF(next))
3386 			NEXT_OFF(oscan) += NEXT_OFF(next);
3387 		}
3388 		continue;
3389 	    default:			/* REF and CLUMP only? */
3390 		if (flags & SCF_DO_SUBSTR) {
3391 		    SCAN_COMMIT(pRExC_state,data,minlenp);	/* Cannot expect anything... */
3392 		    data->longest = &(data->longest_float);
3393 		}
3394 		is_inf = is_inf_internal = 1;
3395 		if (flags & SCF_DO_STCLASS_OR)
3396 		    cl_anything(pRExC_state, data->start_class);
3397 		flags &= ~SCF_DO_STCLASS;
3398 		break;
3399 	    }
3400 	}
3401 	else if (OP(scan) == LNBREAK) {
3402 	    if (flags & SCF_DO_STCLASS) {
3403 		int value = 0;
3404 		data->start_class->flags &= ~ANYOF_EOS;	/* No match on empty */
3405     	        if (flags & SCF_DO_STCLASS_AND) {
3406                     for (value = 0; value < 256; value++)
3407                         if (!is_VERTWS_cp(value))
3408                             ANYOF_BITMAP_CLEAR(data->start_class, value);
3409                 }
3410                 else {
3411                     for (value = 0; value < 256; value++)
3412                         if (is_VERTWS_cp(value))
3413                             ANYOF_BITMAP_SET(data->start_class, value);
3414                 }
3415                 if (flags & SCF_DO_STCLASS_OR)
3416 		    cl_and(data->start_class, and_withp);
3417 		flags &= ~SCF_DO_STCLASS;
3418             }
3419 	    min += 1;
3420 	    delta += 1;
3421             if (flags & SCF_DO_SUBSTR) {
3422     	        SCAN_COMMIT(pRExC_state,data,minlenp);	/* Cannot expect anything... */
3423     	        data->pos_min += 1;
3424 	        data->pos_delta += 1;
3425 		data->longest = &(data->longest_float);
3426     	    }
3427 
3428 	}
3429 	else if (OP(scan) == FOLDCHAR) {
3430 	    int d = ARG(scan)==0xDF ? 1 : 2;
3431 	    flags &= ~SCF_DO_STCLASS;
3432             min += 1;
3433             delta += d;
3434             if (flags & SCF_DO_SUBSTR) {
3435 	        SCAN_COMMIT(pRExC_state,data,minlenp);	/* Cannot expect anything... */
3436 	        data->pos_min += 1;
3437 	        data->pos_delta += d;
3438 		data->longest = &(data->longest_float);
3439 	    }
3440 	}
3441 	else if (strchr((const char*)PL_simple,OP(scan))) {
3442 	    int value = 0;
3443 
3444 	    if (flags & SCF_DO_SUBSTR) {
3445 		SCAN_COMMIT(pRExC_state,data,minlenp);
3446 		data->pos_min++;
3447 	    }
3448 	    min++;
3449 	    if (flags & SCF_DO_STCLASS) {
3450 		data->start_class->flags &= ~ANYOF_EOS;	/* No match on empty */
3451 
3452 		/* Some of the logic below assumes that switching
3453 		   locale on will only add false positives. */
3454 		switch (PL_regkind[OP(scan)]) {
3455 		case SANY:
3456 		default:
3457 		  do_default:
3458 		    /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3459 		    if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3460 			cl_anything(pRExC_state, data->start_class);
3461 		    break;
3462 		case REG_ANY:
3463 		    if (OP(scan) == SANY)
3464 			goto do_default;
3465 		    if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3466 			value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3467 				 || (data->start_class->flags & ANYOF_CLASS));
3468 			cl_anything(pRExC_state, data->start_class);
3469 		    }
3470 		    if (flags & SCF_DO_STCLASS_AND || !value)
3471 			ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3472 		    break;
3473 		case ANYOF:
3474 		    if (flags & SCF_DO_STCLASS_AND)
3475 			cl_and(data->start_class,
3476 			       (struct regnode_charclass_class*)scan);
3477 		    else
3478 			cl_or(pRExC_state, data->start_class,
3479 			      (struct regnode_charclass_class*)scan);
3480 		    break;
3481 		case ALNUM:
3482 		    if (flags & SCF_DO_STCLASS_AND) {
3483 			if (!(data->start_class->flags & ANYOF_LOCALE)) {
3484 			    ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3485 			    for (value = 0; value < 256; value++)
3486 				if (!isALNUM(value))
3487 				    ANYOF_BITMAP_CLEAR(data->start_class, value);
3488 			}
3489 		    }
3490 		    else {
3491 			if (data->start_class->flags & ANYOF_LOCALE)
3492 			    ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3493 			else {
3494 			    for (value = 0; value < 256; value++)
3495 				if (isALNUM(value))
3496 				    ANYOF_BITMAP_SET(data->start_class, value);
3497 			}
3498 		    }
3499 		    break;
3500 		case ALNUML:
3501 		    if (flags & SCF_DO_STCLASS_AND) {
3502 			if (data->start_class->flags & ANYOF_LOCALE)
3503 			    ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3504 		    }
3505 		    else {
3506 			ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3507 			data->start_class->flags |= ANYOF_LOCALE;
3508 		    }
3509 		    break;
3510 		case NALNUM:
3511 		    if (flags & SCF_DO_STCLASS_AND) {
3512 			if (!(data->start_class->flags & ANYOF_LOCALE)) {
3513 			    ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3514 			    for (value = 0; value < 256; value++)
3515 				if (isALNUM(value))
3516 				    ANYOF_BITMAP_CLEAR(data->start_class, value);
3517 			}
3518 		    }
3519 		    else {
3520 			if (data->start_class->flags & ANYOF_LOCALE)
3521 			    ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3522 			else {
3523 			    for (value = 0; value < 256; value++)
3524 				if (!isALNUM(value))
3525 				    ANYOF_BITMAP_SET(data->start_class, value);
3526 			}
3527 		    }
3528 		    break;
3529 		case NALNUML:
3530 		    if (flags & SCF_DO_STCLASS_AND) {
3531 			if (data->start_class->flags & ANYOF_LOCALE)
3532 			    ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3533 		    }
3534 		    else {
3535 			data->start_class->flags |= ANYOF_LOCALE;
3536 			ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3537 		    }
3538 		    break;
3539 		case SPACE:
3540 		    if (flags & SCF_DO_STCLASS_AND) {
3541 			if (!(data->start_class->flags & ANYOF_LOCALE)) {
3542 			    ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3543 			    for (value = 0; value < 256; value++)
3544 				if (!isSPACE(value))
3545 				    ANYOF_BITMAP_CLEAR(data->start_class, value);
3546 			}
3547 		    }
3548 		    else {
3549 			if (data->start_class->flags & ANYOF_LOCALE)
3550 			    ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3551 			else {
3552 			    for (value = 0; value < 256; value++)
3553 				if (isSPACE(value))
3554 				    ANYOF_BITMAP_SET(data->start_class, value);
3555 			}
3556 		    }
3557 		    break;
3558 		case SPACEL:
3559 		    if (flags & SCF_DO_STCLASS_AND) {
3560 			if (data->start_class->flags & ANYOF_LOCALE)
3561 			    ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3562 		    }
3563 		    else {
3564 			data->start_class->flags |= ANYOF_LOCALE;
3565 			ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3566 		    }
3567 		    break;
3568 		case NSPACE:
3569 		    if (flags & SCF_DO_STCLASS_AND) {
3570 			if (!(data->start_class->flags & ANYOF_LOCALE)) {
3571 			    ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3572 			    for (value = 0; value < 256; value++)
3573 				if (isSPACE(value))
3574 				    ANYOF_BITMAP_CLEAR(data->start_class, value);
3575 			}
3576 		    }
3577 		    else {
3578 			if (data->start_class->flags & ANYOF_LOCALE)
3579 			    ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3580 			else {
3581 			    for (value = 0; value < 256; value++)
3582 				if (!isSPACE(value))
3583 				    ANYOF_BITMAP_SET(data->start_class, value);
3584 			}
3585 		    }
3586 		    break;
3587 		case NSPACEL:
3588 		    if (flags & SCF_DO_STCLASS_AND) {
3589 			if (data->start_class->flags & ANYOF_LOCALE) {
3590 			    ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3591 			    for (value = 0; value < 256; value++)
3592 				if (!isSPACE(value))
3593 				    ANYOF_BITMAP_CLEAR(data->start_class, value);
3594 			}
3595 		    }
3596 		    else {
3597 			data->start_class->flags |= ANYOF_LOCALE;
3598 			ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3599 		    }
3600 		    break;
3601 		case DIGIT:
3602 		    if (flags & SCF_DO_STCLASS_AND) {
3603 			ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3604 			for (value = 0; value < 256; value++)
3605 			    if (!isDIGIT(value))
3606 				ANYOF_BITMAP_CLEAR(data->start_class, value);
3607 		    }
3608 		    else {
3609 			if (data->start_class->flags & ANYOF_LOCALE)
3610 			    ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3611 			else {
3612 			    for (value = 0; value < 256; value++)
3613 				if (isDIGIT(value))
3614 				    ANYOF_BITMAP_SET(data->start_class, value);
3615 			}
3616 		    }
3617 		    break;
3618 		case NDIGIT:
3619 		    if (flags & SCF_DO_STCLASS_AND) {
3620 			ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3621 			for (value = 0; value < 256; value++)
3622 			    if (isDIGIT(value))
3623 				ANYOF_BITMAP_CLEAR(data->start_class, value);
3624 		    }
3625 		    else {
3626 			if (data->start_class->flags & ANYOF_LOCALE)
3627 			    ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3628 			else {
3629 			    for (value = 0; value < 256; value++)
3630 				if (!isDIGIT(value))
3631 				    ANYOF_BITMAP_SET(data->start_class, value);
3632 			}
3633 		    }
3634 		    break;
3635 		CASE_SYNST_FNC(VERTWS);
3636 		CASE_SYNST_FNC(HORIZWS);
3637 
3638 		}
3639 		if (flags & SCF_DO_STCLASS_OR)
3640 		    cl_and(data->start_class, and_withp);
3641 		flags &= ~SCF_DO_STCLASS;
3642 	    }
3643 	}
3644 	else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3645 	    data->flags |= (OP(scan) == MEOL
3646 			    ? SF_BEFORE_MEOL
3647 			    : SF_BEFORE_SEOL);
3648 	}
3649 	else if (  PL_regkind[OP(scan)] == BRANCHJ
3650 		 /* Lookbehind, or need to calculate parens/evals/stclass: */
3651 		   && (scan->flags || data || (flags & SCF_DO_STCLASS))
3652 		   && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3653             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3654                 || OP(scan) == UNLESSM )
3655             {
3656                 /* Negative Lookahead/lookbehind
3657                    In this case we can't do fixed string optimisation.
3658                 */
3659 
3660                 I32 deltanext, minnext, fake = 0;
3661                 regnode *nscan;
3662                 struct regnode_charclass_class intrnl;
3663                 int f = 0;
3664 
3665                 data_fake.flags = 0;
3666                 if (data) {
3667                     data_fake.whilem_c = data->whilem_c;
3668                     data_fake.last_closep = data->last_closep;
3669 		}
3670                 else
3671                     data_fake.last_closep = &fake;
3672 		data_fake.pos_delta = delta;
3673                 if ( flags & SCF_DO_STCLASS && !scan->flags
3674                      && OP(scan) == IFMATCH ) { /* Lookahead */
3675                     cl_init(pRExC_state, &intrnl);
3676                     data_fake.start_class = &intrnl;
3677                     f |= SCF_DO_STCLASS_AND;
3678 		}
3679                 if (flags & SCF_WHILEM_VISITED_POS)
3680                     f |= SCF_WHILEM_VISITED_POS;
3681                 next = regnext(scan);
3682                 nscan = NEXTOPER(NEXTOPER(scan));
3683                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3684                     last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3685                 if (scan->flags) {
3686                     if (deltanext) {
3687 			FAIL("Variable length lookbehind not implemented");
3688                     }
3689                     else if (minnext > (I32)U8_MAX) {
3690 			FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3691                     }
3692                     scan->flags = (U8)minnext;
3693                 }
3694                 if (data) {
3695                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3696                         pars++;
3697                     if (data_fake.flags & SF_HAS_EVAL)
3698                         data->flags |= SF_HAS_EVAL;
3699                     data->whilem_c = data_fake.whilem_c;
3700                 }
3701                 if (f & SCF_DO_STCLASS_AND) {
3702                     const int was = (data->start_class->flags & ANYOF_EOS);
3703 
3704                     cl_and(data->start_class, &intrnl);
3705                     if (was)
3706                         data->start_class->flags |= ANYOF_EOS;
3707                 }
3708 	    }
3709 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3710             else {
3711                 /* Positive Lookahead/lookbehind
3712                    In this case we can do fixed string optimisation,
3713                    but we must be careful about it. Note in the case of
3714                    lookbehind the positions will be offset by the minimum
3715                    length of the pattern, something we won't know about
3716                    until after the recurse.
3717                 */
3718                 I32 deltanext, fake = 0;
3719                 regnode *nscan;
3720                 struct regnode_charclass_class intrnl;
3721                 int f = 0;
3722                 /* We use SAVEFREEPV so that when the full compile
3723                     is finished perl will clean up the allocated
3724                     minlens when its all done. This was we don't
3725                     have to worry about freeing them when we know
3726                     they wont be used, which would be a pain.
3727                  */
3728                 I32 *minnextp;
3729                 Newx( minnextp, 1, I32 );
3730                 SAVEFREEPV(minnextp);
3731 
3732                 if (data) {
3733                     StructCopy(data, &data_fake, scan_data_t);
3734                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3735                         f |= SCF_DO_SUBSTR;
3736                         if (scan->flags)
3737                             SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
3738                         data_fake.last_found=newSVsv(data->last_found);
3739                     }
3740                 }
3741                 else
3742                     data_fake.last_closep = &fake;
3743                 data_fake.flags = 0;
3744 		data_fake.pos_delta = delta;
3745                 if (is_inf)
3746 	            data_fake.flags |= SF_IS_INF;
3747                 if ( flags & SCF_DO_STCLASS && !scan->flags
3748                      && OP(scan) == IFMATCH ) { /* Lookahead */
3749                     cl_init(pRExC_state, &intrnl);
3750                     data_fake.start_class = &intrnl;
3751                     f |= SCF_DO_STCLASS_AND;
3752                 }
3753                 if (flags & SCF_WHILEM_VISITED_POS)
3754                     f |= SCF_WHILEM_VISITED_POS;
3755                 next = regnext(scan);
3756                 nscan = NEXTOPER(NEXTOPER(scan));
3757 
3758                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3759                     last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3760                 if (scan->flags) {
3761                     if (deltanext) {
3762 			FAIL("Variable length lookbehind not implemented");
3763                     }
3764                     else if (*minnextp > (I32)U8_MAX) {
3765 			FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3766                     }
3767                     scan->flags = (U8)*minnextp;
3768                 }
3769 
3770                 *minnextp += min;
3771 
3772                 if (f & SCF_DO_STCLASS_AND) {
3773                     const int was = (data->start_class->flags & ANYOF_EOS);
3774 
3775                     cl_and(data->start_class, &intrnl);
3776                     if (was)
3777                         data->start_class->flags |= ANYOF_EOS;
3778                 }
3779                 if (data) {
3780                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3781                         pars++;
3782                     if (data_fake.flags & SF_HAS_EVAL)
3783                         data->flags |= SF_HAS_EVAL;
3784                     data->whilem_c = data_fake.whilem_c;
3785                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3786                         if (RExC_rx->minlen<*minnextp)
3787                             RExC_rx->minlen=*minnextp;
3788                         SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
3789                         SvREFCNT_dec(data_fake.last_found);
3790 
3791                         if ( data_fake.minlen_fixed != minlenp )
3792                         {
3793                             data->offset_fixed= data_fake.offset_fixed;
3794                             data->minlen_fixed= data_fake.minlen_fixed;
3795                             data->lookbehind_fixed+= scan->flags;
3796                         }
3797                         if ( data_fake.minlen_float != minlenp )
3798                         {
3799                             data->minlen_float= data_fake.minlen_float;
3800                             data->offset_float_min=data_fake.offset_float_min;
3801                             data->offset_float_max=data_fake.offset_float_max;
3802                             data->lookbehind_float+= scan->flags;
3803                         }
3804                     }
3805                 }
3806 
3807 
3808 	    }
3809 #endif
3810 	}
3811 	else if (OP(scan) == OPEN) {
3812 	    if (stopparen != (I32)ARG(scan))
3813 	        pars++;
3814 	}
3815 	else if (OP(scan) == CLOSE) {
3816 	    if (stopparen == (I32)ARG(scan)) {
3817 	        break;
3818 	    }
3819 	    if ((I32)ARG(scan) == is_par) {
3820 		next = regnext(scan);
3821 
3822 		if ( next && (OP(next) != WHILEM) && next < last)
3823 		    is_par = 0;		/* Disable optimization */
3824 	    }
3825 	    if (data)
3826 		*(data->last_closep) = ARG(scan);
3827 	}
3828 	else if (OP(scan) == EVAL) {
3829 		if (data)
3830 		    data->flags |= SF_HAS_EVAL;
3831 	}
3832 	else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3833 	    if (flags & SCF_DO_SUBSTR) {
3834 		SCAN_COMMIT(pRExC_state,data,minlenp);
3835 		flags &= ~SCF_DO_SUBSTR;
3836 	    }
3837 	    if (data && OP(scan)==ACCEPT) {
3838 	        data->flags |= SCF_SEEN_ACCEPT;
3839 	        if (stopmin > min)
3840 	            stopmin = min;
3841 	    }
3842 	}
3843 	else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3844 	{
3845 		if (flags & SCF_DO_SUBSTR) {
3846 		    SCAN_COMMIT(pRExC_state,data,minlenp);
3847 		    data->longest = &(data->longest_float);
3848 		}
3849 		is_inf = is_inf_internal = 1;
3850 		if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3851 		    cl_anything(pRExC_state, data->start_class);
3852 		flags &= ~SCF_DO_STCLASS;
3853 	}
3854 	else if (OP(scan) == GPOS) {
3855 	    if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
3856 	        !(delta || is_inf || (data && data->pos_delta)))
3857 	    {
3858 	        if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3859 		    RExC_rx->extflags |= RXf_ANCH_GPOS;
3860 	        if (RExC_rx->gofs < (U32)min)
3861 		    RExC_rx->gofs = min;
3862             } else {
3863                 RExC_rx->extflags |= RXf_GPOS_FLOAT;
3864                 RExC_rx->gofs = 0;
3865             }
3866 	}
3867 #ifdef TRIE_STUDY_OPT
3868 #ifdef FULL_TRIE_STUDY
3869         else if (PL_regkind[OP(scan)] == TRIE) {
3870             /* NOTE - There is similar code to this block above for handling
3871                BRANCH nodes on the initial study.  If you change stuff here
3872                check there too. */
3873             regnode *trie_node= scan;
3874             regnode *tail= regnext(scan);
3875             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
3876             I32 max1 = 0, min1 = I32_MAX;
3877             struct regnode_charclass_class accum;
3878 
3879             if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3880                 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
3881             if (flags & SCF_DO_STCLASS)
3882                 cl_init_zero(pRExC_state, &accum);
3883 
3884             if (!trie->jump) {
3885                 min1= trie->minlen;
3886                 max1= trie->maxlen;
3887             } else {
3888                 const regnode *nextbranch= NULL;
3889                 U32 word;
3890 
3891                 for ( word=1 ; word <= trie->wordcount ; word++)
3892                 {
3893                     I32 deltanext=0, minnext=0, f = 0, fake;
3894                     struct regnode_charclass_class this_class;
3895 
3896                     data_fake.flags = 0;
3897                     if (data) {
3898                         data_fake.whilem_c = data->whilem_c;
3899                         data_fake.last_closep = data->last_closep;
3900                     }
3901                     else
3902                         data_fake.last_closep = &fake;
3903 		    data_fake.pos_delta = delta;
3904                     if (flags & SCF_DO_STCLASS) {
3905                         cl_init(pRExC_state, &this_class);
3906                         data_fake.start_class = &this_class;
3907                         f = SCF_DO_STCLASS_AND;
3908                     }
3909                     if (flags & SCF_WHILEM_VISITED_POS)
3910                         f |= SCF_WHILEM_VISITED_POS;
3911 
3912                     if (trie->jump[word]) {
3913                         if (!nextbranch)
3914                             nextbranch = trie_node + trie->jump[0];
3915                         scan= trie_node + trie->jump[word];
3916                         /* We go from the jump point to the branch that follows
3917                            it. Note this means we need the vestigal unused branches
3918                            even though they arent otherwise used.
3919                          */
3920                         minnext = study_chunk(pRExC_state, &scan, minlenp,
3921                             &deltanext, (regnode *)nextbranch, &data_fake,
3922                             stopparen, recursed, NULL, f,depth+1);
3923                     }
3924                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3925                         nextbranch= regnext((regnode*)nextbranch);
3926 
3927                     if (min1 > (I32)(minnext + trie->minlen))
3928                         min1 = minnext + trie->minlen;
3929                     if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3930                         max1 = minnext + deltanext + trie->maxlen;
3931                     if (deltanext == I32_MAX)
3932                         is_inf = is_inf_internal = 1;
3933 
3934                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3935                         pars++;
3936                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
3937                         if ( stopmin > min + min1)
3938 	                    stopmin = min + min1;
3939 	                flags &= ~SCF_DO_SUBSTR;
3940 	                if (data)
3941 	                    data->flags |= SCF_SEEN_ACCEPT;
3942 	            }
3943                     if (data) {
3944                         if (data_fake.flags & SF_HAS_EVAL)
3945                             data->flags |= SF_HAS_EVAL;
3946                         data->whilem_c = data_fake.whilem_c;
3947                     }
3948                     if (flags & SCF_DO_STCLASS)
3949                         cl_or(pRExC_state, &accum, &this_class);
3950                 }
3951             }
3952             if (flags & SCF_DO_SUBSTR) {
3953                 data->pos_min += min1;
3954                 data->pos_delta += max1 - min1;
3955                 if (max1 != min1 || is_inf)
3956                     data->longest = &(data->longest_float);
3957             }
3958             min += min1;
3959             delta += max1 - min1;
3960             if (flags & SCF_DO_STCLASS_OR) {
3961                 cl_or(pRExC_state, data->start_class, &accum);
3962                 if (min1) {
3963                     cl_and(data->start_class, and_withp);
3964                     flags &= ~SCF_DO_STCLASS;
3965                 }
3966             }
3967             else if (flags & SCF_DO_STCLASS_AND) {
3968                 if (min1) {
3969                     cl_and(data->start_class, &accum);
3970                     flags &= ~SCF_DO_STCLASS;
3971                 }
3972                 else {
3973                     /* Switch to OR mode: cache the old value of
3974                      * data->start_class */
3975 		    INIT_AND_WITHP;
3976                     StructCopy(data->start_class, and_withp,
3977                                struct regnode_charclass_class);
3978                     flags &= ~SCF_DO_STCLASS_AND;
3979                     StructCopy(&accum, data->start_class,
3980                                struct regnode_charclass_class);
3981                     flags |= SCF_DO_STCLASS_OR;
3982                     data->start_class->flags |= ANYOF_EOS;
3983                 }
3984             }
3985             scan= tail;
3986             continue;
3987         }
3988 #else
3989 	else if (PL_regkind[OP(scan)] == TRIE) {
3990 	    reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
3991 	    U8*bang=NULL;
3992 
3993 	    min += trie->minlen;
3994 	    delta += (trie->maxlen - trie->minlen);
3995 	    flags &= ~SCF_DO_STCLASS; /* xxx */
3996             if (flags & SCF_DO_SUBSTR) {
3997     	        SCAN_COMMIT(pRExC_state,data,minlenp);	/* Cannot expect anything... */
3998     	        data->pos_min += trie->minlen;
3999     	        data->pos_delta += (trie->maxlen - trie->minlen);
4000 		if (trie->maxlen != trie->minlen)
4001 		    data->longest = &(data->longest_float);
4002     	    }
4003     	    if (trie->jump) /* no more substrings -- for now /grr*/
4004     	        flags &= ~SCF_DO_SUBSTR;
4005 	}
4006 #endif /* old or new */
4007 #endif /* TRIE_STUDY_OPT */
4008 
4009 	/* Else: zero-length, ignore. */
4010 	scan = regnext(scan);
4011     }
4012     if (frame) {
4013         last = frame->last;
4014         scan = frame->next;
4015         stopparen = frame->stop;
4016         frame = frame->prev;
4017         goto fake_study_recurse;
4018     }
4019 
4020   finish:
4021     assert(!frame);
4022     DEBUG_STUDYDATA("pre-fin:",data,depth);
4023 
4024     *scanp = scan;
4025     *deltap = is_inf_internal ? I32_MAX : delta;
4026     if (flags & SCF_DO_SUBSTR && is_inf)
4027 	data->pos_delta = I32_MAX - data->pos_min;
4028     if (is_par > (I32)U8_MAX)
4029 	is_par = 0;
4030     if (is_par && pars==1 && data) {
4031 	data->flags |= SF_IN_PAR;
4032 	data->flags &= ~SF_HAS_PAR;
4033     }
4034     else if (pars && data) {
4035 	data->flags |= SF_HAS_PAR;
4036 	data->flags &= ~SF_IN_PAR;
4037     }
4038     if (flags & SCF_DO_STCLASS_OR)
4039 	cl_and(data->start_class, and_withp);
4040     if (flags & SCF_TRIE_RESTUDY)
4041         data->flags |= 	SCF_TRIE_RESTUDY;
4042 
4043     DEBUG_STUDYDATA("post-fin:",data,depth);
4044 
4045     return min < stopmin ? min : stopmin;
4046 }
4047 
4048 STATIC U32
4049 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4050 {
4051     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4052 
4053     Renewc(RExC_rxi->data,
4054 	   sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4055 	   char, struct reg_data);
4056     if(count)
4057 	Renew(RExC_rxi->data->what, count + n, U8);
4058     else
4059 	Newx(RExC_rxi->data->what, n, U8);
4060     RExC_rxi->data->count = count + n;
4061     Copy(s, RExC_rxi->data->what + count, n, U8);
4062     return count;
4063 }
4064 
4065 /*XXX: todo make this not included in a non debugging perl */
4066 #ifndef PERL_IN_XSUB_RE
4067 void
4068 Perl_reginitcolors(pTHX)
4069 {
4070     dVAR;
4071     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4072     if (s) {
4073 	char *t = savepv(s);
4074 	int i = 0;
4075 	PL_colors[0] = t;
4076 	while (++i < 6) {
4077 	    t = strchr(t, '\t');
4078 	    if (t) {
4079 		*t = '\0';
4080 		PL_colors[i] = ++t;
4081 	    }
4082 	    else
4083 		PL_colors[i] = t = (char *)"";
4084 	}
4085     } else {
4086 	int i = 0;
4087 	while (i < 6)
4088 	    PL_colors[i++] = (char *)"";
4089     }
4090     PL_colorset = 1;
4091 }
4092 #endif
4093 
4094 
4095 #ifdef TRIE_STUDY_OPT
4096 #define CHECK_RESTUDY_GOTO                                  \
4097         if (                                                \
4098               (data.flags & SCF_TRIE_RESTUDY)               \
4099               && ! restudied++                              \
4100         )     goto reStudy
4101 #else
4102 #define CHECK_RESTUDY_GOTO
4103 #endif
4104 
4105 /*
4106  - pregcomp - compile a regular expression into internal code
4107  *
4108  * We can't allocate space until we know how big the compiled form will be,
4109  * but we can't compile it (and thus know how big it is) until we've got a
4110  * place to put the code.  So we cheat:  we compile it twice, once with code
4111  * generation turned off and size counting turned on, and once "for real".
4112  * This also means that we don't allocate space until we are sure that the
4113  * thing really will compile successfully, and we never have to move the
4114  * code and thus invalidate pointers into it.  (Note that it has to be in
4115  * one piece because free() must be able to free it all.) [NB: not true in perl]
4116  *
4117  * Beware that the optimization-preparation code in here knows about some
4118  * of the structure of the compiled regexp.  [I'll say.]
4119  */
4120 
4121 
4122 
4123 #ifndef PERL_IN_XSUB_RE
4124 #define RE_ENGINE_PTR &PL_core_reg_engine
4125 #else
4126 extern const struct regexp_engine my_reg_engine;
4127 #define RE_ENGINE_PTR &my_reg_engine
4128 #endif
4129 
4130 #ifndef PERL_IN_XSUB_RE
4131 REGEXP *
4132 Perl_pregcomp(pTHX_ const SV * const pattern, const U32 flags)
4133 {
4134     dVAR;
4135     HV * const table = GvHV(PL_hintgv);
4136     /* Dispatch a request to compile a regexp to correct
4137        regexp engine. */
4138     if (table) {
4139         SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4140         GET_RE_DEBUG_FLAGS_DECL;
4141         if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4142             const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4143             DEBUG_COMPILE_r({
4144                 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4145                     SvIV(*ptr));
4146             });
4147             return CALLREGCOMP_ENG(eng, pattern, flags);
4148         }
4149     }
4150     return Perl_re_compile(aTHX_ pattern, flags);
4151 }
4152 #endif
4153 
4154 REGEXP *
4155 Perl_re_compile(pTHX_ const SV * const pattern, const U32 pm_flags)
4156 {
4157     dVAR;
4158     register REGEXP *r;
4159     register regexp_internal *ri;
4160     STRLEN plen;
4161     char*  exp = SvPV((SV*)pattern, plen);
4162     char* xend = exp + plen;
4163     regnode *scan;
4164     I32 flags;
4165     I32 minlen = 0;
4166     I32 sawplus = 0;
4167     I32 sawopen = 0;
4168     scan_data_t data;
4169     RExC_state_t RExC_state;
4170     RExC_state_t * const pRExC_state = &RExC_state;
4171 #ifdef TRIE_STUDY_OPT
4172     int restudied= 0;
4173     RExC_state_t copyRExC_state;
4174 #endif
4175     GET_RE_DEBUG_FLAGS_DECL;
4176     DEBUG_r(if (!PL_colorset) reginitcolors());
4177 
4178     RExC_utf8 = RExC_orig_utf8 = pm_flags & RXf_UTF8;
4179 
4180     DEBUG_COMPILE_r({
4181         SV *dsv= sv_newmortal();
4182         RE_PV_QUOTED_DECL(s, RExC_utf8,
4183             dsv, exp, plen, 60);
4184         PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4185 		       PL_colors[4],PL_colors[5],s);
4186     });
4187 
4188 redo_first_pass:
4189     RExC_precomp = exp;
4190     RExC_flags = pm_flags;
4191     RExC_sawback = 0;
4192 
4193     RExC_seen = 0;
4194     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4195     RExC_seen_evals = 0;
4196     RExC_extralen = 0;
4197 
4198     /* First pass: determine size, legality. */
4199     RExC_parse = exp;
4200     RExC_start = exp;
4201     RExC_end = xend;
4202     RExC_naughty = 0;
4203     RExC_npar = 1;
4204     RExC_nestroot = 0;
4205     RExC_size = 0L;
4206     RExC_emit = &PL_regdummy;
4207     RExC_whilem_seen = 0;
4208     RExC_charnames = NULL;
4209     RExC_open_parens = NULL;
4210     RExC_close_parens = NULL;
4211     RExC_opend = NULL;
4212     RExC_paren_names = NULL;
4213 #ifdef DEBUGGING
4214     RExC_paren_name_list = NULL;
4215 #endif
4216     RExC_recurse = NULL;
4217     RExC_recurse_count = 0;
4218 
4219 #if 0 /* REGC() is (currently) a NOP at the first pass.
4220        * Clever compilers notice this and complain. --jhi */
4221     REGC((U8)REG_MAGIC, (char*)RExC_emit);
4222 #endif
4223     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4224     if (reg(pRExC_state, 0, &flags,1) == NULL) {
4225 	RExC_precomp = NULL;
4226 	return(NULL);
4227     }
4228     if (RExC_utf8 && !RExC_orig_utf8) {
4229         /* It's possible to write a regexp in ascii that represents Unicode
4230         codepoints outside of the byte range, such as via \x{100}. If we
4231         detect such a sequence we have to convert the entire pattern to utf8
4232         and then recompile, as our sizing calculation will have been based
4233         on 1 byte == 1 character, but we will need to use utf8 to encode
4234         at least some part of the pattern, and therefore must convert the whole
4235         thing.
4236         XXX: somehow figure out how to make this less expensive...
4237         -- dmq */
4238         STRLEN len = plen;
4239         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4240 	    "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4241         exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len);
4242         xend = exp + len;
4243         RExC_orig_utf8 = RExC_utf8;
4244         SAVEFREEPV(exp);
4245         goto redo_first_pass;
4246     }
4247     DEBUG_PARSE_r({
4248         PerlIO_printf(Perl_debug_log,
4249             "Required size %"IVdf" nodes\n"
4250             "Starting second pass (creation)\n",
4251             (IV)RExC_size);
4252         RExC_lastnum=0;
4253         RExC_lastparse=NULL;
4254     });
4255     /* Small enough for pointer-storage convention?
4256        If extralen==0, this means that we will not need long jumps. */
4257     if (RExC_size >= 0x10000L && RExC_extralen)
4258         RExC_size += RExC_extralen;
4259     else
4260 	RExC_extralen = 0;
4261     if (RExC_whilem_seen > 15)
4262 	RExC_whilem_seen = 15;
4263 
4264     /* Allocate space and zero-initialize. Note, the two step process
4265        of zeroing when in debug mode, thus anything assigned has to
4266        happen after that */
4267     Newxz(r, 1, regexp);
4268     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4269 	 char, regexp_internal);
4270     if ( r == NULL || ri == NULL )
4271 	FAIL("Regexp out of space");
4272 #ifdef DEBUGGING
4273     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4274     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4275 #else
4276     /* bulk initialize base fields with 0. */
4277     Zero(ri, sizeof(regexp_internal), char);
4278 #endif
4279 
4280     /* non-zero initialization begins here */
4281     RXi_SET( r, ri );
4282     r->engine= RE_ENGINE_PTR;
4283     r->refcnt = 1;
4284     r->prelen = plen;
4285     r->extflags = pm_flags;
4286     {
4287         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4288 	bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD);
4289 	bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4290 	U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD) >> 12);
4291 	const char *fptr = STD_PAT_MODS;        /*"msix"*/
4292 	char *p;
4293         r->wraplen = r->prelen + has_minus + has_p + has_runon
4294             + (sizeof(STD_PAT_MODS) - 1)
4295             + (sizeof("(?:)") - 1);
4296 
4297         Newx(r->wrapped, r->wraplen + 1, char );
4298         p = r->wrapped;
4299         *p++='('; *p++='?';
4300         if (has_p)
4301             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
4302         {
4303             char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1;
4304             char *colon = r + 1;
4305             char ch;
4306 
4307             while((ch = *fptr++)) {
4308                 if(reganch & 1)
4309                     *p++ = ch;
4310                 else
4311                     *r-- = ch;
4312                 reganch >>= 1;
4313             }
4314             if(has_minus) {
4315                 *r = '-';
4316                 p = colon;
4317             }
4318         }
4319 
4320         *p++ = ':';
4321         Copy(RExC_precomp, p, r->prelen, char);
4322         r->precomp = p;
4323         p += r->prelen;
4324         if (has_runon)
4325             *p++ = '\n';
4326         *p++ = ')';
4327         *p = 0;
4328     }
4329 
4330     r->intflags = 0;
4331     r->nparens = RExC_npar - 1;	/* set early to validate backrefs */
4332 
4333     if (RExC_seen & REG_SEEN_RECURSE) {
4334         Newxz(RExC_open_parens, RExC_npar,regnode *);
4335         SAVEFREEPV(RExC_open_parens);
4336         Newxz(RExC_close_parens,RExC_npar,regnode *);
4337         SAVEFREEPV(RExC_close_parens);
4338     }
4339 
4340     /* Useful during FAIL. */
4341 #ifdef RE_TRACK_PATTERN_OFFSETS
4342     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4343     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4344                           "%s %"UVuf" bytes for offset annotations.\n",
4345                           ri->u.offsets ? "Got" : "Couldn't get",
4346                           (UV)((2*RExC_size+1) * sizeof(U32))));
4347 #endif
4348     SetProgLen(ri,RExC_size);
4349     RExC_rx = r;
4350     RExC_rxi = ri;
4351 
4352     /* Second pass: emit code. */
4353     RExC_flags = pm_flags;	/* don't let top level (?i) bleed */
4354     RExC_parse = exp;
4355     RExC_end = xend;
4356     RExC_naughty = 0;
4357     RExC_npar = 1;
4358     RExC_emit_start = ri->program;
4359     RExC_emit = ri->program;
4360     RExC_emit_bound = ri->program + RExC_size + 1;
4361 
4362     /* Store the count of eval-groups for security checks: */
4363     RExC_rx->seen_evals = RExC_seen_evals;
4364     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4365     if (reg(pRExC_state, 0, &flags,1) == NULL) {
4366 	ReREFCNT_dec(r);
4367 	return(NULL);
4368     }
4369     /* XXXX To minimize changes to RE engine we always allocate
4370        3-units-long substrs field. */
4371     Newx(r->substrs, 1, struct reg_substr_data);
4372     if (RExC_recurse_count) {
4373         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4374         SAVEFREEPV(RExC_recurse);
4375     }
4376 
4377 reStudy:
4378     r->minlen = minlen = sawplus = sawopen = 0;
4379     Zero(r->substrs, 1, struct reg_substr_data);
4380 
4381 #ifdef TRIE_STUDY_OPT
4382     if ( restudied ) {
4383         U32 seen=RExC_seen;
4384         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4385 
4386         RExC_state = copyRExC_state;
4387         if (seen & REG_TOP_LEVEL_BRANCHES)
4388             RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4389         else
4390             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4391         if (data.last_found) {
4392             SvREFCNT_dec(data.longest_fixed);
4393 	    SvREFCNT_dec(data.longest_float);
4394 	    SvREFCNT_dec(data.last_found);
4395 	}
4396 	StructCopy(&zero_scan_data, &data, scan_data_t);
4397     } else {
4398         StructCopy(&zero_scan_data, &data, scan_data_t);
4399         copyRExC_state = RExC_state;
4400     }
4401 #else
4402     StructCopy(&zero_scan_data, &data, scan_data_t);
4403 #endif
4404 
4405     /* Dig out information for optimizations. */
4406     r->extflags = RExC_flags; /* was pm_op */
4407     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4408 
4409     if (UTF)
4410         r->extflags |= RXf_UTF8;	/* Unicode in it? */
4411     ri->regstclass = NULL;
4412     if (RExC_naughty >= 10)	/* Probably an expensive pattern. */
4413 	r->intflags |= PREGf_NAUGHTY;
4414     scan = ri->program + 1;		/* First BRANCH. */
4415 
4416     /* testing for BRANCH here tells us whether there is "must appear"
4417        data in the pattern. If there is then we can use it for optimisations */
4418     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
4419 	I32 fake;
4420 	STRLEN longest_float_length, longest_fixed_length;
4421 	struct regnode_charclass_class ch_class; /* pointed to by data */
4422 	int stclass_flag;
4423 	I32 last_close = 0; /* pointed to by data */
4424         regnode *first= scan;
4425         regnode *first_next= regnext(first);
4426 
4427 	/* Skip introductions and multiplicators >= 1. */
4428 	while ((OP(first) == OPEN && (sawopen = 1)) ||
4429 	       /* An OR of *one* alternative - should not happen now. */
4430 	    (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
4431 	    /* for now we can't handle lookbehind IFMATCH*/
4432 	    (OP(first) == IFMATCH && !first->flags) ||
4433 	    (OP(first) == PLUS) ||
4434 	    (OP(first) == MINMOD) ||
4435 	       /* An {n,m} with n>0 */
4436 	    (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4437 	    (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
4438 	{
4439 
4440 		if (OP(first) == PLUS)
4441 		    sawplus = 1;
4442 		else
4443 		    first += regarglen[OP(first)];
4444 		if (OP(first) == IFMATCH) {
4445 		    first = NEXTOPER(first);
4446 		    first += EXTRA_STEP_2ARGS;
4447 		} else  /* XXX possible optimisation for /(?=)/  */
4448 		    first = NEXTOPER(first);
4449 		first_next= regnext(first);
4450 	}
4451 
4452 	/* Starting-point info. */
4453       again:
4454         DEBUG_PEEP("first:",first,0);
4455         /* Ignore EXACT as we deal with it later. */
4456 	if (PL_regkind[OP(first)] == EXACT) {
4457 	    if (OP(first) == EXACT)
4458 		NOOP;	/* Empty, get anchored substr later. */
4459 	    else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
4460 		ri->regstclass = first;
4461 	}
4462 #ifdef TRIE_STCLASS
4463 	else if (PL_regkind[OP(first)] == TRIE &&
4464 	        ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4465 	{
4466 	    regnode *trie_op;
4467 	    /* this can happen only on restudy */
4468 	    if ( OP(first) == TRIE ) {
4469                 struct regnode_1 *trieop = (struct regnode_1 *)
4470 		    PerlMemShared_calloc(1, sizeof(struct regnode_1));
4471                 StructCopy(first,trieop,struct regnode_1);
4472                 trie_op=(regnode *)trieop;
4473             } else {
4474                 struct regnode_charclass *trieop = (struct regnode_charclass *)
4475 		    PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4476                 StructCopy(first,trieop,struct regnode_charclass);
4477                 trie_op=(regnode *)trieop;
4478             }
4479             OP(trie_op)+=2;
4480             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4481 	    ri->regstclass = trie_op;
4482 	}
4483 #endif
4484 	else if (strchr((const char*)PL_simple,OP(first)))
4485 	    ri->regstclass = first;
4486 	else if (PL_regkind[OP(first)] == BOUND ||
4487 		 PL_regkind[OP(first)] == NBOUND)
4488 	    ri->regstclass = first;
4489 	else if (PL_regkind[OP(first)] == BOL) {
4490 	    r->extflags |= (OP(first) == MBOL
4491 			   ? RXf_ANCH_MBOL
4492 			   : (OP(first) == SBOL
4493 			      ? RXf_ANCH_SBOL
4494 			      : RXf_ANCH_BOL));
4495 	    first = NEXTOPER(first);
4496 	    goto again;
4497 	}
4498 	else if (OP(first) == GPOS) {
4499 	    r->extflags |= RXf_ANCH_GPOS;
4500 	    first = NEXTOPER(first);
4501 	    goto again;
4502 	}
4503 	else if ((!sawopen || !RExC_sawback) &&
4504 	    (OP(first) == STAR &&
4505 	    PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4506 	    !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4507 	{
4508 	    /* turn .* into ^.* with an implied $*=1 */
4509 	    const int type =
4510 		(OP(NEXTOPER(first)) == REG_ANY)
4511 		    ? RXf_ANCH_MBOL
4512 		    : RXf_ANCH_SBOL;
4513 	    r->extflags |= type;
4514 	    r->intflags |= PREGf_IMPLICIT;
4515 	    first = NEXTOPER(first);
4516 	    goto again;
4517 	}
4518 	if (sawplus && (!sawopen || !RExC_sawback)
4519 	    && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4520 	    /* x+ must match at the 1st pos of run of x's */
4521 	    r->intflags |= PREGf_SKIP;
4522 
4523 	/* Scan is after the zeroth branch, first is atomic matcher. */
4524 #ifdef TRIE_STUDY_OPT
4525 	DEBUG_PARSE_r(
4526 	    if (!restudied)
4527 	        PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4528 			      (IV)(first - scan + 1))
4529         );
4530 #else
4531 	DEBUG_PARSE_r(
4532 	    PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4533 	        (IV)(first - scan + 1))
4534         );
4535 #endif
4536 
4537 
4538 	/*
4539 	* If there's something expensive in the r.e., find the
4540 	* longest literal string that must appear and make it the
4541 	* regmust.  Resolve ties in favor of later strings, since
4542 	* the regstart check works with the beginning of the r.e.
4543 	* and avoiding duplication strengthens checking.  Not a
4544 	* strong reason, but sufficient in the absence of others.
4545 	* [Now we resolve ties in favor of the earlier string if
4546 	* it happens that c_offset_min has been invalidated, since the
4547 	* earlier string may buy us something the later one won't.]
4548 	*/
4549 
4550 	data.longest_fixed = newSVpvs("");
4551 	data.longest_float = newSVpvs("");
4552 	data.last_found = newSVpvs("");
4553 	data.longest = &(data.longest_fixed);
4554 	first = scan;
4555 	if (!ri->regstclass) {
4556 	    cl_init(pRExC_state, &ch_class);
4557 	    data.start_class = &ch_class;
4558 	    stclass_flag = SCF_DO_STCLASS_AND;
4559 	} else				/* XXXX Check for BOUND? */
4560 	    stclass_flag = 0;
4561 	data.last_closep = &last_close;
4562 
4563 	minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
4564             &data, -1, NULL, NULL,
4565             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
4566 
4567 
4568         CHECK_RESTUDY_GOTO;
4569 
4570 
4571 	if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
4572 	     && data.last_start_min == 0 && data.last_end > 0
4573 	     && !RExC_seen_zerolen
4574 	     && !(RExC_seen & REG_SEEN_VERBARG)
4575 	     && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4576 	    r->extflags |= RXf_CHECK_ALL;
4577 	scan_commit(pRExC_state, &data,&minlen,0);
4578 	SvREFCNT_dec(data.last_found);
4579 
4580         /* Note that code very similar to this but for anchored string
4581            follows immediately below, changes may need to be made to both.
4582            Be careful.
4583          */
4584 	longest_float_length = CHR_SVLEN(data.longest_float);
4585 	if (longest_float_length
4586 	    || (data.flags & SF_FL_BEFORE_EOL
4587 		&& (!(data.flags & SF_FL_BEFORE_MEOL)
4588 		    || (RExC_flags & RXf_PMf_MULTILINE))))
4589         {
4590             I32 t,ml;
4591 
4592 	    if (SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
4593 		&& data.offset_fixed == data.offset_float_min
4594 		&& SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4595 		    goto remove_float;		/* As in (a)+. */
4596 
4597             /* copy the information about the longest float from the reg_scan_data
4598                over to the program. */
4599 	    if (SvUTF8(data.longest_float)) {
4600 		r->float_utf8 = data.longest_float;
4601 		r->float_substr = NULL;
4602 	    } else {
4603 		r->float_substr = data.longest_float;
4604 		r->float_utf8 = NULL;
4605 	    }
4606 	    /* float_end_shift is how many chars that must be matched that
4607 	       follow this item. We calculate it ahead of time as once the
4608 	       lookbehind offset is added in we lose the ability to correctly
4609 	       calculate it.*/
4610 	    ml = data.minlen_float ? *(data.minlen_float)
4611 	                           : (I32)longest_float_length;
4612 	    r->float_end_shift = ml - data.offset_float_min
4613 	        - longest_float_length + (SvTAIL(data.longest_float) != 0)
4614 	        + data.lookbehind_float;
4615 	    r->float_min_offset = data.offset_float_min - data.lookbehind_float;
4616 	    r->float_max_offset = data.offset_float_max;
4617 	    if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
4618 	        r->float_max_offset -= data.lookbehind_float;
4619 
4620 	    t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4621 		       && (!(data.flags & SF_FL_BEFORE_MEOL)
4622 			   || (RExC_flags & RXf_PMf_MULTILINE)));
4623 	    fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
4624 	}
4625 	else {
4626 	  remove_float:
4627 	    r->float_substr = r->float_utf8 = NULL;
4628 	    SvREFCNT_dec(data.longest_float);
4629 	    longest_float_length = 0;
4630 	}
4631 
4632         /* Note that code very similar to this but for floating string
4633            is immediately above, changes may need to be made to both.
4634            Be careful.
4635          */
4636 	longest_fixed_length = CHR_SVLEN(data.longest_fixed);
4637 	if (longest_fixed_length
4638 	    || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4639 		&& (!(data.flags & SF_FIX_BEFORE_MEOL)
4640 		    || (RExC_flags & RXf_PMf_MULTILINE))))
4641         {
4642             I32 t,ml;
4643 
4644             /* copy the information about the longest fixed
4645                from the reg_scan_data over to the program. */
4646 	    if (SvUTF8(data.longest_fixed)) {
4647 		r->anchored_utf8 = data.longest_fixed;
4648 		r->anchored_substr = NULL;
4649 	    } else {
4650 		r->anchored_substr = data.longest_fixed;
4651 		r->anchored_utf8 = NULL;
4652 	    }
4653 	    /* fixed_end_shift is how many chars that must be matched that
4654 	       follow this item. We calculate it ahead of time as once the
4655 	       lookbehind offset is added in we lose the ability to correctly
4656 	       calculate it.*/
4657             ml = data.minlen_fixed ? *(data.minlen_fixed)
4658                                    : (I32)longest_fixed_length;
4659             r->anchored_end_shift = ml - data.offset_fixed
4660 	        - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4661 	        + data.lookbehind_fixed;
4662 	    r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4663 
4664 	    t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4665 		 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4666 		     || (RExC_flags & RXf_PMf_MULTILINE)));
4667 	    fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
4668 	}
4669 	else {
4670 	    r->anchored_substr = r->anchored_utf8 = NULL;
4671 	    SvREFCNT_dec(data.longest_fixed);
4672 	    longest_fixed_length = 0;
4673 	}
4674 	if (ri->regstclass
4675 	    && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4676 	    ri->regstclass = NULL;
4677 	if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4678 	    && stclass_flag
4679 	    && !(data.start_class->flags & ANYOF_EOS)
4680 	    && !cl_is_anything(data.start_class))
4681 	{
4682 	    const U32 n = add_data(pRExC_state, 1, "f");
4683 
4684 	    Newx(RExC_rxi->data->data[n], 1,
4685 		struct regnode_charclass_class);
4686 	    StructCopy(data.start_class,
4687 		       (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4688 		       struct regnode_charclass_class);
4689 	    ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4690 	    r->intflags &= ~PREGf_SKIP;	/* Used in find_byclass(). */
4691 	    DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
4692 	              regprop(r, sv, (regnode*)data.start_class);
4693 		      PerlIO_printf(Perl_debug_log,
4694 				    "synthetic stclass \"%s\".\n",
4695 				    SvPVX_const(sv));});
4696 	}
4697 
4698 	/* A temporary algorithm prefers floated substr to fixed one to dig more info. */
4699 	if (longest_fixed_length > longest_float_length) {
4700 	    r->check_end_shift = r->anchored_end_shift;
4701 	    r->check_substr = r->anchored_substr;
4702 	    r->check_utf8 = r->anchored_utf8;
4703 	    r->check_offset_min = r->check_offset_max = r->anchored_offset;
4704 	    if (r->extflags & RXf_ANCH_SINGLE)
4705 		r->extflags |= RXf_NOSCAN;
4706 	}
4707 	else {
4708 	    r->check_end_shift = r->float_end_shift;
4709 	    r->check_substr = r->float_substr;
4710 	    r->check_utf8 = r->float_utf8;
4711 	    r->check_offset_min = r->float_min_offset;
4712 	    r->check_offset_max = r->float_max_offset;
4713 	}
4714 	/* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4715 	   This should be changed ASAP!  */
4716 	if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4717 	    r->extflags |= RXf_USE_INTUIT;
4718 	    if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
4719 		r->extflags |= RXf_INTUIT_TAIL;
4720 	}
4721 	/* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4722 	if ( (STRLEN)minlen < longest_float_length )
4723             minlen= longest_float_length;
4724         if ( (STRLEN)minlen < longest_fixed_length )
4725             minlen= longest_fixed_length;
4726         */
4727     }
4728     else {
4729 	/* Several toplevels. Best we can is to set minlen. */
4730 	I32 fake;
4731 	struct regnode_charclass_class ch_class;
4732 	I32 last_close = 0;
4733 
4734 	DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
4735 
4736 	scan = ri->program + 1;
4737 	cl_init(pRExC_state, &ch_class);
4738 	data.start_class = &ch_class;
4739 	data.last_closep = &last_close;
4740 
4741 
4742 	minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
4743 	    &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
4744 
4745         CHECK_RESTUDY_GOTO;
4746 
4747 	r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
4748 		= r->float_substr = r->float_utf8 = NULL;
4749 	if (!(data.start_class->flags & ANYOF_EOS)
4750 	    && !cl_is_anything(data.start_class))
4751 	{
4752 	    const U32 n = add_data(pRExC_state, 1, "f");
4753 
4754 	    Newx(RExC_rxi->data->data[n], 1,
4755 		struct regnode_charclass_class);
4756 	    StructCopy(data.start_class,
4757 		       (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4758 		       struct regnode_charclass_class);
4759 	    ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4760 	    r->intflags &= ~PREGf_SKIP;	/* Used in find_byclass(). */
4761 	    DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
4762 	              regprop(r, sv, (regnode*)data.start_class);
4763 		      PerlIO_printf(Perl_debug_log,
4764 				    "synthetic stclass \"%s\".\n",
4765 				    SvPVX_const(sv));});
4766 	}
4767     }
4768 
4769     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4770        the "real" pattern. */
4771     DEBUG_OPTIMISE_r({
4772 	PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
4773 		      (IV)minlen, (IV)r->minlen);
4774     });
4775     r->minlenret = minlen;
4776     if (r->minlen < minlen)
4777         r->minlen = minlen;
4778 
4779     if (RExC_seen & REG_SEEN_GPOS)
4780 	r->extflags |= RXf_GPOS_SEEN;
4781     if (RExC_seen & REG_SEEN_LOOKBEHIND)
4782 	r->extflags |= RXf_LOOKBEHIND_SEEN;
4783     if (RExC_seen & REG_SEEN_EVAL)
4784 	r->extflags |= RXf_EVAL_SEEN;
4785     if (RExC_seen & REG_SEEN_CANY)
4786 	r->extflags |= RXf_CANY_SEEN;
4787     if (RExC_seen & REG_SEEN_VERBARG)
4788 	r->intflags |= PREGf_VERBARG_SEEN;
4789     if (RExC_seen & REG_SEEN_CUTGROUP)
4790 	r->intflags |= PREGf_CUTGROUP_SEEN;
4791     if (RExC_paren_names)
4792         r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names);
4793     else
4794         r->paren_names = NULL;
4795 
4796 #ifdef STUPID_PATTERN_CHECKS
4797     if (r->prelen == 0)
4798         r->extflags |= RXf_NULL;
4799     if (r->extflags & RXf_SPLIT && r->prelen == 1 && r->precomp[0] == ' ')
4800         /* XXX: this should happen BEFORE we compile */
4801         r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
4802     else if (r->prelen == 3 && memEQ("\\s+", r->precomp, 3))
4803         r->extflags |= RXf_WHITE;
4804     else if (r->prelen == 1 && r->precomp[0] == '^')
4805         r->extflags |= RXf_START_ONLY;
4806 #else
4807     if (r->extflags & RXf_SPLIT && r->prelen == 1 && r->precomp[0] == ' ')
4808             /* XXX: this should happen BEFORE we compile */
4809             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
4810     else {
4811         regnode *first = ri->program + 1;
4812         U8 fop = OP(first);
4813         U8 nop = OP(NEXTOPER(first));
4814 
4815         if (PL_regkind[fop] == NOTHING && nop == END)
4816             r->extflags |= RXf_NULL;
4817         else if (PL_regkind[fop] == BOL && nop == END)
4818             r->extflags |= RXf_START_ONLY;
4819         else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
4820             r->extflags |= RXf_WHITE;
4821     }
4822 #endif
4823 #ifdef DEBUGGING
4824     if (RExC_paren_names) {
4825         ri->name_list_idx = add_data( pRExC_state, 1, "p" );
4826         ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
4827     } else
4828 #endif
4829         ri->name_list_idx = 0;
4830 
4831     if (RExC_recurse_count) {
4832         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4833             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4834             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4835         }
4836     }
4837     Newxz(r->offs, RExC_npar, regexp_paren_pair);
4838     /* assume we don't need to swap parens around before we match */
4839 
4840     DEBUG_DUMP_r({
4841         PerlIO_printf(Perl_debug_log,"Final program:\n");
4842         regdump(r);
4843     });
4844 #ifdef RE_TRACK_PATTERN_OFFSETS
4845     DEBUG_OFFSETS_r(if (ri->u.offsets) {
4846         const U32 len = ri->u.offsets[0];
4847         U32 i;
4848         GET_RE_DEBUG_FLAGS_DECL;
4849         PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
4850         for (i = 1; i <= len; i++) {
4851             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
4852                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
4853                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
4854             }
4855         PerlIO_printf(Perl_debug_log, "\n");
4856     });
4857 #endif
4858     return(r);
4859 }
4860 
4861 #undef RE_ENGINE_PTR
4862 
4863 
4864 SV*
4865 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
4866                     const U32 flags)
4867 {
4868     PERL_UNUSED_ARG(value);
4869 
4870     if (flags & RXapif_FETCH) {
4871         return reg_named_buff_fetch(rx, key, flags);
4872     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
4873         Perl_croak(aTHX_ PL_no_modify);
4874         return NULL;
4875     } else if (flags & RXapif_EXISTS) {
4876         return reg_named_buff_exists(rx, key, flags)
4877             ? &PL_sv_yes
4878             : &PL_sv_no;
4879     } else if (flags & RXapif_REGNAMES) {
4880         return reg_named_buff_all(rx, flags);
4881     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
4882         return reg_named_buff_scalar(rx, flags);
4883     } else {
4884         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
4885         return NULL;
4886     }
4887 }
4888 
4889 SV*
4890 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
4891                          const U32 flags)
4892 {
4893     PERL_UNUSED_ARG(lastkey);
4894 
4895     if (flags & RXapif_FIRSTKEY)
4896         return reg_named_buff_firstkey(rx, flags);
4897     else if (flags & RXapif_NEXTKEY)
4898         return reg_named_buff_nextkey(rx, flags);
4899     else {
4900         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
4901         return NULL;
4902     }
4903 }
4904 
4905 SV*
4906 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const rx, SV * const namesv, const U32 flags)
4907 {
4908     AV *retarray = NULL;
4909     SV *ret;
4910     if (flags & RXapif_ALL)
4911         retarray=newAV();
4912 
4913     if (rx && rx->paren_names) {
4914         HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 );
4915         if (he_str) {
4916             IV i;
4917             SV* sv_dat=HeVAL(he_str);
4918             I32 *nums=(I32*)SvPVX(sv_dat);
4919             for ( i=0; i<SvIVX(sv_dat); i++ ) {
4920                 if ((I32)(rx->nparens) >= nums[i]
4921                     && rx->offs[nums[i]].start != -1
4922                     && rx->offs[nums[i]].end != -1)
4923                 {
4924                     ret = newSVpvs("");
4925                     CALLREG_NUMBUF_FETCH(rx,nums[i],ret);
4926                     if (!retarray)
4927                         return ret;
4928                 } else {
4929                     ret = newSVsv(&PL_sv_undef);
4930                 }
4931                 if (retarray) {
4932                     SvREFCNT_inc_simple_void(ret);
4933                     av_push(retarray, ret);
4934                 }
4935             }
4936             if (retarray)
4937                 return newRV((SV*)retarray);
4938         }
4939     }
4940     return NULL;
4941 }
4942 
4943 bool
4944 Perl_reg_named_buff_exists(pTHX_ REGEXP * const rx, SV * const key,
4945                            const U32 flags)
4946 {
4947     if (rx && rx->paren_names) {
4948         if (flags & RXapif_ALL) {
4949             return hv_exists_ent(rx->paren_names, key, 0);
4950         } else {
4951 	    SV *sv = CALLREG_NAMED_BUFF_FETCH(rx, key, flags);
4952             if (sv) {
4953 		SvREFCNT_dec(sv);
4954                 return TRUE;
4955             } else {
4956                 return FALSE;
4957             }
4958         }
4959     } else {
4960         return FALSE;
4961     }
4962 }
4963 
4964 SV*
4965 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const rx, const U32 flags)
4966 {
4967     if ( rx && rx->paren_names ) {
4968 	(void)hv_iterinit(rx->paren_names);
4969 
4970 	return CALLREG_NAMED_BUFF_NEXTKEY(rx, NULL, flags & ~RXapif_FIRSTKEY);
4971     } else {
4972 	return FALSE;
4973     }
4974 }
4975 
4976 SV*
4977 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const rx, const U32 flags)
4978 {
4979     if (rx && rx->paren_names) {
4980         HV *hv = rx->paren_names;
4981         HE *temphe;
4982         while ( (temphe = hv_iternext_flags(hv,0)) ) {
4983             IV i;
4984             IV parno = 0;
4985             SV* sv_dat = HeVAL(temphe);
4986             I32 *nums = (I32*)SvPVX(sv_dat);
4987             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
4988                 if ((I32)(rx->lastcloseparen) >= nums[i] &&
4989                     rx->offs[nums[i]].start != -1 &&
4990                     rx->offs[nums[i]].end != -1)
4991                 {
4992                     parno = nums[i];
4993                     break;
4994                 }
4995             }
4996             if (parno || flags & RXapif_ALL) {
4997                 STRLEN len;
4998                 char *pv = HePV(temphe, len);
4999                 return newSVpvn(pv,len);
5000             }
5001         }
5002     }
5003     return NULL;
5004 }
5005 
5006 SV*
5007 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const rx, const U32 flags)
5008 {
5009     SV *ret;
5010     AV *av;
5011     I32 length;
5012 
5013     if (rx && rx->paren_names) {
5014         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5015             return newSViv(HvTOTALKEYS(rx->paren_names));
5016         } else if (flags & RXapif_ONE) {
5017             ret = CALLREG_NAMED_BUFF_ALL(rx, (flags | RXapif_REGNAMES));
5018             av = (AV*)SvRV(ret);
5019             length = av_len(av);
5020             return newSViv(length + 1);
5021         } else {
5022             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5023             return NULL;
5024         }
5025     }
5026     return &PL_sv_undef;
5027 }
5028 
5029 SV*
5030 Perl_reg_named_buff_all(pTHX_ REGEXP * const rx, const U32 flags)
5031 {
5032     AV *av = newAV();
5033 
5034     if (rx && rx->paren_names) {
5035         HV *hv= rx->paren_names;
5036         HE *temphe;
5037         (void)hv_iterinit(hv);
5038         while ( (temphe = hv_iternext_flags(hv,0)) ) {
5039             IV i;
5040             IV parno = 0;
5041             SV* sv_dat = HeVAL(temphe);
5042             I32 *nums = (I32*)SvPVX(sv_dat);
5043             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5044                 if ((I32)(rx->lastcloseparen) >= nums[i] &&
5045                     rx->offs[nums[i]].start != -1 &&
5046                     rx->offs[nums[i]].end != -1)
5047                 {
5048                     parno = nums[i];
5049                     break;
5050                 }
5051             }
5052             if (parno || flags & RXapif_ALL) {
5053                 STRLEN len;
5054                 char *pv = HePV(temphe, len);
5055                 av_push(av, newSVpvn(pv,len));
5056             }
5057         }
5058     }
5059 
5060     return newRV((SV*)av);
5061 }
5062 
5063 void
5064 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const rx, const I32 paren, SV * const sv)
5065 {
5066     char *s = NULL;
5067     I32 i = 0;
5068     I32 s1, t1;
5069 
5070     if (!rx->subbeg) {
5071         sv_setsv(sv,&PL_sv_undef);
5072         return;
5073     }
5074     else
5075     if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5076         /* $` */
5077 	i = rx->offs[0].start;
5078 	s = rx->subbeg;
5079     }
5080     else
5081     if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5082         /* $' */
5083 	s = rx->subbeg + rx->offs[0].end;
5084 	i = rx->sublen - rx->offs[0].end;
5085     }
5086     else
5087     if ( 0 <= paren && paren <= (I32)rx->nparens &&
5088         (s1 = rx->offs[paren].start) != -1 &&
5089         (t1 = rx->offs[paren].end) != -1)
5090     {
5091         /* $& $1 ... */
5092         i = t1 - s1;
5093         s = rx->subbeg + s1;
5094     } else {
5095         sv_setsv(sv,&PL_sv_undef);
5096         return;
5097     }
5098     assert(rx->sublen >= (s - rx->subbeg) + i );
5099     if (i >= 0) {
5100         const int oldtainted = PL_tainted;
5101         TAINT_NOT;
5102         sv_setpvn(sv, s, i);
5103         PL_tainted = oldtainted;
5104         if ( (rx->extflags & RXf_CANY_SEEN)
5105             ? (RX_MATCH_UTF8(rx)
5106                         && (!i || is_utf8_string((U8*)s, i)))
5107             : (RX_MATCH_UTF8(rx)) )
5108         {
5109             SvUTF8_on(sv);
5110         }
5111         else
5112             SvUTF8_off(sv);
5113         if (PL_tainting) {
5114             if (RX_MATCH_TAINTED(rx)) {
5115                 if (SvTYPE(sv) >= SVt_PVMG) {
5116                     MAGIC* const mg = SvMAGIC(sv);
5117                     MAGIC* mgt;
5118                     PL_tainted = 1;
5119                     SvMAGIC_set(sv, mg->mg_moremagic);
5120                     SvTAINT(sv);
5121                     if ((mgt = SvMAGIC(sv))) {
5122                         mg->mg_moremagic = mgt;
5123                         SvMAGIC_set(sv, mg);
5124                     }
5125                 } else {
5126                     PL_tainted = 1;
5127                     SvTAINT(sv);
5128                 }
5129             } else
5130                 SvTAINTED_off(sv);
5131         }
5132     } else {
5133         sv_setsv(sv,&PL_sv_undef);
5134         return;
5135     }
5136 }
5137 
5138 void
5139 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5140 							 SV const * const value)
5141 {
5142     PERL_UNUSED_ARG(rx);
5143     PERL_UNUSED_ARG(paren);
5144     PERL_UNUSED_ARG(value);
5145 
5146     if (!PL_localizing)
5147         Perl_croak(aTHX_ PL_no_modify);
5148 }
5149 
5150 I32
5151 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const rx, const SV * const sv,
5152                               const I32 paren)
5153 {
5154     I32 i;
5155     I32 s1, t1;
5156 
5157     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5158 	switch (paren) {
5159       /* $` / ${^PREMATCH} */
5160       case RX_BUFF_IDX_PREMATCH:
5161         if (rx->offs[0].start != -1) {
5162 			i = rx->offs[0].start;
5163 			if (i > 0) {
5164 				s1 = 0;
5165 				t1 = i;
5166 				goto getlen;
5167 			}
5168 	    }
5169         return 0;
5170       /* $' / ${^POSTMATCH} */
5171       case RX_BUFF_IDX_POSTMATCH:
5172 	    if (rx->offs[0].end != -1) {
5173 			i = rx->sublen - rx->offs[0].end;
5174 			if (i > 0) {
5175 				s1 = rx->offs[0].end;
5176 				t1 = rx->sublen;
5177 				goto getlen;
5178 			}
5179 	    }
5180         return 0;
5181       /* $& / ${^MATCH}, $1, $2, ... */
5182       default:
5183 	    if (paren <= (I32)rx->nparens &&
5184             (s1 = rx->offs[paren].start) != -1 &&
5185             (t1 = rx->offs[paren].end) != -1)
5186 	    {
5187             i = t1 - s1;
5188             goto getlen;
5189         } else {
5190             if (ckWARN(WARN_UNINITIALIZED))
5191                 report_uninit((SV*)sv);
5192             return 0;
5193         }
5194     }
5195   getlen:
5196     if (i > 0 && RX_MATCH_UTF8(rx)) {
5197         const char * const s = rx->subbeg + s1;
5198         const U8 *ep;
5199         STRLEN el;
5200 
5201         i = t1 - s1;
5202         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5203 			i = el;
5204     }
5205     return i;
5206 }
5207 
5208 SV*
5209 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
5210 {
5211 	PERL_UNUSED_ARG(rx);
5212 	return newSVpvs("Regexp");
5213 }
5214 
5215 /* Scans the name of a named buffer from the pattern.
5216  * If flags is REG_RSN_RETURN_NULL returns null.
5217  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5218  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5219  * to the parsed name as looked up in the RExC_paren_names hash.
5220  * If there is an error throws a vFAIL().. type exception.
5221  */
5222 
5223 #define REG_RSN_RETURN_NULL    0
5224 #define REG_RSN_RETURN_NAME    1
5225 #define REG_RSN_RETURN_DATA    2
5226 
5227 STATIC SV*
5228 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) {
5229     char *name_start = RExC_parse;
5230 
5231     if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5232 	 /* skip IDFIRST by using do...while */
5233 	if (UTF)
5234 	    do {
5235 		RExC_parse += UTF8SKIP(RExC_parse);
5236 	    } while (isALNUM_utf8((U8*)RExC_parse));
5237 	else
5238 	    do {
5239 		RExC_parse++;
5240 	    } while (isALNUM(*RExC_parse));
5241     }
5242 
5243     if ( flags ) {
5244         SV* sv_name = sv_2mortal(Perl_newSVpvn(aTHX_ name_start,
5245             (int)(RExC_parse - name_start)));
5246 	if (UTF)
5247             SvUTF8_on(sv_name);
5248         if ( flags == REG_RSN_RETURN_NAME)
5249             return sv_name;
5250         else if (flags==REG_RSN_RETURN_DATA) {
5251             HE *he_str = NULL;
5252             SV *sv_dat = NULL;
5253             if ( ! sv_name )      /* should not happen*/
5254                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5255             if (RExC_paren_names)
5256                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5257             if ( he_str )
5258                 sv_dat = HeVAL(he_str);
5259             if ( ! sv_dat )
5260                 vFAIL("Reference to nonexistent named group");
5261             return sv_dat;
5262         }
5263         else {
5264             Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5265         }
5266         /* NOT REACHED */
5267     }
5268     return NULL;
5269 }
5270 
5271 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
5272     int rem=(int)(RExC_end - RExC_parse);                       \
5273     int cut;                                                    \
5274     int num;                                                    \
5275     int iscut=0;                                                \
5276     if (rem>10) {                                               \
5277         rem=10;                                                 \
5278         iscut=1;                                                \
5279     }                                                           \
5280     cut=10-rem;                                                 \
5281     if (RExC_lastparse!=RExC_parse)                             \
5282         PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
5283             rem, RExC_parse,                                    \
5284             cut + 4,                                            \
5285             iscut ? "..." : "<"                                 \
5286         );                                                      \
5287     else                                                        \
5288         PerlIO_printf(Perl_debug_log,"%16s","");                \
5289                                                                 \
5290     if (SIZE_ONLY)                                              \
5291        num = RExC_size + 1;                                     \
5292     else                                                        \
5293        num=REG_NODE_NUM(RExC_emit);                             \
5294     if (RExC_lastnum!=num)                                      \
5295        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
5296     else                                                        \
5297        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
5298     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
5299         (int)((depth*2)), "",                                   \
5300         (funcname)                                              \
5301     );                                                          \
5302     RExC_lastnum=num;                                           \
5303     RExC_lastparse=RExC_parse;                                  \
5304 })
5305 
5306 
5307 
5308 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
5309     DEBUG_PARSE_MSG((funcname));                            \
5310     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
5311 })
5312 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
5313     DEBUG_PARSE_MSG((funcname));                            \
5314     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
5315 })
5316 /*
5317  - reg - regular expression, i.e. main body or parenthesized thing
5318  *
5319  * Caller must absorb opening parenthesis.
5320  *
5321  * Combining parenthesis handling with the base level of regular expression
5322  * is a trifle forced, but the need to tie the tails of the branches to what
5323  * follows makes it hard to avoid.
5324  */
5325 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
5326 #ifdef DEBUGGING
5327 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
5328 #else
5329 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
5330 #endif
5331 
5332 STATIC regnode *
5333 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
5334     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
5335 {
5336     dVAR;
5337     register regnode *ret;		/* Will be the head of the group. */
5338     register regnode *br;
5339     register regnode *lastbr;
5340     register regnode *ender = NULL;
5341     register I32 parno = 0;
5342     I32 flags;
5343     U32 oregflags = RExC_flags;
5344     bool have_branch = 0;
5345     bool is_open = 0;
5346     I32 freeze_paren = 0;
5347     I32 after_freeze = 0;
5348 
5349     /* for (?g), (?gc), and (?o) warnings; warning
5350        about (?c) will warn about (?g) -- japhy    */
5351 
5352 #define WASTED_O  0x01
5353 #define WASTED_G  0x02
5354 #define WASTED_C  0x04
5355 #define WASTED_GC (0x02|0x04)
5356     I32 wastedflags = 0x00;
5357 
5358     char * parse_start = RExC_parse; /* MJD */
5359     char * const oregcomp_parse = RExC_parse;
5360 
5361     GET_RE_DEBUG_FLAGS_DECL;
5362     DEBUG_PARSE("reg ");
5363 
5364     *flagp = 0;				/* Tentatively. */
5365 
5366 
5367     /* Make an OPEN node, if parenthesized. */
5368     if (paren) {
5369         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
5370 	    char *start_verb = RExC_parse;
5371 	    STRLEN verb_len = 0;
5372 	    char *start_arg = NULL;
5373 	    unsigned char op = 0;
5374 	    int argok = 1;
5375 	    int internal_argval = 0; /* internal_argval is only useful if !argok */
5376 	    while ( *RExC_parse && *RExC_parse != ')' ) {
5377 	        if ( *RExC_parse == ':' ) {
5378 	            start_arg = RExC_parse + 1;
5379 	            break;
5380 	        }
5381 	        RExC_parse++;
5382 	    }
5383 	    ++start_verb;
5384 	    verb_len = RExC_parse - start_verb;
5385 	    if ( start_arg ) {
5386 	        RExC_parse++;
5387 	        while ( *RExC_parse && *RExC_parse != ')' )
5388 	            RExC_parse++;
5389 	        if ( *RExC_parse != ')' )
5390 	            vFAIL("Unterminated verb pattern argument");
5391 	        if ( RExC_parse == start_arg )
5392 	            start_arg = NULL;
5393 	    } else {
5394 	        if ( *RExC_parse != ')' )
5395 	            vFAIL("Unterminated verb pattern");
5396 	    }
5397 
5398 	    switch ( *start_verb ) {
5399             case 'A':  /* (*ACCEPT) */
5400                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
5401 		    op = ACCEPT;
5402 		    internal_argval = RExC_nestroot;
5403 		}
5404 		break;
5405             case 'C':  /* (*COMMIT) */
5406                 if ( memEQs(start_verb,verb_len,"COMMIT") )
5407                     op = COMMIT;
5408                 break;
5409             case 'F':  /* (*FAIL) */
5410                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
5411 		    op = OPFAIL;
5412 		    argok = 0;
5413 		}
5414 		break;
5415             case ':':  /* (*:NAME) */
5416 	    case 'M':  /* (*MARK:NAME) */
5417 	        if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
5418                     op = MARKPOINT;
5419                     argok = -1;
5420                 }
5421                 break;
5422             case 'P':  /* (*PRUNE) */
5423                 if ( memEQs(start_verb,verb_len,"PRUNE") )
5424                     op = PRUNE;
5425                 break;
5426             case 'S':   /* (*SKIP) */
5427                 if ( memEQs(start_verb,verb_len,"SKIP") )
5428                     op = SKIP;
5429                 break;
5430             case 'T':  /* (*THEN) */
5431                 /* [19:06] <TimToady> :: is then */
5432                 if ( memEQs(start_verb,verb_len,"THEN") ) {
5433                     op = CUTGROUP;
5434                     RExC_seen |= REG_SEEN_CUTGROUP;
5435                 }
5436                 break;
5437 	    }
5438 	    if ( ! op ) {
5439 	        RExC_parse++;
5440 	        vFAIL3("Unknown verb pattern '%.*s'",
5441 	            verb_len, start_verb);
5442 	    }
5443 	    if ( argok ) {
5444                 if ( start_arg && internal_argval ) {
5445 	            vFAIL3("Verb pattern '%.*s' may not have an argument",
5446 	                verb_len, start_verb);
5447 	        } else if ( argok < 0 && !start_arg ) {
5448                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
5449 	                verb_len, start_verb);
5450 	        } else {
5451 	            ret = reganode(pRExC_state, op, internal_argval);
5452 	            if ( ! internal_argval && ! SIZE_ONLY ) {
5453                         if (start_arg) {
5454                             SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
5455                             ARG(ret) = add_data( pRExC_state, 1, "S" );
5456                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
5457                             ret->flags = 0;
5458                         } else {
5459                             ret->flags = 1;
5460                         }
5461                     }
5462 	        }
5463 	        if (!internal_argval)
5464 	            RExC_seen |= REG_SEEN_VERBARG;
5465 	    } else if ( start_arg ) {
5466 	        vFAIL3("Verb pattern '%.*s' may not have an argument",
5467 	                verb_len, start_verb);
5468 	    } else {
5469 	        ret = reg_node(pRExC_state, op);
5470 	    }
5471 	    nextchar(pRExC_state);
5472 	    return ret;
5473         } else
5474 	if (*RExC_parse == '?') { /* (?...) */
5475 	    bool is_logical = 0;
5476 	    const char * const seqstart = RExC_parse;
5477 
5478 	    RExC_parse++;
5479 	    paren = *RExC_parse++;
5480 	    ret = NULL;			/* For look-ahead/behind. */
5481 	    switch (paren) {
5482 
5483 	    case 'P':	/* (?P...) variants for those used to PCRE/Python */
5484 	        paren = *RExC_parse++;
5485 		if ( paren == '<')         /* (?P<...>) named capture */
5486 		    goto named_capture;
5487                 else if (paren == '>') {   /* (?P>name) named recursion */
5488                     goto named_recursion;
5489                 }
5490                 else if (paren == '=') {   /* (?P=...)  named backref */
5491                     /* this pretty much dupes the code for \k<NAME> in regatom(), if
5492                        you change this make sure you change that */
5493                     char* name_start = RExC_parse;
5494 		    U32 num = 0;
5495                     SV *sv_dat = reg_scan_name(pRExC_state,
5496                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5497                     if (RExC_parse == name_start || *RExC_parse != ')')
5498                         vFAIL2("Sequence %.3s... not terminated",parse_start);
5499 
5500                     if (!SIZE_ONLY) {
5501                         num = add_data( pRExC_state, 1, "S" );
5502                         RExC_rxi->data->data[num]=(void*)sv_dat;
5503                         SvREFCNT_inc_simple_void(sv_dat);
5504                     }
5505                     RExC_sawback = 1;
5506                     ret = reganode(pRExC_state,
5507                     	   (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
5508                     	   num);
5509                     *flagp |= HASWIDTH;
5510 
5511                     Set_Node_Offset(ret, parse_start+1);
5512                     Set_Node_Cur_Length(ret); /* MJD */
5513 
5514                     nextchar(pRExC_state);
5515                     return ret;
5516                 }
5517                 RExC_parse++;
5518 		vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5519 		/*NOTREACHED*/
5520             case '<':           /* (?<...) */
5521 		if (*RExC_parse == '!')
5522 		    paren = ',';
5523 		else if (*RExC_parse != '=')
5524               named_capture:
5525 		{               /* (?<...>) */
5526 		    char *name_start;
5527 		    SV *svname;
5528 		    paren= '>';
5529             case '\'':          /* (?'...') */
5530     		    name_start= RExC_parse;
5531     		    svname = reg_scan_name(pRExC_state,
5532     		        SIZE_ONLY ?  /* reverse test from the others */
5533     		        REG_RSN_RETURN_NAME :
5534     		        REG_RSN_RETURN_NULL);
5535 		    if (RExC_parse == name_start) {
5536 		        RExC_parse++;
5537 		        vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5538 		        /*NOTREACHED*/
5539                     }
5540 		    if (*RExC_parse != paren)
5541 		        vFAIL2("Sequence (?%c... not terminated",
5542 		            paren=='>' ? '<' : paren);
5543 		    if (SIZE_ONLY) {
5544 			HE *he_str;
5545 			SV *sv_dat = NULL;
5546                         if (!svname) /* shouldnt happen */
5547                             Perl_croak(aTHX_
5548                                 "panic: reg_scan_name returned NULL");
5549                         if (!RExC_paren_names) {
5550                             RExC_paren_names= newHV();
5551                             sv_2mortal((SV*)RExC_paren_names);
5552 #ifdef DEBUGGING
5553                             RExC_paren_name_list= newAV();
5554                             sv_2mortal((SV*)RExC_paren_name_list);
5555 #endif
5556                         }
5557                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
5558                         if ( he_str )
5559                             sv_dat = HeVAL(he_str);
5560                         if ( ! sv_dat ) {
5561                             /* croak baby croak */
5562                             Perl_croak(aTHX_
5563                                 "panic: paren_name hash element allocation failed");
5564                         } else if ( SvPOK(sv_dat) ) {
5565                             /* (?|...) can mean we have dupes so scan to check
5566                                its already been stored. Maybe a flag indicating
5567                                we are inside such a construct would be useful,
5568                                but the arrays are likely to be quite small, so
5569                                for now we punt -- dmq */
5570                             IV count = SvIV(sv_dat);
5571                             I32 *pv = (I32*)SvPVX(sv_dat);
5572                             IV i;
5573                             for ( i = 0 ; i < count ; i++ ) {
5574                                 if ( pv[i] == RExC_npar ) {
5575                                     count = 0;
5576                                     break;
5577                                 }
5578                             }
5579                             if ( count ) {
5580                                 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
5581                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
5582                                 pv[count] = RExC_npar;
5583                                 SvIVX(sv_dat)++;
5584                             }
5585                         } else {
5586                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
5587                             sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5588                             SvIOK_on(sv_dat);
5589                             SvIVX(sv_dat)= 1;
5590                         }
5591 #ifdef DEBUGGING
5592                         if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5593                             SvREFCNT_dec(svname);
5594 #endif
5595 
5596                         /*sv_dump(sv_dat);*/
5597                     }
5598                     nextchar(pRExC_state);
5599 		    paren = 1;
5600 		    goto capturing_parens;
5601 		}
5602                 RExC_seen |= REG_SEEN_LOOKBEHIND;
5603 		RExC_parse++;
5604 	    case '=':           /* (?=...) */
5605 	    case '!':           /* (?!...) */
5606 		RExC_seen_zerolen++;
5607 	        if (*RExC_parse == ')') {
5608 	            ret=reg_node(pRExC_state, OPFAIL);
5609 	            nextchar(pRExC_state);
5610 	            return ret;
5611 	        }
5612 	        break;
5613 	    case '|':           /* (?|...) */
5614 	        /* branch reset, behave like a (?:...) except that
5615 	           buffers in alternations share the same numbers */
5616 	        paren = ':';
5617 	        after_freeze = freeze_paren = RExC_npar;
5618 	        break;
5619 	    case ':':           /* (?:...) */
5620 	    case '>':           /* (?>...) */
5621 		break;
5622 	    case '$':           /* (?$...) */
5623 	    case '@':           /* (?@...) */
5624 		vFAIL2("Sequence (?%c...) not implemented", (int)paren);
5625 		break;
5626 	    case '#':           /* (?#...) */
5627 		while (*RExC_parse && *RExC_parse != ')')
5628 		    RExC_parse++;
5629 		if (*RExC_parse != ')')
5630 		    FAIL("Sequence (?#... not terminated");
5631 		nextchar(pRExC_state);
5632 		*flagp = TRYAGAIN;
5633 		return NULL;
5634 	    case '0' :           /* (?0) */
5635 	    case 'R' :           /* (?R) */
5636 		if (*RExC_parse != ')')
5637 		    FAIL("Sequence (?R) not terminated");
5638 		ret = reg_node(pRExC_state, GOSTART);
5639 		*flagp |= POSTPONED;
5640 		nextchar(pRExC_state);
5641 		return ret;
5642 		/*notreached*/
5643             { /* named and numeric backreferences */
5644                 I32 num;
5645             case '&':            /* (?&NAME) */
5646                 parse_start = RExC_parse - 1;
5647               named_recursion:
5648                 {
5649     		    SV *sv_dat = reg_scan_name(pRExC_state,
5650     		        SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5651     		     num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5652                 }
5653                 goto gen_recurse_regop;
5654                 /* NOT REACHED */
5655             case '+':
5656                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5657                     RExC_parse++;
5658                     vFAIL("Illegal pattern");
5659                 }
5660                 goto parse_recursion;
5661                 /* NOT REACHED*/
5662             case '-': /* (?-1) */
5663                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5664                     RExC_parse--; /* rewind to let it be handled later */
5665                     goto parse_flags;
5666                 }
5667                 /*FALLTHROUGH */
5668             case '1': case '2': case '3': case '4': /* (?1) */
5669 	    case '5': case '6': case '7': case '8': case '9':
5670 	        RExC_parse--;
5671               parse_recursion:
5672 		num = atoi(RExC_parse);
5673   	        parse_start = RExC_parse - 1; /* MJD */
5674 	        if (*RExC_parse == '-')
5675 	            RExC_parse++;
5676 		while (isDIGIT(*RExC_parse))
5677 			RExC_parse++;
5678 	        if (*RExC_parse!=')')
5679 	            vFAIL("Expecting close bracket");
5680 
5681               gen_recurse_regop:
5682                 if ( paren == '-' ) {
5683                     /*
5684                     Diagram of capture buffer numbering.
5685                     Top line is the normal capture buffer numbers
5686                     Botton line is the negative indexing as from
5687                     the X (the (?-2))
5688 
5689                     +   1 2    3 4 5 X          6 7
5690                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5691                     -   5 4    3 2 1 X          x x
5692 
5693                     */
5694                     num = RExC_npar + num;
5695                     if (num < 1)  {
5696                         RExC_parse++;
5697                         vFAIL("Reference to nonexistent group");
5698                     }
5699                 } else if ( paren == '+' ) {
5700                     num = RExC_npar + num - 1;
5701                 }
5702 
5703                 ret = reganode(pRExC_state, GOSUB, num);
5704                 if (!SIZE_ONLY) {
5705 		    if (num > (I32)RExC_rx->nparens) {
5706 			RExC_parse++;
5707 			vFAIL("Reference to nonexistent group");
5708 	            }
5709 	            ARG2L_SET( ret, RExC_recurse_count++);
5710                     RExC_emit++;
5711 		    DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5712 			"Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
5713 		} else {
5714 		    RExC_size++;
5715     		}
5716     		RExC_seen |= REG_SEEN_RECURSE;
5717                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
5718 		Set_Node_Offset(ret, parse_start); /* MJD */
5719 
5720                 *flagp |= POSTPONED;
5721                 nextchar(pRExC_state);
5722                 return ret;
5723             } /* named and numeric backreferences */
5724             /* NOT REACHED */
5725 
5726 	    case '?':           /* (??...) */
5727 		is_logical = 1;
5728 		if (*RExC_parse != '{') {
5729 		    RExC_parse++;
5730 		    vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5731 		    /*NOTREACHED*/
5732 		}
5733 		*flagp |= POSTPONED;
5734 		paren = *RExC_parse++;
5735 		/* FALL THROUGH */
5736 	    case '{':           /* (?{...}) */
5737 	    {
5738 		I32 count = 1;
5739 		U32 n = 0;
5740 		char c;
5741 		char *s = RExC_parse;
5742 
5743 		RExC_seen_zerolen++;
5744 		RExC_seen |= REG_SEEN_EVAL;
5745 		while (count && (c = *RExC_parse)) {
5746 		    if (c == '\\') {
5747 			if (RExC_parse[1])
5748 			    RExC_parse++;
5749 		    }
5750 		    else if (c == '{')
5751 			count++;
5752 		    else if (c == '}')
5753 			count--;
5754 		    RExC_parse++;
5755 		}
5756 		if (*RExC_parse != ')') {
5757 		    RExC_parse = s;
5758 		    vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5759 		}
5760 		if (!SIZE_ONLY) {
5761 		    PAD *pad;
5762 		    OP_4tree *sop, *rop;
5763 		    SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
5764 
5765 		    ENTER;
5766 		    Perl_save_re_context(aTHX);
5767 		    rop = sv_compile_2op(sv, &sop, "re", &pad);
5768 		    sop->op_private |= OPpREFCOUNTED;
5769 		    /* re_dup will OpREFCNT_inc */
5770 		    OpREFCNT_set(sop, 1);
5771 		    LEAVE;
5772 
5773 		    n = add_data(pRExC_state, 3, "nop");
5774 		    RExC_rxi->data->data[n] = (void*)rop;
5775 		    RExC_rxi->data->data[n+1] = (void*)sop;
5776 		    RExC_rxi->data->data[n+2] = (void*)pad;
5777 		    SvREFCNT_dec(sv);
5778 		}
5779 		else {						/* First pass */
5780 		    if (PL_reginterp_cnt < ++RExC_seen_evals
5781 			&& IN_PERL_RUNTIME)
5782 			/* No compiled RE interpolated, has runtime
5783 			   components ===> unsafe.  */
5784 			FAIL("Eval-group not allowed at runtime, use re 'eval'");
5785 		    if (PL_tainting && PL_tainted)
5786 			FAIL("Eval-group in insecure regular expression");
5787 #if PERL_VERSION > 8
5788 		    if (IN_PERL_COMPILETIME)
5789 			PL_cv_has_eval = 1;
5790 #endif
5791 		}
5792 
5793 		nextchar(pRExC_state);
5794 		if (is_logical) {
5795 		    ret = reg_node(pRExC_state, LOGICAL);
5796 		    if (!SIZE_ONLY)
5797 			ret->flags = 2;
5798                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
5799                     /* deal with the length of this later - MJD */
5800 		    return ret;
5801 		}
5802 		ret = reganode(pRExC_state, EVAL, n);
5803 		Set_Node_Length(ret, RExC_parse - parse_start + 1);
5804 		Set_Node_Offset(ret, parse_start);
5805 		return ret;
5806 	    }
5807 	    case '(':           /* (?(?{...})...) and (?(?=...)...) */
5808 	    {
5809 	        int is_define= 0;
5810 		if (RExC_parse[0] == '?') {        /* (?(?...)) */
5811 		    if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5812 			|| RExC_parse[1] == '<'
5813 			|| RExC_parse[1] == '{') { /* Lookahead or eval. */
5814 			I32 flag;
5815 
5816 			ret = reg_node(pRExC_state, LOGICAL);
5817 			if (!SIZE_ONLY)
5818 			    ret->flags = 1;
5819                         REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
5820 			goto insert_if;
5821 		    }
5822 		}
5823 		else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
5824 		         || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5825 	        {
5826 	            char ch = RExC_parse[0] == '<' ? '>' : '\'';
5827 	            char *name_start= RExC_parse++;
5828 	            U32 num = 0;
5829 	            SV *sv_dat=reg_scan_name(pRExC_state,
5830 	                SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5831 	            if (RExC_parse == name_start || *RExC_parse != ch)
5832                         vFAIL2("Sequence (?(%c... not terminated",
5833                             (ch == '>' ? '<' : ch));
5834                     RExC_parse++;
5835 	            if (!SIZE_ONLY) {
5836                         num = add_data( pRExC_state, 1, "S" );
5837                         RExC_rxi->data->data[num]=(void*)sv_dat;
5838                         SvREFCNT_inc_simple_void(sv_dat);
5839                     }
5840                     ret = reganode(pRExC_state,NGROUPP,num);
5841                     goto insert_if_check_paren;
5842 		}
5843 		else if (RExC_parse[0] == 'D' &&
5844 		         RExC_parse[1] == 'E' &&
5845 		         RExC_parse[2] == 'F' &&
5846 		         RExC_parse[3] == 'I' &&
5847 		         RExC_parse[4] == 'N' &&
5848 		         RExC_parse[5] == 'E')
5849 		{
5850 		    ret = reganode(pRExC_state,DEFINEP,0);
5851 		    RExC_parse +=6 ;
5852 		    is_define = 1;
5853 		    goto insert_if_check_paren;
5854 		}
5855 		else if (RExC_parse[0] == 'R') {
5856 		    RExC_parse++;
5857 		    parno = 0;
5858 		    if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5859 		        parno = atoi(RExC_parse++);
5860 		        while (isDIGIT(*RExC_parse))
5861 			    RExC_parse++;
5862 		    } else if (RExC_parse[0] == '&') {
5863 		        SV *sv_dat;
5864 		        RExC_parse++;
5865 		        sv_dat = reg_scan_name(pRExC_state,
5866     		            SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5867     		        parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5868 		    }
5869 		    ret = reganode(pRExC_state,INSUBP,parno);
5870 		    goto insert_if_check_paren;
5871 		}
5872 		else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5873                     /* (?(1)...) */
5874 		    char c;
5875 		    parno = atoi(RExC_parse++);
5876 
5877 		    while (isDIGIT(*RExC_parse))
5878 			RExC_parse++;
5879                     ret = reganode(pRExC_state, GROUPP, parno);
5880 
5881                  insert_if_check_paren:
5882 		    if ((c = *nextchar(pRExC_state)) != ')')
5883 			vFAIL("Switch condition not recognized");
5884 		  insert_if:
5885                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
5886                     br = regbranch(pRExC_state, &flags, 1,depth+1);
5887 		    if (br == NULL)
5888 			br = reganode(pRExC_state, LONGJMP, 0);
5889 		    else
5890                         REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
5891 		    c = *nextchar(pRExC_state);
5892 		    if (flags&HASWIDTH)
5893 			*flagp |= HASWIDTH;
5894 		    if (c == '|') {
5895 		        if (is_define)
5896 		            vFAIL("(?(DEFINE)....) does not allow branches");
5897 			lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
5898                         regbranch(pRExC_state, &flags, 1,depth+1);
5899                         REGTAIL(pRExC_state, ret, lastbr);
5900 		 	if (flags&HASWIDTH)
5901 			    *flagp |= HASWIDTH;
5902 			c = *nextchar(pRExC_state);
5903 		    }
5904 		    else
5905 			lastbr = NULL;
5906 		    if (c != ')')
5907 			vFAIL("Switch (?(condition)... contains too many branches");
5908 		    ender = reg_node(pRExC_state, TAIL);
5909                     REGTAIL(pRExC_state, br, ender);
5910 		    if (lastbr) {
5911                         REGTAIL(pRExC_state, lastbr, ender);
5912                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
5913 		    }
5914 		    else
5915                         REGTAIL(pRExC_state, ret, ender);
5916                     RExC_size++; /* XXX WHY do we need this?!!
5917                                     For large programs it seems to be required
5918                                     but I can't figure out why. -- dmq*/
5919 		    return ret;
5920 		}
5921 		else {
5922 		    vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
5923 		}
5924 	    }
5925             case 0:
5926 		RExC_parse--; /* for vFAIL to print correctly */
5927                 vFAIL("Sequence (? incomplete");
5928                 break;
5929 	    default:
5930 	        --RExC_parse;
5931 	        parse_flags:      /* (?i) */
5932 	    {
5933                 U32 posflags = 0, negflags = 0;
5934 	        U32 *flagsp = &posflags;
5935 
5936 		while (*RExC_parse) {
5937 		    /* && strchr("iogcmsx", *RExC_parse) */
5938 		    /* (?g), (?gc) and (?o) are useless here
5939 		       and must be globally applied -- japhy */
5940                     switch (*RExC_parse) {
5941 	            CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
5942                     case ONCE_PAT_MOD: /* 'o' */
5943                     case GLOBAL_PAT_MOD: /* 'g' */
5944 			if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5945 			    const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
5946 			    if (! (wastedflags & wflagbit) ) {
5947 				wastedflags |= wflagbit;
5948 				vWARN5(
5949 				    RExC_parse + 1,
5950 				    "Useless (%s%c) - %suse /%c modifier",
5951 				    flagsp == &negflags ? "?-" : "?",
5952 				    *RExC_parse,
5953 				    flagsp == &negflags ? "don't " : "",
5954 				    *RExC_parse
5955 				);
5956 			    }
5957 			}
5958 			break;
5959 
5960 		    case CONTINUE_PAT_MOD: /* 'c' */
5961 			if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5962 			    if (! (wastedflags & WASTED_C) ) {
5963 				wastedflags |= WASTED_GC;
5964 				vWARN3(
5965 				    RExC_parse + 1,
5966 				    "Useless (%sc) - %suse /gc modifier",
5967 				    flagsp == &negflags ? "?-" : "?",
5968 				    flagsp == &negflags ? "don't " : ""
5969 				);
5970 			    }
5971 			}
5972 			break;
5973 	            case KEEPCOPY_PAT_MOD: /* 'p' */
5974                         if (flagsp == &negflags) {
5975                             if (SIZE_ONLY && ckWARN(WARN_REGEXP))
5976                                 vWARN(RExC_parse + 1,"Useless use of (?-p)");
5977                         } else {
5978                             *flagsp |= RXf_PMf_KEEPCOPY;
5979                         }
5980 	                break;
5981                     case '-':
5982                         if (flagsp == &negflags) {
5983                             RExC_parse++;
5984 		            vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5985 		            /*NOTREACHED*/
5986 		        }
5987 			flagsp = &negflags;
5988 		        wastedflags = 0;  /* reset so (?g-c) warns twice */
5989 		        break;
5990                     case ':':
5991 		        paren = ':';
5992 		        /*FALLTHROUGH*/
5993                     case ')':
5994                         RExC_flags |= posflags;
5995                         RExC_flags &= ~negflags;
5996                         if (paren != ':') {
5997                             oregflags |= posflags;
5998                             oregflags &= ~negflags;
5999                         }
6000                         nextchar(pRExC_state);
6001 		        if (paren != ':') {
6002 		            *flagp = TRYAGAIN;
6003 		            return NULL;
6004 		        } else {
6005                             ret = NULL;
6006 		            goto parse_rest;
6007 		        }
6008 		        /*NOTREACHED*/
6009                     default:
6010 		        RExC_parse++;
6011 		        vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6012 		        /*NOTREACHED*/
6013                     }
6014 		    ++RExC_parse;
6015 		}
6016 	    }} /* one for the default block, one for the switch */
6017 	}
6018 	else {                  /* (...) */
6019 	  capturing_parens:
6020 	    parno = RExC_npar;
6021 	    RExC_npar++;
6022 
6023 	    ret = reganode(pRExC_state, OPEN, parno);
6024 	    if (!SIZE_ONLY ){
6025 	        if (!RExC_nestroot)
6026 	            RExC_nestroot = parno;
6027 	        if (RExC_seen & REG_SEEN_RECURSE
6028 	            && !RExC_open_parens[parno-1])
6029 	        {
6030 		    DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6031 			"Setting open paren #%"IVdf" to %d\n",
6032 			(IV)parno, REG_NODE_NUM(ret)));
6033 	            RExC_open_parens[parno-1]= ret;
6034 	        }
6035 	    }
6036             Set_Node_Length(ret, 1); /* MJD */
6037             Set_Node_Offset(ret, RExC_parse); /* MJD */
6038 	    is_open = 1;
6039 	}
6040     }
6041     else                        /* ! paren */
6042 	ret = NULL;
6043 
6044    parse_rest:
6045     /* Pick up the branches, linking them together. */
6046     parse_start = RExC_parse;   /* MJD */
6047     br = regbranch(pRExC_state, &flags, 1,depth+1);
6048     /*     branch_len = (paren != 0); */
6049 
6050     if (br == NULL)
6051 	return(NULL);
6052     if (*RExC_parse == '|') {
6053 	if (!SIZE_ONLY && RExC_extralen) {
6054 	    reginsert(pRExC_state, BRANCHJ, br, depth+1);
6055 	}
6056 	else {                  /* MJD */
6057 	    reginsert(pRExC_state, BRANCH, br, depth+1);
6058             Set_Node_Length(br, paren != 0);
6059             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
6060         }
6061 	have_branch = 1;
6062 	if (SIZE_ONLY)
6063 	    RExC_extralen += 1;		/* For BRANCHJ-BRANCH. */
6064     }
6065     else if (paren == ':') {
6066 	*flagp |= flags&SIMPLE;
6067     }
6068     if (is_open) {				/* Starts with OPEN. */
6069         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
6070     }
6071     else if (paren != '?')		/* Not Conditional */
6072 	ret = br;
6073     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6074     lastbr = br;
6075     while (*RExC_parse == '|') {
6076 	if (!SIZE_ONLY && RExC_extralen) {
6077 	    ender = reganode(pRExC_state, LONGJMP,0);
6078             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
6079 	}
6080 	if (SIZE_ONLY)
6081 	    RExC_extralen += 2;		/* Account for LONGJMP. */
6082 	nextchar(pRExC_state);
6083 	if (freeze_paren) {
6084 	    if (RExC_npar > after_freeze)
6085 	        after_freeze = RExC_npar;
6086             RExC_npar = freeze_paren;
6087         }
6088         br = regbranch(pRExC_state, &flags, 0, depth+1);
6089 
6090 	if (br == NULL)
6091 	    return(NULL);
6092         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
6093 	lastbr = br;
6094 	*flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6095     }
6096 
6097     if (have_branch || paren != ':') {
6098 	/* Make a closing node, and hook it on the end. */
6099 	switch (paren) {
6100 	case ':':
6101 	    ender = reg_node(pRExC_state, TAIL);
6102 	    break;
6103 	case 1:
6104 	    ender = reganode(pRExC_state, CLOSE, parno);
6105 	    if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
6106 		DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6107 			"Setting close paren #%"IVdf" to %d\n",
6108 			(IV)parno, REG_NODE_NUM(ender)));
6109 	        RExC_close_parens[parno-1]= ender;
6110 	        if (RExC_nestroot == parno)
6111 	            RExC_nestroot = 0;
6112 	    }
6113             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
6114             Set_Node_Length(ender,1); /* MJD */
6115 	    break;
6116 	case '<':
6117 	case ',':
6118 	case '=':
6119 	case '!':
6120 	    *flagp &= ~HASWIDTH;
6121 	    /* FALL THROUGH */
6122 	case '>':
6123 	    ender = reg_node(pRExC_state, SUCCEED);
6124 	    break;
6125 	case 0:
6126 	    ender = reg_node(pRExC_state, END);
6127 	    if (!SIZE_ONLY) {
6128                 assert(!RExC_opend); /* there can only be one! */
6129                 RExC_opend = ender;
6130             }
6131 	    break;
6132 	}
6133         REGTAIL(pRExC_state, lastbr, ender);
6134 
6135 	if (have_branch && !SIZE_ONLY) {
6136 	    if (depth==1)
6137 	        RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6138 
6139 	    /* Hook the tails of the branches to the closing node. */
6140 	    for (br = ret; br; br = regnext(br)) {
6141 		const U8 op = PL_regkind[OP(br)];
6142 		if (op == BRANCH) {
6143                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
6144 		}
6145 		else if (op == BRANCHJ) {
6146                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
6147 		}
6148 	    }
6149 	}
6150     }
6151 
6152     {
6153         const char *p;
6154         static const char parens[] = "=!<,>";
6155 
6156 	if (paren && (p = strchr(parens, paren))) {
6157 	    U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
6158 	    int flag = (p - parens) > 1;
6159 
6160 	    if (paren == '>')
6161 		node = SUSPEND, flag = 0;
6162 	    reginsert(pRExC_state, node,ret, depth+1);
6163 	    Set_Node_Cur_Length(ret);
6164 	    Set_Node_Offset(ret, parse_start + 1);
6165 	    ret->flags = flag;
6166             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
6167 	}
6168     }
6169 
6170     /* Check for proper termination. */
6171     if (paren) {
6172 	RExC_flags = oregflags;
6173 	if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
6174 	    RExC_parse = oregcomp_parse;
6175 	    vFAIL("Unmatched (");
6176 	}
6177     }
6178     else if (!paren && RExC_parse < RExC_end) {
6179 	if (*RExC_parse == ')') {
6180 	    RExC_parse++;
6181 	    vFAIL("Unmatched )");
6182 	}
6183 	else
6184 	    FAIL("Junk on end of regexp");	/* "Can't happen". */
6185 	/* NOTREACHED */
6186     }
6187     if (after_freeze)
6188         RExC_npar = after_freeze;
6189     return(ret);
6190 }
6191 
6192 /*
6193  - regbranch - one alternative of an | operator
6194  *
6195  * Implements the concatenation operator.
6196  */
6197 STATIC regnode *
6198 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
6199 {
6200     dVAR;
6201     register regnode *ret;
6202     register regnode *chain = NULL;
6203     register regnode *latest;
6204     I32 flags = 0, c = 0;
6205     GET_RE_DEBUG_FLAGS_DECL;
6206     DEBUG_PARSE("brnc");
6207 
6208     if (first)
6209 	ret = NULL;
6210     else {
6211 	if (!SIZE_ONLY && RExC_extralen)
6212 	    ret = reganode(pRExC_state, BRANCHJ,0);
6213 	else {
6214 	    ret = reg_node(pRExC_state, BRANCH);
6215             Set_Node_Length(ret, 1);
6216         }
6217     }
6218 
6219     if (!first && SIZE_ONLY)
6220 	RExC_extralen += 1;			/* BRANCHJ */
6221 
6222     *flagp = WORST;			/* Tentatively. */
6223 
6224     RExC_parse--;
6225     nextchar(pRExC_state);
6226     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
6227 	flags &= ~TRYAGAIN;
6228         latest = regpiece(pRExC_state, &flags,depth+1);
6229 	if (latest == NULL) {
6230 	    if (flags & TRYAGAIN)
6231 		continue;
6232 	    return(NULL);
6233 	}
6234 	else if (ret == NULL)
6235 	    ret = latest;
6236 	*flagp |= flags&(HASWIDTH|POSTPONED);
6237 	if (chain == NULL) 	/* First piece. */
6238 	    *flagp |= flags&SPSTART;
6239 	else {
6240 	    RExC_naughty++;
6241             REGTAIL(pRExC_state, chain, latest);
6242 	}
6243 	chain = latest;
6244 	c++;
6245     }
6246     if (chain == NULL) {	/* Loop ran zero times. */
6247 	chain = reg_node(pRExC_state, NOTHING);
6248 	if (ret == NULL)
6249 	    ret = chain;
6250     }
6251     if (c == 1) {
6252 	*flagp |= flags&SIMPLE;
6253     }
6254 
6255     return ret;
6256 }
6257 
6258 /*
6259  - regpiece - something followed by possible [*+?]
6260  *
6261  * Note that the branching code sequences used for ? and the general cases
6262  * of * and + are somewhat optimized:  they use the same NOTHING node as
6263  * both the endmarker for their branch list and the body of the last branch.
6264  * It might seem that this node could be dispensed with entirely, but the
6265  * endmarker role is not redundant.
6266  */
6267 STATIC regnode *
6268 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6269 {
6270     dVAR;
6271     register regnode *ret;
6272     register char op;
6273     register char *next;
6274     I32 flags;
6275     const char * const origparse = RExC_parse;
6276     I32 min;
6277     I32 max = REG_INFTY;
6278     char *parse_start;
6279     const char *maxpos = NULL;
6280     GET_RE_DEBUG_FLAGS_DECL;
6281     DEBUG_PARSE("piec");
6282 
6283     ret = regatom(pRExC_state, &flags,depth+1);
6284     if (ret == NULL) {
6285 	if (flags & TRYAGAIN)
6286 	    *flagp |= TRYAGAIN;
6287 	return(NULL);
6288     }
6289 
6290     op = *RExC_parse;
6291 
6292     if (op == '{' && regcurly(RExC_parse)) {
6293 	maxpos = NULL;
6294         parse_start = RExC_parse; /* MJD */
6295 	next = RExC_parse + 1;
6296 	while (isDIGIT(*next) || *next == ',') {
6297 	    if (*next == ',') {
6298 		if (maxpos)
6299 		    break;
6300 		else
6301 		    maxpos = next;
6302 	    }
6303 	    next++;
6304 	}
6305 	if (*next == '}') {		/* got one */
6306 	    if (!maxpos)
6307 		maxpos = next;
6308 	    RExC_parse++;
6309 	    min = atoi(RExC_parse);
6310 	    if (*maxpos == ',')
6311 		maxpos++;
6312 	    else
6313 		maxpos = RExC_parse;
6314 	    max = atoi(maxpos);
6315 	    if (!max && *maxpos != '0')
6316 		max = REG_INFTY;		/* meaning "infinity" */
6317 	    else if (max >= REG_INFTY)
6318 		vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
6319 	    RExC_parse = next;
6320 	    nextchar(pRExC_state);
6321 
6322 	do_curly:
6323 	    if ((flags&SIMPLE)) {
6324 		RExC_naughty += 2 + RExC_naughty / 2;
6325 		reginsert(pRExC_state, CURLY, ret, depth+1);
6326                 Set_Node_Offset(ret, parse_start+1); /* MJD */
6327                 Set_Node_Cur_Length(ret);
6328 	    }
6329 	    else {
6330 		regnode * const w = reg_node(pRExC_state, WHILEM);
6331 
6332 		w->flags = 0;
6333                 REGTAIL(pRExC_state, ret, w);
6334 		if (!SIZE_ONLY && RExC_extralen) {
6335 		    reginsert(pRExC_state, LONGJMP,ret, depth+1);
6336 		    reginsert(pRExC_state, NOTHING,ret, depth+1);
6337 		    NEXT_OFF(ret) = 3;	/* Go over LONGJMP. */
6338 		}
6339 		reginsert(pRExC_state, CURLYX,ret, depth+1);
6340                                 /* MJD hk */
6341                 Set_Node_Offset(ret, parse_start+1);
6342                 Set_Node_Length(ret,
6343                                 op == '{' ? (RExC_parse - parse_start) : 1);
6344 
6345 		if (!SIZE_ONLY && RExC_extralen)
6346 		    NEXT_OFF(ret) = 3;	/* Go over NOTHING to LONGJMP. */
6347                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
6348 		if (SIZE_ONLY)
6349 		    RExC_whilem_seen++, RExC_extralen += 3;
6350 		RExC_naughty += 4 + RExC_naughty;	/* compound interest */
6351 	    }
6352 	    ret->flags = 0;
6353 
6354 	    if (min > 0)
6355 		*flagp = WORST;
6356 	    if (max > 0)
6357 		*flagp |= HASWIDTH;
6358 	    if (max && max < min)
6359 		vFAIL("Can't do {n,m} with n > m");
6360 	    if (!SIZE_ONLY) {
6361 		ARG1_SET(ret, (U16)min);
6362 		ARG2_SET(ret, (U16)max);
6363 	    }
6364 
6365 	    goto nest_check;
6366 	}
6367     }
6368 
6369     if (!ISMULT1(op)) {
6370 	*flagp = flags;
6371 	return(ret);
6372     }
6373 
6374 #if 0				/* Now runtime fix should be reliable. */
6375 
6376     /* if this is reinstated, don't forget to put this back into perldiag:
6377 
6378 	    =item Regexp *+ operand could be empty at {#} in regex m/%s/
6379 
6380 	   (F) The part of the regexp subject to either the * or + quantifier
6381            could match an empty string. The {#} shows in the regular
6382            expression about where the problem was discovered.
6383 
6384     */
6385 
6386     if (!(flags&HASWIDTH) && op != '?')
6387       vFAIL("Regexp *+ operand could be empty");
6388 #endif
6389 
6390     parse_start = RExC_parse;
6391     nextchar(pRExC_state);
6392 
6393     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
6394 
6395     if (op == '*' && (flags&SIMPLE)) {
6396 	reginsert(pRExC_state, STAR, ret, depth+1);
6397 	ret->flags = 0;
6398 	RExC_naughty += 4;
6399     }
6400     else if (op == '*') {
6401 	min = 0;
6402 	goto do_curly;
6403     }
6404     else if (op == '+' && (flags&SIMPLE)) {
6405 	reginsert(pRExC_state, PLUS, ret, depth+1);
6406 	ret->flags = 0;
6407 	RExC_naughty += 3;
6408     }
6409     else if (op == '+') {
6410 	min = 1;
6411 	goto do_curly;
6412     }
6413     else if (op == '?') {
6414 	min = 0; max = 1;
6415 	goto do_curly;
6416     }
6417   nest_check:
6418     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
6419 	vWARN3(RExC_parse,
6420 	       "%.*s matches null string many times",
6421 	       (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
6422 	       origparse);
6423     }
6424 
6425     if (RExC_parse < RExC_end && *RExC_parse == '?') {
6426 	nextchar(pRExC_state);
6427 	reginsert(pRExC_state, MINMOD, ret, depth+1);
6428         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
6429     }
6430 #ifndef REG_ALLOW_MINMOD_SUSPEND
6431     else
6432 #endif
6433     if (RExC_parse < RExC_end && *RExC_parse == '+') {
6434         regnode *ender;
6435         nextchar(pRExC_state);
6436         ender = reg_node(pRExC_state, SUCCEED);
6437         REGTAIL(pRExC_state, ret, ender);
6438         reginsert(pRExC_state, SUSPEND, ret, depth+1);
6439         ret->flags = 0;
6440         ender = reg_node(pRExC_state, TAIL);
6441         REGTAIL(pRExC_state, ret, ender);
6442         /*ret= ender;*/
6443     }
6444 
6445     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
6446 	RExC_parse++;
6447 	vFAIL("Nested quantifiers");
6448     }
6449 
6450     return(ret);
6451 }
6452 
6453 
6454 /* reg_namedseq(pRExC_state,UVp)
6455 
6456    This is expected to be called by a parser routine that has
6457    recognized'\N' and needs to handle the rest. RExC_parse is
6458    expected to point at the first char following the N at the time
6459    of the call.
6460 
6461    If valuep is non-null then it is assumed that we are parsing inside
6462    of a charclass definition and the first codepoint in the resolved
6463    string is returned via *valuep and the routine will return NULL.
6464    In this mode if a multichar string is returned from the charnames
6465    handler a warning will be issued, and only the first char in the
6466    sequence will be examined. If the string returned is zero length
6467    then the value of *valuep is undefined and NON-NULL will
6468    be returned to indicate failure. (This will NOT be a valid pointer
6469    to a regnode.)
6470 
6471    If value is null then it is assumed that we are parsing normal text
6472    and inserts a new EXACT node into the program containing the resolved
6473    string and returns a pointer to the new node. If the string is
6474    zerolength a NOTHING node is emitted.
6475 
6476    On success RExC_parse is set to the char following the endbrace.
6477    Parsing failures will generate a fatal errorvia vFAIL(...)
6478 
6479    NOTE: We cache all results from the charnames handler locally in
6480    the RExC_charnames hash (created on first use) to prevent a charnames
6481    handler from playing silly-buggers and returning a short string and
6482    then a long string for a given pattern. Since the regexp program
6483    size is calculated during an initial parse this would result
6484    in a buffer overrun so we cache to prevent the charname result from
6485    changing during the course of the parse.
6486 
6487  */
6488 STATIC regnode *
6489 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
6490 {
6491     char * name;        /* start of the content of the name */
6492     char * endbrace;    /* endbrace following the name */
6493     SV *sv_str = NULL;
6494     SV *sv_name = NULL;
6495     STRLEN len; /* this has various purposes throughout the code */
6496     bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
6497     regnode *ret = NULL;
6498 
6499     if (*RExC_parse != '{') {
6500         vFAIL("Missing braces on \\N{}");
6501     }
6502     name = RExC_parse+1;
6503     endbrace = strchr(RExC_parse, '}');
6504     if ( ! endbrace ) {
6505         RExC_parse++;
6506         vFAIL("Missing right brace on \\N{}");
6507     }
6508     RExC_parse = endbrace + 1;
6509 
6510 
6511     /* RExC_parse points at the beginning brace,
6512        endbrace points at the last */
6513     if ( name[0]=='U' && name[1]=='+' ) {
6514         /* its a "Unicode hex" notation {U+89AB} */
6515         I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
6516             | PERL_SCAN_DISALLOW_PREFIX
6517             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6518         UV cp;
6519 	char string;
6520         len = (STRLEN)(endbrace - name - 2);
6521         cp = grok_hex(name + 2, &len, &fl, NULL);
6522         if ( len != (STRLEN)(endbrace - name - 2) ) {
6523             cp = 0xFFFD;
6524         }
6525         if (cp > 0xff)
6526             RExC_utf8 = 1;
6527         if ( valuep ) {
6528             *valuep = cp;
6529             return NULL;
6530         }
6531 	string = (char)cp;
6532         sv_str= newSVpvn(&string, 1);
6533     } else {
6534         /* fetch the charnames handler for this scope */
6535         HV * const table = GvHV(PL_hintgv);
6536         SV **cvp= table ?
6537             hv_fetchs(table, "charnames", FALSE) :
6538             NULL;
6539         SV *cv= cvp ? *cvp : NULL;
6540         HE *he_str;
6541         int count;
6542         /* create an SV with the name as argument */
6543         sv_name = newSVpvn(name, endbrace - name);
6544 
6545         if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
6546             vFAIL2("Constant(\\N{%s}) unknown: "
6547                   "(possibly a missing \"use charnames ...\")",
6548                   SvPVX(sv_name));
6549         }
6550         if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
6551             vFAIL2("Constant(\\N{%s}): "
6552                   "$^H{charnames} is not defined",SvPVX(sv_name));
6553         }
6554 
6555 
6556 
6557         if (!RExC_charnames) {
6558             /* make sure our cache is allocated */
6559             RExC_charnames = newHV();
6560             sv_2mortal((SV*)RExC_charnames);
6561         }
6562             /* see if we have looked this one up before */
6563         he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
6564         if ( he_str ) {
6565             sv_str = HeVAL(he_str);
6566             cached = 1;
6567         } else {
6568             dSP ;
6569 
6570             ENTER ;
6571             SAVETMPS ;
6572             PUSHMARK(SP) ;
6573 
6574             XPUSHs(sv_name);
6575 
6576             PUTBACK ;
6577 
6578             count= call_sv(cv, G_SCALAR);
6579 
6580             if (count == 1) { /* XXXX is this right? dmq */
6581                 sv_str = POPs;
6582                 SvREFCNT_inc_simple_void(sv_str);
6583             }
6584 
6585             SPAGAIN ;
6586             PUTBACK ;
6587             FREETMPS ;
6588             LEAVE ;
6589 
6590             if ( !sv_str || !SvOK(sv_str) ) {
6591                 vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} "
6592                       "did not return a defined value",SvPVX(sv_name));
6593             }
6594             if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
6595                 cached = 1;
6596         }
6597     }
6598     if (valuep) {
6599         char *p = SvPV(sv_str, len);
6600         if (len) {
6601             STRLEN numlen = 1;
6602             if ( SvUTF8(sv_str) ) {
6603                 *valuep = utf8_to_uvchr((U8*)p, &numlen);
6604                 if (*valuep > 0x7F)
6605                     RExC_utf8 = 1;
6606                 /* XXXX
6607                   We have to turn on utf8 for high bit chars otherwise
6608                   we get failures with
6609 
6610                    "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
6611                    "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
6612 
6613                   This is different from what \x{} would do with the same
6614                   codepoint, where the condition is > 0xFF.
6615                   - dmq
6616                 */
6617 
6618 
6619             } else {
6620                 *valuep = (UV)*p;
6621                 /* warn if we havent used the whole string? */
6622             }
6623             if (numlen<len && SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6624                 vWARN2(RExC_parse,
6625                     "Ignoring excess chars from \\N{%s} in character class",
6626                     SvPVX(sv_name)
6627                 );
6628             }
6629         } else if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6630             vWARN2(RExC_parse,
6631                     "Ignoring zero length \\N{%s} in character class",
6632                     SvPVX(sv_name)
6633                 );
6634         }
6635         if (sv_name)
6636             SvREFCNT_dec(sv_name);
6637         if (!cached)
6638             SvREFCNT_dec(sv_str);
6639         return len ? NULL : (regnode *)&len;
6640     } else if(SvCUR(sv_str)) {
6641 
6642         char *s;
6643         char *p, *pend;
6644         STRLEN charlen = 1;
6645 #ifdef DEBUGGING
6646         char * parse_start = name-3; /* needed for the offsets */
6647 #endif
6648         GET_RE_DEBUG_FLAGS_DECL;     /* needed for the offsets */
6649 
6650         ret = reg_node(pRExC_state,
6651             (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6652         s= STRING(ret);
6653 
6654         if ( RExC_utf8 && !SvUTF8(sv_str) ) {
6655             sv_utf8_upgrade(sv_str);
6656         } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
6657             RExC_utf8= 1;
6658         }
6659 
6660         p = SvPV(sv_str, len);
6661         pend = p + len;
6662         /* len is the length written, charlen is the size the char read */
6663         for ( len = 0; p < pend; p += charlen ) {
6664             if (UTF) {
6665                 UV uvc = utf8_to_uvchr((U8*)p, &charlen);
6666                 if (FOLD) {
6667                     STRLEN foldlen,numlen;
6668                     U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
6669                     uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
6670                     /* Emit all the Unicode characters. */
6671 
6672                     for (foldbuf = tmpbuf;
6673                         foldlen;
6674                         foldlen -= numlen)
6675                     {
6676                         uvc = utf8_to_uvchr(foldbuf, &numlen);
6677                         if (numlen > 0) {
6678                             const STRLEN unilen = reguni(pRExC_state, uvc, s);
6679                             s       += unilen;
6680                             len     += unilen;
6681                             /* In EBCDIC the numlen
6682                             * and unilen can differ. */
6683                             foldbuf += numlen;
6684                             if (numlen >= foldlen)
6685                                 break;
6686                         }
6687                         else
6688                             break; /* "Can't happen." */
6689                     }
6690                 } else {
6691                     const STRLEN unilen = reguni(pRExC_state, uvc, s);
6692         	    if (unilen > 0) {
6693         	       s   += unilen;
6694         	       len += unilen;
6695         	    }
6696         	}
6697 	    } else {
6698                 len++;
6699                 REGC(*p, s++);
6700             }
6701         }
6702         if (SIZE_ONLY) {
6703             RExC_size += STR_SZ(len);
6704         } else {
6705             STR_LEN(ret) = len;
6706             RExC_emit += STR_SZ(len);
6707         }
6708         Set_Node_Cur_Length(ret); /* MJD */
6709         RExC_parse--;
6710         nextchar(pRExC_state);
6711     } else {
6712         ret = reg_node(pRExC_state,NOTHING);
6713     }
6714     if (!cached) {
6715         SvREFCNT_dec(sv_str);
6716     }
6717     if (sv_name) {
6718         SvREFCNT_dec(sv_name);
6719     }
6720     return ret;
6721 
6722 }
6723 
6724 
6725 /*
6726  * reg_recode
6727  *
6728  * It returns the code point in utf8 for the value in *encp.
6729  *    value: a code value in the source encoding
6730  *    encp:  a pointer to an Encode object
6731  *
6732  * If the result from Encode is not a single character,
6733  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
6734  */
6735 STATIC UV
6736 S_reg_recode(pTHX_ const char value, SV **encp)
6737 {
6738     STRLEN numlen = 1;
6739     SV * const sv = sv_2mortal(newSVpvn(&value, numlen));
6740     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
6741     const STRLEN newlen = SvCUR(sv);
6742     UV uv = UNICODE_REPLACEMENT;
6743 
6744     if (newlen)
6745 	uv = SvUTF8(sv)
6746 	     ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
6747 	     : *(U8*)s;
6748 
6749     if (!newlen || numlen != newlen) {
6750 	uv = UNICODE_REPLACEMENT;
6751 	*encp = NULL;
6752     }
6753     return uv;
6754 }
6755 
6756 
6757 /*
6758  - regatom - the lowest level
6759 
6760    Try to identify anything special at the start of the pattern. If there
6761    is, then handle it as required. This may involve generating a single regop,
6762    such as for an assertion; or it may involve recursing, such as to
6763    handle a () structure.
6764 
6765    If the string doesn't start with something special then we gobble up
6766    as much literal text as we can.
6767 
6768    Once we have been able to handle whatever type of thing started the
6769    sequence, we return.
6770 
6771    Note: we have to be careful with escapes, as they can be both literal
6772    and special, and in the case of \10 and friends can either, depending
6773    on context. Specifically there are two seperate switches for handling
6774    escape sequences, with the one for handling literal escapes requiring
6775    a dummy entry for all of the special escapes that are actually handled
6776    by the other.
6777 */
6778 
6779 STATIC regnode *
6780 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6781 {
6782     dVAR;
6783     register regnode *ret = NULL;
6784     I32 flags;
6785     char *parse_start = RExC_parse;
6786     GET_RE_DEBUG_FLAGS_DECL;
6787     DEBUG_PARSE("atom");
6788     *flagp = WORST;		/* Tentatively. */
6789 
6790 
6791 tryagain:
6792     switch ((U8)*RExC_parse) {
6793     case '^':
6794 	RExC_seen_zerolen++;
6795 	nextchar(pRExC_state);
6796 	if (RExC_flags & RXf_PMf_MULTILINE)
6797 	    ret = reg_node(pRExC_state, MBOL);
6798 	else if (RExC_flags & RXf_PMf_SINGLELINE)
6799 	    ret = reg_node(pRExC_state, SBOL);
6800 	else
6801 	    ret = reg_node(pRExC_state, BOL);
6802         Set_Node_Length(ret, 1); /* MJD */
6803 	break;
6804     case '$':
6805 	nextchar(pRExC_state);
6806 	if (*RExC_parse)
6807 	    RExC_seen_zerolen++;
6808 	if (RExC_flags & RXf_PMf_MULTILINE)
6809 	    ret = reg_node(pRExC_state, MEOL);
6810 	else if (RExC_flags & RXf_PMf_SINGLELINE)
6811 	    ret = reg_node(pRExC_state, SEOL);
6812 	else
6813 	    ret = reg_node(pRExC_state, EOL);
6814         Set_Node_Length(ret, 1); /* MJD */
6815 	break;
6816     case '.':
6817 	nextchar(pRExC_state);
6818 	if (RExC_flags & RXf_PMf_SINGLELINE)
6819 	    ret = reg_node(pRExC_state, SANY);
6820 	else
6821 	    ret = reg_node(pRExC_state, REG_ANY);
6822 	*flagp |= HASWIDTH|SIMPLE;
6823 	RExC_naughty++;
6824         Set_Node_Length(ret, 1); /* MJD */
6825 	break;
6826     case '[':
6827     {
6828 	char * const oregcomp_parse = ++RExC_parse;
6829         ret = regclass(pRExC_state,depth+1);
6830 	if (*RExC_parse != ']') {
6831 	    RExC_parse = oregcomp_parse;
6832 	    vFAIL("Unmatched [");
6833 	}
6834 	nextchar(pRExC_state);
6835 	*flagp |= HASWIDTH|SIMPLE;
6836         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
6837 	break;
6838     }
6839     case '(':
6840 	nextchar(pRExC_state);
6841         ret = reg(pRExC_state, 1, &flags,depth+1);
6842 	if (ret == NULL) {
6843 		if (flags & TRYAGAIN) {
6844 		    if (RExC_parse == RExC_end) {
6845 			 /* Make parent create an empty node if needed. */
6846 			*flagp |= TRYAGAIN;
6847 			return(NULL);
6848 		    }
6849 		    goto tryagain;
6850 		}
6851 		return(NULL);
6852 	}
6853 	*flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
6854 	break;
6855     case '|':
6856     case ')':
6857 	if (flags & TRYAGAIN) {
6858 	    *flagp |= TRYAGAIN;
6859 	    return NULL;
6860 	}
6861 	vFAIL("Internal urp");
6862 				/* Supposed to be caught earlier. */
6863 	break;
6864     case '{':
6865 	if (!regcurly(RExC_parse)) {
6866 	    RExC_parse++;
6867 	    goto defchar;
6868 	}
6869 	/* FALL THROUGH */
6870     case '?':
6871     case '+':
6872     case '*':
6873 	RExC_parse++;
6874 	vFAIL("Quantifier follows nothing");
6875 	break;
6876     case 0xDF:
6877     case 0xC3:
6878     case 0xCE:
6879         do_foldchar:
6880         if (!LOC && FOLD) {
6881             U32 len,cp;
6882 	    len=0; /* silence a spurious compiler warning */
6883             if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
6884                 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
6885                 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
6886                 ret = reganode(pRExC_state, FOLDCHAR, cp);
6887                 Set_Node_Length(ret, 1); /* MJD */
6888                 nextchar(pRExC_state); /* kill whitespace under /x */
6889                 return ret;
6890             }
6891         }
6892         goto outer_default;
6893     case '\\':
6894 	/* Special Escapes
6895 
6896 	   This switch handles escape sequences that resolve to some kind
6897 	   of special regop and not to literal text. Escape sequnces that
6898 	   resolve to literal text are handled below in the switch marked
6899 	   "Literal Escapes".
6900 
6901 	   Every entry in this switch *must* have a corresponding entry
6902 	   in the literal escape switch. However, the opposite is not
6903 	   required, as the default for this switch is to jump to the
6904 	   literal text handling code.
6905 	*/
6906 	switch ((U8)*++RExC_parse) {
6907 	case 0xDF:
6908 	case 0xC3:
6909 	case 0xCE:
6910 	           goto do_foldchar;
6911 	/* Special Escapes */
6912 	case 'A':
6913 	    RExC_seen_zerolen++;
6914 	    ret = reg_node(pRExC_state, SBOL);
6915 	    *flagp |= SIMPLE;
6916 	    goto finish_meta_pat;
6917 	case 'G':
6918 	    ret = reg_node(pRExC_state, GPOS);
6919 	    RExC_seen |= REG_SEEN_GPOS;
6920 	    *flagp |= SIMPLE;
6921 	    goto finish_meta_pat;
6922 	case 'K':
6923 	    RExC_seen_zerolen++;
6924 	    ret = reg_node(pRExC_state, KEEPS);
6925 	    *flagp |= SIMPLE;
6926 	    /* XXX:dmq : disabling in-place substitution seems to
6927 	     * be necessary here to avoid cases of memory corruption, as
6928 	     * with: C<$_="x" x 80; s/x\K/y/> -- rgs
6929 	     */
6930 	    RExC_seen |= REG_SEEN_LOOKBEHIND;
6931 	    goto finish_meta_pat;
6932 	case 'Z':
6933 	    ret = reg_node(pRExC_state, SEOL);
6934 	    *flagp |= SIMPLE;
6935 	    RExC_seen_zerolen++;		/* Do not optimize RE away */
6936 	    goto finish_meta_pat;
6937 	case 'z':
6938 	    ret = reg_node(pRExC_state, EOS);
6939 	    *flagp |= SIMPLE;
6940 	    RExC_seen_zerolen++;		/* Do not optimize RE away */
6941 	    goto finish_meta_pat;
6942 	case 'C':
6943 	    ret = reg_node(pRExC_state, CANY);
6944 	    RExC_seen |= REG_SEEN_CANY;
6945 	    *flagp |= HASWIDTH|SIMPLE;
6946 	    goto finish_meta_pat;
6947 	case 'X':
6948 	    ret = reg_node(pRExC_state, CLUMP);
6949 	    *flagp |= HASWIDTH;
6950 	    goto finish_meta_pat;
6951 	case 'w':
6952 	    ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML     : ALNUM));
6953 	    *flagp |= HASWIDTH|SIMPLE;
6954 	    goto finish_meta_pat;
6955 	case 'W':
6956 	    ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML    : NALNUM));
6957 	    *flagp |= HASWIDTH|SIMPLE;
6958 	    goto finish_meta_pat;
6959 	case 'b':
6960 	    RExC_seen_zerolen++;
6961 	    RExC_seen |= REG_SEEN_LOOKBEHIND;
6962 	    ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL     : BOUND));
6963 	    *flagp |= SIMPLE;
6964 	    goto finish_meta_pat;
6965 	case 'B':
6966 	    RExC_seen_zerolen++;
6967 	    RExC_seen |= REG_SEEN_LOOKBEHIND;
6968 	    ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL    : NBOUND));
6969 	    *flagp |= SIMPLE;
6970 	    goto finish_meta_pat;
6971 	case 's':
6972 	    ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL     : SPACE));
6973 	    *flagp |= HASWIDTH|SIMPLE;
6974 	    goto finish_meta_pat;
6975 	case 'S':
6976 	    ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL    : NSPACE));
6977 	    *flagp |= HASWIDTH|SIMPLE;
6978 	    goto finish_meta_pat;
6979 	case 'd':
6980 	    ret = reg_node(pRExC_state, DIGIT);
6981 	    *flagp |= HASWIDTH|SIMPLE;
6982 	    goto finish_meta_pat;
6983 	case 'D':
6984 	    ret = reg_node(pRExC_state, NDIGIT);
6985 	    *flagp |= HASWIDTH|SIMPLE;
6986 	    goto finish_meta_pat;
6987 	case 'R':
6988 	    ret = reg_node(pRExC_state, LNBREAK);
6989 	    *flagp |= HASWIDTH|SIMPLE;
6990 	    goto finish_meta_pat;
6991 	case 'h':
6992 	    ret = reg_node(pRExC_state, HORIZWS);
6993 	    *flagp |= HASWIDTH|SIMPLE;
6994 	    goto finish_meta_pat;
6995 	case 'H':
6996 	    ret = reg_node(pRExC_state, NHORIZWS);
6997 	    *flagp |= HASWIDTH|SIMPLE;
6998 	    goto finish_meta_pat;
6999 	case 'v':
7000 	    ret = reg_node(pRExC_state, VERTWS);
7001 	    *flagp |= HASWIDTH|SIMPLE;
7002 	    goto finish_meta_pat;
7003 	case 'V':
7004 	    ret = reg_node(pRExC_state, NVERTWS);
7005 	    *flagp |= HASWIDTH|SIMPLE;
7006          finish_meta_pat:
7007 	    nextchar(pRExC_state);
7008             Set_Node_Length(ret, 2); /* MJD */
7009 	    break;
7010 	case 'p':
7011 	case 'P':
7012 	    {
7013 		char* const oldregxend = RExC_end;
7014 #ifdef DEBUGGING
7015 		char* parse_start = RExC_parse - 2;
7016 #endif
7017 
7018 		if (RExC_parse[1] == '{') {
7019 		  /* a lovely hack--pretend we saw [\pX] instead */
7020 		    RExC_end = strchr(RExC_parse, '}');
7021 		    if (!RExC_end) {
7022 		        const U8 c = (U8)*RExC_parse;
7023 			RExC_parse += 2;
7024 			RExC_end = oldregxend;
7025 			vFAIL2("Missing right brace on \\%c{}", c);
7026 		    }
7027 		    RExC_end++;
7028 		}
7029 		else {
7030 		    RExC_end = RExC_parse + 2;
7031 		    if (RExC_end > oldregxend)
7032 			RExC_end = oldregxend;
7033 		}
7034 		RExC_parse--;
7035 
7036                 ret = regclass(pRExC_state,depth+1);
7037 
7038 		RExC_end = oldregxend;
7039 		RExC_parse--;
7040 
7041 		Set_Node_Offset(ret, parse_start + 2);
7042 		Set_Node_Cur_Length(ret);
7043 		nextchar(pRExC_state);
7044 		*flagp |= HASWIDTH|SIMPLE;
7045 	    }
7046 	    break;
7047         case 'N':
7048             /* Handle \N{NAME} here and not below because it can be
7049             multicharacter. join_exact() will join them up later on.
7050             Also this makes sure that things like /\N{BLAH}+/ and
7051             \N{BLAH} being multi char Just Happen. dmq*/
7052             ++RExC_parse;
7053             ret= reg_namedseq(pRExC_state, NULL);
7054             break;
7055 	case 'k':    /* Handle \k<NAME> and \k'NAME' */
7056 	parse_named_seq:
7057         {
7058             char ch= RExC_parse[1];
7059 	    if (ch != '<' && ch != '\'' && ch != '{') {
7060 	        RExC_parse++;
7061 	        vFAIL2("Sequence %.2s... not terminated",parse_start);
7062 	    } else {
7063 	        /* this pretty much dupes the code for (?P=...) in reg(), if
7064                    you change this make sure you change that */
7065 		char* name_start = (RExC_parse += 2);
7066 		U32 num = 0;
7067                 SV *sv_dat = reg_scan_name(pRExC_state,
7068                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7069                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
7070                 if (RExC_parse == name_start || *RExC_parse != ch)
7071                     vFAIL2("Sequence %.3s... not terminated",parse_start);
7072 
7073                 if (!SIZE_ONLY) {
7074                     num = add_data( pRExC_state, 1, "S" );
7075                     RExC_rxi->data->data[num]=(void*)sv_dat;
7076                     SvREFCNT_inc_simple_void(sv_dat);
7077                 }
7078 
7079                 RExC_sawback = 1;
7080                 ret = reganode(pRExC_state,
7081                 	   (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
7082                 	   num);
7083                 *flagp |= HASWIDTH;
7084 
7085                 /* override incorrect value set in reganode MJD */
7086                 Set_Node_Offset(ret, parse_start+1);
7087                 Set_Node_Cur_Length(ret); /* MJD */
7088                 nextchar(pRExC_state);
7089 
7090             }
7091             break;
7092 	}
7093 	case 'g':
7094 	case '1': case '2': case '3': case '4':
7095 	case '5': case '6': case '7': case '8': case '9':
7096 	    {
7097 		I32 num;
7098 		bool isg = *RExC_parse == 'g';
7099 		bool isrel = 0;
7100 		bool hasbrace = 0;
7101 		if (isg) {
7102 		    RExC_parse++;
7103 		    if (*RExC_parse == '{') {
7104 		        RExC_parse++;
7105 		        hasbrace = 1;
7106 		    }
7107 		    if (*RExC_parse == '-') {
7108 		        RExC_parse++;
7109 		        isrel = 1;
7110 		    }
7111 		    if (hasbrace && !isDIGIT(*RExC_parse)) {
7112 		        if (isrel) RExC_parse--;
7113                         RExC_parse -= 2;
7114 		        goto parse_named_seq;
7115 		}   }
7116 		num = atoi(RExC_parse);
7117 		if (isg && num == 0)
7118 		    vFAIL("Reference to invalid group 0");
7119                 if (isrel) {
7120                     num = RExC_npar - num;
7121                     if (num < 1)
7122                         vFAIL("Reference to nonexistent or unclosed group");
7123                 }
7124 		if (!isg && num > 9 && num >= RExC_npar)
7125 		    goto defchar;
7126 		else {
7127 		    char * const parse_start = RExC_parse - 1; /* MJD */
7128 		    while (isDIGIT(*RExC_parse))
7129 			RExC_parse++;
7130 	            if (parse_start == RExC_parse - 1)
7131 	                vFAIL("Unterminated \\g... pattern");
7132                     if (hasbrace) {
7133                         if (*RExC_parse != '}')
7134                             vFAIL("Unterminated \\g{...} pattern");
7135                         RExC_parse++;
7136                     }
7137 		    if (!SIZE_ONLY) {
7138 		        if (num > (I32)RExC_rx->nparens)
7139 			    vFAIL("Reference to nonexistent group");
7140 		    }
7141 		    RExC_sawback = 1;
7142 		    ret = reganode(pRExC_state,
7143 				   (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
7144 				   num);
7145 		    *flagp |= HASWIDTH;
7146 
7147                     /* override incorrect value set in reganode MJD */
7148                     Set_Node_Offset(ret, parse_start+1);
7149                     Set_Node_Cur_Length(ret); /* MJD */
7150 		    RExC_parse--;
7151 		    nextchar(pRExC_state);
7152 		}
7153 	    }
7154 	    break;
7155 	case '\0':
7156 	    if (RExC_parse >= RExC_end)
7157 		FAIL("Trailing \\");
7158 	    /* FALL THROUGH */
7159 	default:
7160 	    /* Do not generate "unrecognized" warnings here, we fall
7161 	       back into the quick-grab loop below */
7162 	    parse_start--;
7163 	    goto defchar;
7164 	}
7165 	break;
7166 
7167     case '#':
7168 	if (RExC_flags & RXf_PMf_EXTENDED) {
7169 	    if ( reg_skipcomment( pRExC_state ) )
7170 		goto tryagain;
7171 	}
7172 	/* FALL THROUGH */
7173 
7174     default:
7175         outer_default:{
7176 	    register STRLEN len;
7177 	    register UV ender;
7178 	    register char *p;
7179 	    char *s;
7180 	    STRLEN foldlen;
7181 	    U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7182 
7183             parse_start = RExC_parse - 1;
7184 
7185 	    RExC_parse++;
7186 
7187 	defchar:
7188 	    ender = 0;
7189 	    ret = reg_node(pRExC_state,
7190 			   (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
7191 	    s = STRING(ret);
7192 	    for (len = 0, p = RExC_parse - 1;
7193 	      len < 127 && p < RExC_end;
7194 	      len++)
7195 	    {
7196 		char * const oldp = p;
7197 
7198 		if (RExC_flags & RXf_PMf_EXTENDED)
7199 		    p = regwhite( pRExC_state, p );
7200 		switch ((U8)*p) {
7201 		case 0xDF:
7202 		case 0xC3:
7203 		case 0xCE:
7204 		           if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7205 		                goto normal_default;
7206 		case '^':
7207 		case '$':
7208 		case '.':
7209 		case '[':
7210 		case '(':
7211 		case ')':
7212 		case '|':
7213 		    goto loopdone;
7214 		case '\\':
7215 		    /* Literal Escapes Switch
7216 
7217 		       This switch is meant to handle escape sequences that
7218 		       resolve to a literal character.
7219 
7220 		       Every escape sequence that represents something
7221 		       else, like an assertion or a char class, is handled
7222 		       in the switch marked 'Special Escapes' above in this
7223 		       routine, but also has an entry here as anything that
7224 		       isn't explicitly mentioned here will be treated as
7225 		       an unescaped equivalent literal.
7226 		    */
7227 
7228 		    switch ((U8)*++p) {
7229 		    /* These are all the special escapes. */
7230     		    case 0xDF:
7231     		    case 0xC3:
7232     		    case 0xCE:
7233     		           if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7234     		                goto normal_default;
7235 		    case 'A':             /* Start assertion */
7236 		    case 'b': case 'B':   /* Word-boundary assertion*/
7237 		    case 'C':             /* Single char !DANGEROUS! */
7238 		    case 'd': case 'D':   /* digit class */
7239 		    case 'g': case 'G':   /* generic-backref, pos assertion */
7240 		    case 'h': case 'H':   /* HORIZWS */
7241 		    case 'k': case 'K':   /* named backref, keep marker */
7242 		    case 'N':             /* named char sequence */
7243 		    case 'p': case 'P':   /* Unicode property */
7244 		              case 'R':   /* LNBREAK */
7245 		    case 's': case 'S':   /* space class */
7246 		    case 'v': case 'V':   /* VERTWS */
7247 		    case 'w': case 'W':   /* word class */
7248 		    case 'X':             /* eXtended Unicode "combining character sequence" */
7249 		    case 'z': case 'Z':   /* End of line/string assertion */
7250 			--p;
7251 			goto loopdone;
7252 
7253 	            /* Anything after here is an escape that resolves to a
7254 	               literal. (Except digits, which may or may not)
7255 	             */
7256 		    case 'n':
7257 			ender = '\n';
7258 			p++;
7259 			break;
7260 		    case 'r':
7261 			ender = '\r';
7262 			p++;
7263 			break;
7264 		    case 't':
7265 			ender = '\t';
7266 			p++;
7267 			break;
7268 		    case 'f':
7269 			ender = '\f';
7270 			p++;
7271 			break;
7272 		    case 'e':
7273 			  ender = ASCII_TO_NATIVE('\033');
7274 			p++;
7275 			break;
7276 		    case 'a':
7277 			  ender = ASCII_TO_NATIVE('\007');
7278 			p++;
7279 			break;
7280 		    case 'x':
7281 			if (*++p == '{') {
7282 			    char* const e = strchr(p, '}');
7283 
7284 			    if (!e) {
7285 				RExC_parse = p + 1;
7286 				vFAIL("Missing right brace on \\x{}");
7287 			    }
7288 			    else {
7289                                 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7290                                     | PERL_SCAN_DISALLOW_PREFIX;
7291                                 STRLEN numlen = e - p - 1;
7292 				ender = grok_hex(p + 1, &numlen, &flags, NULL);
7293 				if (ender > 0xff)
7294 				    RExC_utf8 = 1;
7295 				p = e + 1;
7296 			    }
7297 			}
7298 			else {
7299                             I32 flags = PERL_SCAN_DISALLOW_PREFIX;
7300 			    STRLEN numlen = 2;
7301 			    ender = grok_hex(p, &numlen, &flags, NULL);
7302 			    p += numlen;
7303 			}
7304 			if (PL_encoding && ender < 0x100)
7305 			    goto recode_encoding;
7306 			break;
7307 		    case 'c':
7308 			p++;
7309 			ender = UCHARAT(p++);
7310 			ender = toCTRL(ender);
7311 			break;
7312 		    case '0': case '1': case '2': case '3':case '4':
7313 		    case '5': case '6': case '7': case '8':case '9':
7314 			if (*p == '0' ||
7315 			  (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
7316                             I32 flags = 0;
7317 			    STRLEN numlen = 3;
7318 			    ender = grok_oct(p, &numlen, &flags, NULL);
7319 			    p += numlen;
7320 			}
7321 			else {
7322 			    --p;
7323 			    goto loopdone;
7324 			}
7325 			if (PL_encoding && ender < 0x100)
7326 			    goto recode_encoding;
7327 			break;
7328 		    recode_encoding:
7329 			{
7330 			    SV* enc = PL_encoding;
7331 			    ender = reg_recode((const char)(U8)ender, &enc);
7332 			    if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
7333 				vWARN(p, "Invalid escape in the specified encoding");
7334 			    RExC_utf8 = 1;
7335 			}
7336 			break;
7337 		    case '\0':
7338 			if (p >= RExC_end)
7339 			    FAIL("Trailing \\");
7340 			/* FALL THROUGH */
7341 		    default:
7342 			if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
7343 			    vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
7344 			goto normal_default;
7345 		    }
7346 		    break;
7347 		default:
7348 		  normal_default:
7349 		    if (UTF8_IS_START(*p) && UTF) {
7350 			STRLEN numlen;
7351 			ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
7352 					       &numlen, UTF8_ALLOW_DEFAULT);
7353 			p += numlen;
7354 		    }
7355 		    else
7356 			ender = *p++;
7357 		    break;
7358 		}
7359 		if ( RExC_flags & RXf_PMf_EXTENDED)
7360 		    p = regwhite( pRExC_state, p );
7361 		if (UTF && FOLD) {
7362 		    /* Prime the casefolded buffer. */
7363 		    ender = toFOLD_uni(ender, tmpbuf, &foldlen);
7364 		}
7365 		if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
7366 		    if (len)
7367 			p = oldp;
7368 		    else if (UTF) {
7369 			 if (FOLD) {
7370 			      /* Emit all the Unicode characters. */
7371 			      STRLEN numlen;
7372 			      for (foldbuf = tmpbuf;
7373 				   foldlen;
7374 				   foldlen -= numlen) {
7375 				   ender = utf8_to_uvchr(foldbuf, &numlen);
7376 				   if (numlen > 0) {
7377 					const STRLEN unilen = reguni(pRExC_state, ender, s);
7378 					s       += unilen;
7379 					len     += unilen;
7380 					/* In EBCDIC the numlen
7381 					 * and unilen can differ. */
7382 					foldbuf += numlen;
7383 					if (numlen >= foldlen)
7384 					     break;
7385 				   }
7386 				   else
7387 					break; /* "Can't happen." */
7388 			      }
7389 			 }
7390 			 else {
7391 			      const STRLEN unilen = reguni(pRExC_state, ender, s);
7392 			      if (unilen > 0) {
7393 				   s   += unilen;
7394 				   len += unilen;
7395 			      }
7396 			 }
7397 		    }
7398 		    else {
7399 			len++;
7400 			REGC((char)ender, s++);
7401 		    }
7402 		    break;
7403 		}
7404 		if (UTF) {
7405 		     if (FOLD) {
7406 		          /* Emit all the Unicode characters. */
7407 			  STRLEN numlen;
7408 			  for (foldbuf = tmpbuf;
7409 			       foldlen;
7410 			       foldlen -= numlen) {
7411 			       ender = utf8_to_uvchr(foldbuf, &numlen);
7412 			       if (numlen > 0) {
7413 				    const STRLEN unilen = reguni(pRExC_state, ender, s);
7414 				    len     += unilen;
7415 				    s       += unilen;
7416 				    /* In EBCDIC the numlen
7417 				     * and unilen can differ. */
7418 				    foldbuf += numlen;
7419 				    if (numlen >= foldlen)
7420 					 break;
7421 			       }
7422 			       else
7423 				    break;
7424 			  }
7425 		     }
7426 		     else {
7427 			  const STRLEN unilen = reguni(pRExC_state, ender, s);
7428 			  if (unilen > 0) {
7429 			       s   += unilen;
7430 			       len += unilen;
7431 			  }
7432 		     }
7433 		     len--;
7434 		}
7435 		else
7436 		    REGC((char)ender, s++);
7437 	    }
7438 	loopdone:
7439 	    RExC_parse = p - 1;
7440             Set_Node_Cur_Length(ret); /* MJD */
7441 	    nextchar(pRExC_state);
7442 	    {
7443 		/* len is STRLEN which is unsigned, need to copy to signed */
7444 		IV iv = len;
7445 		if (iv < 0)
7446 		    vFAIL("Internal disaster");
7447 	    }
7448 	    if (len > 0)
7449 		*flagp |= HASWIDTH;
7450 	    if (len == 1 && UNI_IS_INVARIANT(ender))
7451 		*flagp |= SIMPLE;
7452 
7453 	    if (SIZE_ONLY)
7454 		RExC_size += STR_SZ(len);
7455 	    else {
7456 		STR_LEN(ret) = len;
7457 		RExC_emit += STR_SZ(len);
7458             }
7459 	}
7460 	break;
7461     }
7462 
7463     return(ret);
7464 }
7465 
7466 STATIC char *
7467 S_regwhite( RExC_state_t *pRExC_state, char *p )
7468 {
7469     const char *e = RExC_end;
7470     while (p < e) {
7471 	if (isSPACE(*p))
7472 	    ++p;
7473 	else if (*p == '#') {
7474             bool ended = 0;
7475 	    do {
7476 		if (*p++ == '\n') {
7477 		    ended = 1;
7478 		    break;
7479 		}
7480 	    } while (p < e);
7481 	    if (!ended)
7482 	        RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
7483 	}
7484 	else
7485 	    break;
7486     }
7487     return p;
7488 }
7489 
7490 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
7491    Character classes ([:foo:]) can also be negated ([:^foo:]).
7492    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
7493    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
7494    but trigger failures because they are currently unimplemented. */
7495 
7496 #define POSIXCC_DONE(c)   ((c) == ':')
7497 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
7498 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
7499 
7500 STATIC I32
7501 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
7502 {
7503     dVAR;
7504     I32 namedclass = OOB_NAMEDCLASS;
7505 
7506     if (value == '[' && RExC_parse + 1 < RExC_end &&
7507 	/* I smell either [: or [= or [. -- POSIX has been here, right? */
7508 	POSIXCC(UCHARAT(RExC_parse))) {
7509 	const char c = UCHARAT(RExC_parse);
7510 	char* const s = RExC_parse++;
7511 
7512 	while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
7513 	    RExC_parse++;
7514 	if (RExC_parse == RExC_end)
7515 	    /* Grandfather lone [:, [=, [. */
7516 	    RExC_parse = s;
7517 	else {
7518 	    const char* const t = RExC_parse++; /* skip over the c */
7519 	    assert(*t == c);
7520 
7521   	    if (UCHARAT(RExC_parse) == ']') {
7522 		const char *posixcc = s + 1;
7523   		RExC_parse++; /* skip over the ending ] */
7524 
7525 		if (*s == ':') {
7526 		    const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
7527 		    const I32 skip = t - posixcc;
7528 
7529 		    /* Initially switch on the length of the name.  */
7530 		    switch (skip) {
7531 		    case 4:
7532 			if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
7533 			    namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
7534 			break;
7535 		    case 5:
7536 			/* Names all of length 5.  */
7537 			/* alnum alpha ascii blank cntrl digit graph lower
7538 			   print punct space upper  */
7539 			/* Offset 4 gives the best switch position.  */
7540 			switch (posixcc[4]) {
7541 			case 'a':
7542 			    if (memEQ(posixcc, "alph", 4)) /* alpha */
7543 				namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
7544 			    break;
7545 			case 'e':
7546 			    if (memEQ(posixcc, "spac", 4)) /* space */
7547 				namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
7548 			    break;
7549 			case 'h':
7550 			    if (memEQ(posixcc, "grap", 4)) /* graph */
7551 				namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
7552 			    break;
7553 			case 'i':
7554 			    if (memEQ(posixcc, "asci", 4)) /* ascii */
7555 				namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
7556 			    break;
7557 			case 'k':
7558 			    if (memEQ(posixcc, "blan", 4)) /* blank */
7559 				namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
7560 			    break;
7561 			case 'l':
7562 			    if (memEQ(posixcc, "cntr", 4)) /* cntrl */
7563 				namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
7564 			    break;
7565 			case 'm':
7566 			    if (memEQ(posixcc, "alnu", 4)) /* alnum */
7567 				namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
7568 			    break;
7569 			case 'r':
7570 			    if (memEQ(posixcc, "lowe", 4)) /* lower */
7571 				namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
7572 			    else if (memEQ(posixcc, "uppe", 4)) /* upper */
7573 				namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
7574 			    break;
7575 			case 't':
7576 			    if (memEQ(posixcc, "digi", 4)) /* digit */
7577 				namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
7578 			    else if (memEQ(posixcc, "prin", 4)) /* print */
7579 				namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
7580 			    else if (memEQ(posixcc, "punc", 4)) /* punct */
7581 				namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
7582 			    break;
7583 			}
7584 			break;
7585 		    case 6:
7586 			if (memEQ(posixcc, "xdigit", 6))
7587 			    namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
7588 			break;
7589 		    }
7590 
7591 		    if (namedclass == OOB_NAMEDCLASS)
7592 			Simple_vFAIL3("POSIX class [:%.*s:] unknown",
7593 				      t - s - 1, s + 1);
7594 		    assert (posixcc[skip] == ':');
7595 		    assert (posixcc[skip+1] == ']');
7596 		} else if (!SIZE_ONLY) {
7597 		    /* [[=foo=]] and [[.foo.]] are still future. */
7598 
7599 		    /* adjust RExC_parse so the warning shows after
7600 		       the class closes */
7601 		    while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
7602 			RExC_parse++;
7603 		    Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7604 		}
7605 	    } else {
7606 		/* Maternal grandfather:
7607 		 * "[:" ending in ":" but not in ":]" */
7608 		RExC_parse = s;
7609 	    }
7610 	}
7611     }
7612 
7613     return namedclass;
7614 }
7615 
7616 STATIC void
7617 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
7618 {
7619     dVAR;
7620     if (POSIXCC(UCHARAT(RExC_parse))) {
7621 	const char *s = RExC_parse;
7622 	const char  c = *s++;
7623 
7624 	while (isALNUM(*s))
7625 	    s++;
7626 	if (*s && c == *s && s[1] == ']') {
7627 	    if (ckWARN(WARN_REGEXP))
7628 		vWARN3(s+2,
7629 			"POSIX syntax [%c %c] belongs inside character classes",
7630 			c, c);
7631 
7632 	    /* [[=foo=]] and [[.foo.]] are still future. */
7633 	    if (POSIXCC_NOTYET(c)) {
7634 		/* adjust RExC_parse so the error shows after
7635 		   the class closes */
7636 		while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
7637 		    NOOP;
7638 		Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7639 	    }
7640 	}
7641     }
7642 }
7643 
7644 
7645 #define _C_C_T_(NAME,TEST,WORD)                         \
7646 ANYOF_##NAME:                                           \
7647     if (LOC)                                            \
7648 	ANYOF_CLASS_SET(ret, ANYOF_##NAME);             \
7649     else {                                              \
7650 	for (value = 0; value < 256; value++)           \
7651 	    if (TEST)                                   \
7652 		ANYOF_BITMAP_SET(ret, value);           \
7653     }                                                   \
7654     yesno = '+';                                        \
7655     what = WORD;                                        \
7656     break;                                              \
7657 case ANYOF_N##NAME:                                     \
7658     if (LOC)                                            \
7659 	ANYOF_CLASS_SET(ret, ANYOF_N##NAME);            \
7660     else {                                              \
7661 	for (value = 0; value < 256; value++)           \
7662 	    if (!TEST)                                  \
7663 		ANYOF_BITMAP_SET(ret, value);           \
7664     }                                                   \
7665     yesno = '!';                                        \
7666     what = WORD;                                        \
7667     break
7668 
7669 #define _C_C_T_NOLOC_(NAME,TEST,WORD)                   \
7670 ANYOF_##NAME:                                           \
7671 	for (value = 0; value < 256; value++)           \
7672 	    if (TEST)                                   \
7673 		ANYOF_BITMAP_SET(ret, value);           \
7674     yesno = '+';                                        \
7675     what = WORD;                                        \
7676     break;                                              \
7677 case ANYOF_N##NAME:                                     \
7678 	for (value = 0; value < 256; value++)           \
7679 	    if (!TEST)                                  \
7680 		ANYOF_BITMAP_SET(ret, value);           \
7681     yesno = '!';                                        \
7682     what = WORD;                                        \
7683     break
7684 
7685 /*
7686    parse a class specification and produce either an ANYOF node that
7687    matches the pattern or if the pattern matches a single char only and
7688    that char is < 256 and we are case insensitive then we produce an
7689    EXACT node instead.
7690 */
7691 
7692 STATIC regnode *
7693 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
7694 {
7695     dVAR;
7696     register UV nextvalue;
7697     register IV prevvalue = OOB_UNICODE;
7698     register IV range = 0;
7699     UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
7700     register regnode *ret;
7701     STRLEN numlen;
7702     IV namedclass;
7703     char *rangebegin = NULL;
7704     bool need_class = 0;
7705     SV *listsv = NULL;
7706     UV n;
7707     bool optimize_invert   = TRUE;
7708     AV* unicode_alternate  = NULL;
7709 #ifdef EBCDIC
7710     UV literal_endpoint = 0;
7711 #endif
7712     UV stored = 0;  /* number of chars stored in the class */
7713 
7714     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7715         case we need to change the emitted regop to an EXACT. */
7716     const char * orig_parse = RExC_parse;
7717     GET_RE_DEBUG_FLAGS_DECL;
7718 #ifndef DEBUGGING
7719     PERL_UNUSED_ARG(depth);
7720 #endif
7721 
7722     DEBUG_PARSE("clas");
7723 
7724     /* Assume we are going to generate an ANYOF node. */
7725     ret = reganode(pRExC_state, ANYOF, 0);
7726 
7727     if (!SIZE_ONLY)
7728 	ANYOF_FLAGS(ret) = 0;
7729 
7730     if (UCHARAT(RExC_parse) == '^') {	/* Complement of range. */
7731 	RExC_naughty++;
7732 	RExC_parse++;
7733 	if (!SIZE_ONLY)
7734 	    ANYOF_FLAGS(ret) |= ANYOF_INVERT;
7735     }
7736 
7737     if (SIZE_ONLY) {
7738 	RExC_size += ANYOF_SKIP;
7739 	listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
7740     }
7741     else {
7742  	RExC_emit += ANYOF_SKIP;
7743 	if (FOLD)
7744 	    ANYOF_FLAGS(ret) |= ANYOF_FOLD;
7745 	if (LOC)
7746 	    ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
7747 	ANYOF_BITMAP_ZERO(ret);
7748 	listsv = newSVpvs("# comment\n");
7749     }
7750 
7751     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7752 
7753     if (!SIZE_ONLY && POSIXCC(nextvalue))
7754 	checkposixcc(pRExC_state);
7755 
7756     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
7757     if (UCHARAT(RExC_parse) == ']')
7758 	goto charclassloop;
7759 
7760 parseit:
7761     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
7762 
7763     charclassloop:
7764 
7765 	namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
7766 
7767 	if (!range)
7768 	    rangebegin = RExC_parse;
7769 	if (UTF) {
7770 	    value = utf8n_to_uvchr((U8*)RExC_parse,
7771 				   RExC_end - RExC_parse,
7772 				   &numlen, UTF8_ALLOW_DEFAULT);
7773 	    RExC_parse += numlen;
7774 	}
7775 	else
7776 	    value = UCHARAT(RExC_parse++);
7777 
7778 	nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7779 	if (value == '[' && POSIXCC(nextvalue))
7780 	    namedclass = regpposixcc(pRExC_state, value);
7781 	else if (value == '\\') {
7782 	    if (UTF) {
7783 		value = utf8n_to_uvchr((U8*)RExC_parse,
7784 				   RExC_end - RExC_parse,
7785 				   &numlen, UTF8_ALLOW_DEFAULT);
7786 		RExC_parse += numlen;
7787 	    }
7788 	    else
7789 		value = UCHARAT(RExC_parse++);
7790 	    /* Some compilers cannot handle switching on 64-bit integer
7791 	     * values, therefore value cannot be an UV.  Yes, this will
7792 	     * be a problem later if we want switch on Unicode.
7793 	     * A similar issue a little bit later when switching on
7794 	     * namedclass. --jhi */
7795 	    switch ((I32)value) {
7796 	    case 'w':	namedclass = ANYOF_ALNUM;	break;
7797 	    case 'W':	namedclass = ANYOF_NALNUM;	break;
7798 	    case 's':	namedclass = ANYOF_SPACE;	break;
7799 	    case 'S':	namedclass = ANYOF_NSPACE;	break;
7800 	    case 'd':	namedclass = ANYOF_DIGIT;	break;
7801 	    case 'D':	namedclass = ANYOF_NDIGIT;	break;
7802 	    case 'v':	namedclass = ANYOF_VERTWS;	break;
7803 	    case 'V':	namedclass = ANYOF_NVERTWS;	break;
7804 	    case 'h':	namedclass = ANYOF_HORIZWS;	break;
7805 	    case 'H':	namedclass = ANYOF_NHORIZWS;	break;
7806             case 'N':  /* Handle \N{NAME} in class */
7807                 {
7808                     /* We only pay attention to the first char of
7809                     multichar strings being returned. I kinda wonder
7810                     if this makes sense as it does change the behaviour
7811                     from earlier versions, OTOH that behaviour was broken
7812                     as well. */
7813                     UV v; /* value is register so we cant & it /grrr */
7814                     if (reg_namedseq(pRExC_state, &v)) {
7815                         goto parseit;
7816                     }
7817                     value= v;
7818                 }
7819                 break;
7820 	    case 'p':
7821 	    case 'P':
7822 		{
7823 		char *e;
7824 		if (RExC_parse >= RExC_end)
7825 		    vFAIL2("Empty \\%c{}", (U8)value);
7826 		if (*RExC_parse == '{') {
7827 		    const U8 c = (U8)value;
7828 		    e = strchr(RExC_parse++, '}');
7829                     if (!e)
7830                         vFAIL2("Missing right brace on \\%c{}", c);
7831 		    while (isSPACE(UCHARAT(RExC_parse)))
7832 		        RExC_parse++;
7833                     if (e == RExC_parse)
7834                         vFAIL2("Empty \\%c{}", c);
7835 		    n = e - RExC_parse;
7836 		    while (isSPACE(UCHARAT(RExC_parse + n - 1)))
7837 		        n--;
7838 		}
7839 		else {
7840 		    e = RExC_parse;
7841 		    n = 1;
7842 		}
7843 		if (!SIZE_ONLY) {
7844 		    if (UCHARAT(RExC_parse) == '^') {
7845 			 RExC_parse++;
7846 			 n--;
7847 			 value = value == 'p' ? 'P' : 'p'; /* toggle */
7848 			 while (isSPACE(UCHARAT(RExC_parse))) {
7849 			      RExC_parse++;
7850 			      n--;
7851 			 }
7852 		    }
7853 		    Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
7854 			(value=='p' ? '+' : '!'), (int)n, RExC_parse);
7855 		}
7856 		RExC_parse = e + 1;
7857 		ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7858 		namedclass = ANYOF_MAX;  /* no official name, but it's named */
7859 		}
7860 		break;
7861 	    case 'n':	value = '\n';			break;
7862 	    case 'r':	value = '\r';			break;
7863 	    case 't':	value = '\t';			break;
7864 	    case 'f':	value = '\f';			break;
7865 	    case 'b':	value = '\b';			break;
7866 	    case 'e':	value = ASCII_TO_NATIVE('\033');break;
7867 	    case 'a':	value = ASCII_TO_NATIVE('\007');break;
7868 	    case 'x':
7869 		if (*RExC_parse == '{') {
7870                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7871                         | PERL_SCAN_DISALLOW_PREFIX;
7872 		    char * const e = strchr(RExC_parse++, '}');
7873                     if (!e)
7874                         vFAIL("Missing right brace on \\x{}");
7875 
7876 		    numlen = e - RExC_parse;
7877 		    value = grok_hex(RExC_parse, &numlen, &flags, NULL);
7878 		    RExC_parse = e + 1;
7879 		}
7880 		else {
7881                     I32 flags = PERL_SCAN_DISALLOW_PREFIX;
7882 		    numlen = 2;
7883 		    value = grok_hex(RExC_parse, &numlen, &flags, NULL);
7884 		    RExC_parse += numlen;
7885 		}
7886 		if (PL_encoding && value < 0x100)
7887 		    goto recode_encoding;
7888 		break;
7889 	    case 'c':
7890 		value = UCHARAT(RExC_parse++);
7891 		value = toCTRL(value);
7892 		break;
7893 	    case '0': case '1': case '2': case '3': case '4':
7894 	    case '5': case '6': case '7': case '8': case '9':
7895 		{
7896 		    I32 flags = 0;
7897 		    numlen = 3;
7898 		    value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
7899 		    RExC_parse += numlen;
7900 		    if (PL_encoding && value < 0x100)
7901 			goto recode_encoding;
7902 		    break;
7903 		}
7904 	    recode_encoding:
7905 		{
7906 		    SV* enc = PL_encoding;
7907 		    value = reg_recode((const char)(U8)value, &enc);
7908 		    if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
7909 			vWARN(RExC_parse,
7910 			      "Invalid escape in the specified encoding");
7911 		    break;
7912 		}
7913 	    default:
7914 		if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
7915 		    vWARN2(RExC_parse,
7916 			   "Unrecognized escape \\%c in character class passed through",
7917 			   (int)value);
7918 		break;
7919 	    }
7920 	} /* end of \blah */
7921 #ifdef EBCDIC
7922 	else
7923 	    literal_endpoint++;
7924 #endif
7925 
7926 	if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
7927 
7928 	    if (!SIZE_ONLY && !need_class)
7929 		ANYOF_CLASS_ZERO(ret);
7930 
7931 	    need_class = 1;
7932 
7933 	    /* a bad range like a-\d, a-[:digit:] ? */
7934 	    if (range) {
7935 		if (!SIZE_ONLY) {
7936 		    if (ckWARN(WARN_REGEXP)) {
7937 			const int w =
7938 			    RExC_parse >= rangebegin ?
7939 			    RExC_parse - rangebegin : 0;
7940 			vWARN4(RExC_parse,
7941 			       "False [] range \"%*.*s\"",
7942 			       w, w, rangebegin);
7943 		    }
7944 		    if (prevvalue < 256) {
7945 			ANYOF_BITMAP_SET(ret, prevvalue);
7946 			ANYOF_BITMAP_SET(ret, '-');
7947 		    }
7948 		    else {
7949 			ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7950 			Perl_sv_catpvf(aTHX_ listsv,
7951 				       "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
7952 		    }
7953 		}
7954 
7955 		range = 0; /* this was not a true range */
7956 	    }
7957 
7958 
7959 
7960 	    if (!SIZE_ONLY) {
7961 		const char *what = NULL;
7962 		char yesno = 0;
7963 
7964 	        if (namedclass > OOB_NAMEDCLASS)
7965 		    optimize_invert = FALSE;
7966 		/* Possible truncation here but in some 64-bit environments
7967 		 * the compiler gets heartburn about switch on 64-bit values.
7968 		 * A similar issue a little earlier when switching on value.
7969 		 * --jhi */
7970 		switch ((I32)namedclass) {
7971 		case _C_C_T_(ALNUM, isALNUM(value), "Word");
7972 		case _C_C_T_(ALNUMC, isALNUMC(value), "Alnum");
7973 		case _C_C_T_(ALPHA, isALPHA(value), "Alpha");
7974 		case _C_C_T_(BLANK, isBLANK(value), "Blank");
7975 		case _C_C_T_(CNTRL, isCNTRL(value), "Cntrl");
7976 		case _C_C_T_(GRAPH, isGRAPH(value), "Graph");
7977 		case _C_C_T_(LOWER, isLOWER(value), "Lower");
7978 		case _C_C_T_(PRINT, isPRINT(value), "Print");
7979 		case _C_C_T_(PSXSPC, isPSXSPC(value), "Space");
7980 		case _C_C_T_(PUNCT, isPUNCT(value), "Punct");
7981 		case _C_C_T_(SPACE, isSPACE(value), "SpacePerl");
7982 		case _C_C_T_(UPPER, isUPPER(value), "Upper");
7983 		case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
7984 		case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
7985 		case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
7986 		case ANYOF_ASCII:
7987 		    if (LOC)
7988 			ANYOF_CLASS_SET(ret, ANYOF_ASCII);
7989 		    else {
7990 #ifndef EBCDIC
7991 			for (value = 0; value < 128; value++)
7992 			    ANYOF_BITMAP_SET(ret, value);
7993 #else  /* EBCDIC */
7994 			for (value = 0; value < 256; value++) {
7995 			    if (isASCII(value))
7996 			        ANYOF_BITMAP_SET(ret, value);
7997 			}
7998 #endif /* EBCDIC */
7999 		    }
8000 		    yesno = '+';
8001 		    what = "ASCII";
8002 		    break;
8003 		case ANYOF_NASCII:
8004 		    if (LOC)
8005 			ANYOF_CLASS_SET(ret, ANYOF_NASCII);
8006 		    else {
8007 #ifndef EBCDIC
8008 			for (value = 128; value < 256; value++)
8009 			    ANYOF_BITMAP_SET(ret, value);
8010 #else  /* EBCDIC */
8011 			for (value = 0; value < 256; value++) {
8012 			    if (!isASCII(value))
8013 			        ANYOF_BITMAP_SET(ret, value);
8014 			}
8015 #endif /* EBCDIC */
8016 		    }
8017 		    yesno = '!';
8018 		    what = "ASCII";
8019 		    break;
8020 		case ANYOF_DIGIT:
8021 		    if (LOC)
8022 			ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
8023 		    else {
8024 			/* consecutive digits assumed */
8025 			for (value = '0'; value <= '9'; value++)
8026 			    ANYOF_BITMAP_SET(ret, value);
8027 		    }
8028 		    yesno = '+';
8029 		    what = "Digit";
8030 		    break;
8031 		case ANYOF_NDIGIT:
8032 		    if (LOC)
8033 			ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
8034 		    else {
8035 			/* consecutive digits assumed */
8036 			for (value = 0; value < '0'; value++)
8037 			    ANYOF_BITMAP_SET(ret, value);
8038 			for (value = '9' + 1; value < 256; value++)
8039 			    ANYOF_BITMAP_SET(ret, value);
8040 		    }
8041 		    yesno = '!';
8042 		    what = "Digit";
8043 		    break;
8044 		case ANYOF_MAX:
8045 		    /* this is to handle \p and \P */
8046 		    break;
8047 		default:
8048 		    vFAIL("Invalid [::] class");
8049 		    break;
8050 		}
8051 		if (what) {
8052 		    /* Strings such as "+utf8::isWord\n" */
8053 		    Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
8054 		}
8055 		if (LOC)
8056 		    ANYOF_FLAGS(ret) |= ANYOF_CLASS;
8057 		continue;
8058 	    }
8059 	} /* end of namedclass \blah */
8060 
8061 	if (range) {
8062 	    if (prevvalue > (IV)value) /* b-a */ {
8063 		const int w = RExC_parse - rangebegin;
8064 		Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
8065 		range = 0; /* not a valid range */
8066 	    }
8067 	}
8068 	else {
8069 	    prevvalue = value; /* save the beginning of the range */
8070 	    if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
8071 		RExC_parse[1] != ']') {
8072 		RExC_parse++;
8073 
8074 		/* a bad range like \w-, [:word:]- ? */
8075 		if (namedclass > OOB_NAMEDCLASS) {
8076 		    if (ckWARN(WARN_REGEXP)) {
8077 			const int w =
8078 			    RExC_parse >= rangebegin ?
8079 			    RExC_parse - rangebegin : 0;
8080 			vWARN4(RExC_parse,
8081 			       "False [] range \"%*.*s\"",
8082 			       w, w, rangebegin);
8083 		    }
8084 		    if (!SIZE_ONLY)
8085 			ANYOF_BITMAP_SET(ret, '-');
8086 		} else
8087 		    range = 1;	/* yeah, it's a range! */
8088 		continue;	/* but do it the next time */
8089 	    }
8090 	}
8091 
8092 	/* now is the next time */
8093         /*stored += (value - prevvalue + 1);*/
8094 	if (!SIZE_ONLY) {
8095 	    if (prevvalue < 256) {
8096 	        const IV ceilvalue = value < 256 ? value : 255;
8097 		IV i;
8098 #ifdef EBCDIC
8099 		/* In EBCDIC [\x89-\x91] should include
8100 		 * the \x8e but [i-j] should not. */
8101 		if (literal_endpoint == 2 &&
8102 		    ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
8103 		     (isUPPER(prevvalue) && isUPPER(ceilvalue))))
8104 		{
8105 		    if (isLOWER(prevvalue)) {
8106 			for (i = prevvalue; i <= ceilvalue; i++)
8107 			    if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8108 				stored++;
8109 				ANYOF_BITMAP_SET(ret, i);
8110 			    }
8111 		    } else {
8112 			for (i = prevvalue; i <= ceilvalue; i++)
8113 			    if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8114 				stored++;
8115 				ANYOF_BITMAP_SET(ret, i);
8116 			    }
8117 		    }
8118 		}
8119 		else
8120 #endif
8121 		      for (i = prevvalue; i <= ceilvalue; i++) {
8122 		        if (!ANYOF_BITMAP_TEST(ret,i)) {
8123 		            stored++;
8124 			    ANYOF_BITMAP_SET(ret, i);
8125 		        }
8126 	              }
8127 	  }
8128 	  if (value > 255 || UTF) {
8129 	        const UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
8130 		const UV natvalue      = NATIVE_TO_UNI(value);
8131                 stored+=2; /* can't optimize this class */
8132 		ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8133 		if (prevnatvalue < natvalue) { /* what about > ? */
8134 		    Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
8135 				   prevnatvalue, natvalue);
8136 		}
8137 		else if (prevnatvalue == natvalue) {
8138 		    Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
8139 		    if (FOLD) {
8140 			 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
8141 			 STRLEN foldlen;
8142 			 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
8143 
8144 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */
8145 			 if (RExC_precomp[0] == ':' &&
8146 			     RExC_precomp[1] == '[' &&
8147 			     (f == 0xDF || f == 0x92)) {
8148 			     f = NATIVE_TO_UNI(f);
8149                         }
8150 #endif
8151 			 /* If folding and foldable and a single
8152 			  * character, insert also the folded version
8153 			  * to the charclass. */
8154 			 if (f != value) {
8155 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
8156 			     if ((RExC_precomp[0] == ':' &&
8157 				  RExC_precomp[1] == '[' &&
8158 				  (f == 0xA2 &&
8159 				   (value == 0xFB05 || value == 0xFB06))) ?
8160 				 foldlen == ((STRLEN)UNISKIP(f) - 1) :
8161 				 foldlen == (STRLEN)UNISKIP(f) )
8162 #else
8163 			      if (foldlen == (STRLEN)UNISKIP(f))
8164 #endif
8165 				  Perl_sv_catpvf(aTHX_ listsv,
8166 						 "%04"UVxf"\n", f);
8167 			      else {
8168 				  /* Any multicharacter foldings
8169 				   * require the following transform:
8170 				   * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
8171 				   * where E folds into "pq" and F folds
8172 				   * into "rst", all other characters
8173 				   * fold to single characters.  We save
8174 				   * away these multicharacter foldings,
8175 				   * to be later saved as part of the
8176 				   * additional "s" data. */
8177 				  SV *sv;
8178 
8179 				  if (!unicode_alternate)
8180 				      unicode_alternate = newAV();
8181 				  sv = newSVpvn((char*)foldbuf, foldlen);
8182 				  SvUTF8_on(sv);
8183 				  av_push(unicode_alternate, sv);
8184 			      }
8185 			 }
8186 
8187 			 /* If folding and the value is one of the Greek
8188 			  * sigmas insert a few more sigmas to make the
8189 			  * folding rules of the sigmas to work right.
8190 			  * Note that not all the possible combinations
8191 			  * are handled here: some of them are handled
8192 			  * by the standard folding rules, and some of
8193 			  * them (literal or EXACTF cases) are handled
8194 			  * during runtime in regexec.c:S_find_byclass(). */
8195 			 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
8196 			      Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8197 					     (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
8198 			      Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8199 					     (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8200 			 }
8201 			 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
8202 			      Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8203 					     (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8204 		    }
8205 		}
8206 	    }
8207 #ifdef EBCDIC
8208 	    literal_endpoint = 0;
8209 #endif
8210         }
8211 
8212 	range = 0; /* this range (if it was one) is done now */
8213     }
8214 
8215     if (need_class) {
8216 	ANYOF_FLAGS(ret) |= ANYOF_LARGE;
8217 	if (SIZE_ONLY)
8218 	    RExC_size += ANYOF_CLASS_ADD_SKIP;
8219 	else
8220 	    RExC_emit += ANYOF_CLASS_ADD_SKIP;
8221     }
8222 
8223 
8224     if (SIZE_ONLY)
8225         return ret;
8226     /****** !SIZE_ONLY AFTER HERE *********/
8227 
8228     if( stored == 1 && (value < 128 || (value < 256 && !UTF))
8229         && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
8230     ) {
8231         /* optimize single char class to an EXACT node
8232            but *only* when its not a UTF/high char  */
8233         const char * cur_parse= RExC_parse;
8234         RExC_emit = (regnode *)orig_emit;
8235         RExC_parse = (char *)orig_parse;
8236         ret = reg_node(pRExC_state,
8237                        (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
8238         RExC_parse = (char *)cur_parse;
8239         *STRING(ret)= (char)value;
8240         STR_LEN(ret)= 1;
8241         RExC_emit += STR_SZ(1);
8242         return ret;
8243     }
8244     /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
8245     if ( /* If the only flag is folding (plus possibly inversion). */
8246 	((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
8247        ) {
8248 	for (value = 0; value < 256; ++value) {
8249 	    if (ANYOF_BITMAP_TEST(ret, value)) {
8250 		UV fold = PL_fold[value];
8251 
8252 		if (fold != value)
8253 		    ANYOF_BITMAP_SET(ret, fold);
8254 	    }
8255 	}
8256 	ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
8257     }
8258 
8259     /* optimize inverted simple patterns (e.g. [^a-z]) */
8260     if (optimize_invert &&
8261 	/* If the only flag is inversion. */
8262 	(ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) ==	ANYOF_INVERT) {
8263 	for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
8264 	    ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
8265 	ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
8266     }
8267     {
8268 	AV * const av = newAV();
8269 	SV *rv;
8270 	/* The 0th element stores the character class description
8271 	 * in its textual form: used later (regexec.c:Perl_regclass_swash())
8272 	 * to initialize the appropriate swash (which gets stored in
8273 	 * the 1st element), and also useful for dumping the regnode.
8274 	 * The 2nd element stores the multicharacter foldings,
8275 	 * used later (regexec.c:S_reginclass()). */
8276 	av_store(av, 0, listsv);
8277 	av_store(av, 1, NULL);
8278 	av_store(av, 2, (SV*)unicode_alternate);
8279 	rv = newRV_noinc((SV*)av);
8280 	n = add_data(pRExC_state, 1, "s");
8281 	RExC_rxi->data->data[n] = (void*)rv;
8282 	ARG_SET(ret, n);
8283     }
8284     return ret;
8285 }
8286 #undef _C_C_T_
8287 
8288 
8289 /* reg_skipcomment()
8290 
8291    Absorbs an /x style # comments from the input stream.
8292    Returns true if there is more text remaining in the stream.
8293    Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
8294    terminates the pattern without including a newline.
8295 
8296    Note its the callers responsibility to ensure that we are
8297    actually in /x mode
8298 
8299 */
8300 
8301 STATIC bool
8302 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
8303 {
8304     bool ended = 0;
8305     while (RExC_parse < RExC_end)
8306         if (*RExC_parse++ == '\n') {
8307             ended = 1;
8308             break;
8309         }
8310     if (!ended) {
8311         /* we ran off the end of the pattern without ending
8312            the comment, so we have to add an \n when wrapping */
8313         RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
8314         return 0;
8315     } else
8316         return 1;
8317 }
8318 
8319 /* nextchar()
8320 
8321    Advance that parse position, and optionall absorbs
8322    "whitespace" from the inputstream.
8323 
8324    Without /x "whitespace" means (?#...) style comments only,
8325    with /x this means (?#...) and # comments and whitespace proper.
8326 
8327    Returns the RExC_parse point from BEFORE the scan occurs.
8328 
8329    This is the /x friendly way of saying RExC_parse++.
8330 */
8331 
8332 STATIC char*
8333 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
8334 {
8335     char* const retval = RExC_parse++;
8336 
8337     for (;;) {
8338 	if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
8339 		RExC_parse[2] == '#') {
8340 	    while (*RExC_parse != ')') {
8341 		if (RExC_parse == RExC_end)
8342 		    FAIL("Sequence (?#... not terminated");
8343 		RExC_parse++;
8344 	    }
8345 	    RExC_parse++;
8346 	    continue;
8347 	}
8348 	if (RExC_flags & RXf_PMf_EXTENDED) {
8349 	    if (isSPACE(*RExC_parse)) {
8350 		RExC_parse++;
8351 		continue;
8352 	    }
8353 	    else if (*RExC_parse == '#') {
8354 	        if ( reg_skipcomment( pRExC_state ) )
8355 	            continue;
8356 	    }
8357 	}
8358 	return retval;
8359     }
8360 }
8361 
8362 /*
8363 - reg_node - emit a node
8364 */
8365 STATIC regnode *			/* Location. */
8366 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
8367 {
8368     dVAR;
8369     register regnode *ptr;
8370     regnode * const ret = RExC_emit;
8371     GET_RE_DEBUG_FLAGS_DECL;
8372 
8373     if (SIZE_ONLY) {
8374 	SIZE_ALIGN(RExC_size);
8375 	RExC_size += 1;
8376 	return(ret);
8377     }
8378     if (RExC_emit >= RExC_emit_bound)
8379         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8380 
8381     NODE_ALIGN_FILL(ret);
8382     ptr = ret;
8383     FILL_ADVANCE_NODE(ptr, op);
8384 #ifdef RE_TRACK_PATTERN_OFFSETS
8385     if (RExC_offsets) {         /* MJD */
8386 	MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
8387               "reg_node", __LINE__,
8388               PL_reg_name[op],
8389               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
8390 		? "Overwriting end of array!\n" : "OK",
8391               (UV)(RExC_emit - RExC_emit_start),
8392               (UV)(RExC_parse - RExC_start),
8393               (UV)RExC_offsets[0]));
8394 	Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
8395     }
8396 #endif
8397     RExC_emit = ptr;
8398     return(ret);
8399 }
8400 
8401 /*
8402 - reganode - emit a node with an argument
8403 */
8404 STATIC regnode *			/* Location. */
8405 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
8406 {
8407     dVAR;
8408     register regnode *ptr;
8409     regnode * const ret = RExC_emit;
8410     GET_RE_DEBUG_FLAGS_DECL;
8411 
8412     if (SIZE_ONLY) {
8413 	SIZE_ALIGN(RExC_size);
8414 	RExC_size += 2;
8415 	/*
8416 	   We can't do this:
8417 
8418 	   assert(2==regarglen[op]+1);
8419 
8420 	   Anything larger than this has to allocate the extra amount.
8421 	   If we changed this to be:
8422 
8423 	   RExC_size += (1 + regarglen[op]);
8424 
8425 	   then it wouldn't matter. Its not clear what side effect
8426 	   might come from that so its not done so far.
8427 	   -- dmq
8428 	*/
8429 	return(ret);
8430     }
8431     if (RExC_emit >= RExC_emit_bound)
8432         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8433 
8434     NODE_ALIGN_FILL(ret);
8435     ptr = ret;
8436     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
8437 #ifdef RE_TRACK_PATTERN_OFFSETS
8438     if (RExC_offsets) {         /* MJD */
8439 	MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
8440               "reganode",
8441 	      __LINE__,
8442 	      PL_reg_name[op],
8443               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
8444               "Overwriting end of array!\n" : "OK",
8445               (UV)(RExC_emit - RExC_emit_start),
8446               (UV)(RExC_parse - RExC_start),
8447               (UV)RExC_offsets[0]));
8448 	Set_Cur_Node_Offset;
8449     }
8450 #endif
8451     RExC_emit = ptr;
8452     return(ret);
8453 }
8454 
8455 /*
8456 - reguni - emit (if appropriate) a Unicode character
8457 */
8458 STATIC STRLEN
8459 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
8460 {
8461     dVAR;
8462     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
8463 }
8464 
8465 /*
8466 - reginsert - insert an operator in front of already-emitted operand
8467 *
8468 * Means relocating the operand.
8469 */
8470 STATIC void
8471 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
8472 {
8473     dVAR;
8474     register regnode *src;
8475     register regnode *dst;
8476     register regnode *place;
8477     const int offset = regarglen[(U8)op];
8478     const int size = NODE_STEP_REGNODE + offset;
8479     GET_RE_DEBUG_FLAGS_DECL;
8480     PERL_UNUSED_ARG(depth);
8481 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
8482     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
8483     if (SIZE_ONLY) {
8484 	RExC_size += size;
8485 	return;
8486     }
8487 
8488     src = RExC_emit;
8489     RExC_emit += size;
8490     dst = RExC_emit;
8491     if (RExC_open_parens) {
8492         int paren;
8493         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
8494         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
8495             if ( RExC_open_parens[paren] >= opnd ) {
8496                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
8497                 RExC_open_parens[paren] += size;
8498             } else {
8499                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
8500             }
8501             if ( RExC_close_parens[paren] >= opnd ) {
8502                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
8503                 RExC_close_parens[paren] += size;
8504             } else {
8505                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
8506             }
8507         }
8508     }
8509 
8510     while (src > opnd) {
8511 	StructCopy(--src, --dst, regnode);
8512 #ifdef RE_TRACK_PATTERN_OFFSETS
8513         if (RExC_offsets) {     /* MJD 20010112 */
8514 	    MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
8515                   "reg_insert",
8516 		  __LINE__,
8517 		  PL_reg_name[op],
8518                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
8519 		    ? "Overwriting end of array!\n" : "OK",
8520                   (UV)(src - RExC_emit_start),
8521                   (UV)(dst - RExC_emit_start),
8522                   (UV)RExC_offsets[0]));
8523 	    Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
8524 	    Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
8525         }
8526 #endif
8527     }
8528 
8529 
8530     place = opnd;		/* Op node, where operand used to be. */
8531 #ifdef RE_TRACK_PATTERN_OFFSETS
8532     if (RExC_offsets) {         /* MJD */
8533 	MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
8534               "reginsert",
8535 	      __LINE__,
8536 	      PL_reg_name[op],
8537               (UV)(place - RExC_emit_start) > RExC_offsets[0]
8538               ? "Overwriting end of array!\n" : "OK",
8539               (UV)(place - RExC_emit_start),
8540               (UV)(RExC_parse - RExC_start),
8541               (UV)RExC_offsets[0]));
8542 	Set_Node_Offset(place, RExC_parse);
8543 	Set_Node_Length(place, 1);
8544     }
8545 #endif
8546     src = NEXTOPER(place);
8547     FILL_ADVANCE_NODE(place, op);
8548     Zero(src, offset, regnode);
8549 }
8550 
8551 /*
8552 - regtail - set the next-pointer at the end of a node chain of p to val.
8553 - SEE ALSO: regtail_study
8554 */
8555 /* TODO: All three parms should be const */
8556 STATIC void
8557 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8558 {
8559     dVAR;
8560     register regnode *scan;
8561     GET_RE_DEBUG_FLAGS_DECL;
8562 #ifndef DEBUGGING
8563     PERL_UNUSED_ARG(depth);
8564 #endif
8565 
8566     if (SIZE_ONLY)
8567 	return;
8568 
8569     /* Find last node. */
8570     scan = p;
8571     for (;;) {
8572 	regnode * const temp = regnext(scan);
8573         DEBUG_PARSE_r({
8574             SV * const mysv=sv_newmortal();
8575             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
8576             regprop(RExC_rx, mysv, scan);
8577             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
8578                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
8579                     (temp == NULL ? "->" : ""),
8580                     (temp == NULL ? PL_reg_name[OP(val)] : "")
8581             );
8582         });
8583         if (temp == NULL)
8584             break;
8585         scan = temp;
8586     }
8587 
8588     if (reg_off_by_arg[OP(scan)]) {
8589         ARG_SET(scan, val - scan);
8590     }
8591     else {
8592         NEXT_OFF(scan) = val - scan;
8593     }
8594 }
8595 
8596 #ifdef DEBUGGING
8597 /*
8598 - regtail_study - set the next-pointer at the end of a node chain of p to val.
8599 - Look for optimizable sequences at the same time.
8600 - currently only looks for EXACT chains.
8601 
8602 This is expermental code. The idea is to use this routine to perform
8603 in place optimizations on branches and groups as they are constructed,
8604 with the long term intention of removing optimization from study_chunk so
8605 that it is purely analytical.
8606 
8607 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
8608 to control which is which.
8609 
8610 */
8611 /* TODO: All four parms should be const */
8612 
8613 STATIC U8
8614 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8615 {
8616     dVAR;
8617     register regnode *scan;
8618     U8 exact = PSEUDO;
8619 #ifdef EXPERIMENTAL_INPLACESCAN
8620     I32 min = 0;
8621 #endif
8622 
8623     GET_RE_DEBUG_FLAGS_DECL;
8624 
8625 
8626     if (SIZE_ONLY)
8627         return exact;
8628 
8629     /* Find last node. */
8630 
8631     scan = p;
8632     for (;;) {
8633         regnode * const temp = regnext(scan);
8634 #ifdef EXPERIMENTAL_INPLACESCAN
8635         if (PL_regkind[OP(scan)] == EXACT)
8636             if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
8637                 return EXACT;
8638 #endif
8639         if ( exact ) {
8640             switch (OP(scan)) {
8641                 case EXACT:
8642                 case EXACTF:
8643                 case EXACTFL:
8644                         if( exact == PSEUDO )
8645                             exact= OP(scan);
8646                         else if ( exact != OP(scan) )
8647                             exact= 0;
8648                 case NOTHING:
8649                     break;
8650                 default:
8651                     exact= 0;
8652             }
8653         }
8654         DEBUG_PARSE_r({
8655             SV * const mysv=sv_newmortal();
8656             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
8657             regprop(RExC_rx, mysv, scan);
8658             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
8659                 SvPV_nolen_const(mysv),
8660                 REG_NODE_NUM(scan),
8661                 PL_reg_name[exact]);
8662         });
8663 	if (temp == NULL)
8664 	    break;
8665 	scan = temp;
8666     }
8667     DEBUG_PARSE_r({
8668         SV * const mysv_val=sv_newmortal();
8669         DEBUG_PARSE_MSG("");
8670         regprop(RExC_rx, mysv_val, val);
8671         PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
8672 		      SvPV_nolen_const(mysv_val),
8673 		      (IV)REG_NODE_NUM(val),
8674 		      (IV)(val - scan)
8675         );
8676     });
8677     if (reg_off_by_arg[OP(scan)]) {
8678 	ARG_SET(scan, val - scan);
8679     }
8680     else {
8681 	NEXT_OFF(scan) = val - scan;
8682     }
8683 
8684     return exact;
8685 }
8686 #endif
8687 
8688 /*
8689  - regcurly - a little FSA that accepts {\d+,?\d*}
8690  */
8691 STATIC I32
8692 S_regcurly(register const char *s)
8693 {
8694     if (*s++ != '{')
8695 	return FALSE;
8696     if (!isDIGIT(*s))
8697 	return FALSE;
8698     while (isDIGIT(*s))
8699 	s++;
8700     if (*s == ',')
8701 	s++;
8702     while (isDIGIT(*s))
8703 	s++;
8704     if (*s != '}')
8705 	return FALSE;
8706     return TRUE;
8707 }
8708 
8709 
8710 /*
8711  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
8712  */
8713 #ifdef DEBUGGING
8714 void
8715 S_regdump_extflags(pTHX_ const char *lead, const U32 flags) {
8716     int bit;
8717     int set=0;
8718     for (bit=0; bit<32; bit++) {
8719         if (flags & (1<<bit)) {
8720             if (!set++ && lead)
8721                 PerlIO_printf(Perl_debug_log, "%s",lead);
8722             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
8723         }
8724     }
8725     if (lead)  {
8726         if (set)
8727             PerlIO_printf(Perl_debug_log, "\n");
8728         else
8729             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
8730     }
8731 }
8732 #endif
8733 
8734 void
8735 Perl_regdump(pTHX_ const regexp *r)
8736 {
8737 #ifdef DEBUGGING
8738     dVAR;
8739     SV * const sv = sv_newmortal();
8740     SV *dsv= sv_newmortal();
8741     RXi_GET_DECL(r,ri);
8742     GET_RE_DEBUG_FLAGS_DECL;
8743 
8744     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
8745 
8746     /* Header fields of interest. */
8747     if (r->anchored_substr) {
8748 	RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
8749 	    RE_SV_DUMPLEN(r->anchored_substr), 30);
8750 	PerlIO_printf(Perl_debug_log,
8751 		      "anchored %s%s at %"IVdf" ",
8752 		      s, RE_SV_TAIL(r->anchored_substr),
8753 		      (IV)r->anchored_offset);
8754     } else if (r->anchored_utf8) {
8755 	RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
8756 	    RE_SV_DUMPLEN(r->anchored_utf8), 30);
8757 	PerlIO_printf(Perl_debug_log,
8758 		      "anchored utf8 %s%s at %"IVdf" ",
8759 		      s, RE_SV_TAIL(r->anchored_utf8),
8760 		      (IV)r->anchored_offset);
8761     }
8762     if (r->float_substr) {
8763 	RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
8764 	    RE_SV_DUMPLEN(r->float_substr), 30);
8765 	PerlIO_printf(Perl_debug_log,
8766 		      "floating %s%s at %"IVdf"..%"UVuf" ",
8767 		      s, RE_SV_TAIL(r->float_substr),
8768 		      (IV)r->float_min_offset, (UV)r->float_max_offset);
8769     } else if (r->float_utf8) {
8770 	RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
8771 	    RE_SV_DUMPLEN(r->float_utf8), 30);
8772 	PerlIO_printf(Perl_debug_log,
8773 		      "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
8774 		      s, RE_SV_TAIL(r->float_utf8),
8775 		      (IV)r->float_min_offset, (UV)r->float_max_offset);
8776     }
8777     if (r->check_substr || r->check_utf8)
8778 	PerlIO_printf(Perl_debug_log,
8779 		      (const char *)
8780 		      (r->check_substr == r->float_substr
8781 		       && r->check_utf8 == r->float_utf8
8782 		       ? "(checking floating" : "(checking anchored"));
8783     if (r->extflags & RXf_NOSCAN)
8784 	PerlIO_printf(Perl_debug_log, " noscan");
8785     if (r->extflags & RXf_CHECK_ALL)
8786 	PerlIO_printf(Perl_debug_log, " isall");
8787     if (r->check_substr || r->check_utf8)
8788 	PerlIO_printf(Perl_debug_log, ") ");
8789 
8790     if (ri->regstclass) {
8791 	regprop(r, sv, ri->regstclass);
8792 	PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
8793     }
8794     if (r->extflags & RXf_ANCH) {
8795 	PerlIO_printf(Perl_debug_log, "anchored");
8796 	if (r->extflags & RXf_ANCH_BOL)
8797 	    PerlIO_printf(Perl_debug_log, "(BOL)");
8798 	if (r->extflags & RXf_ANCH_MBOL)
8799 	    PerlIO_printf(Perl_debug_log, "(MBOL)");
8800 	if (r->extflags & RXf_ANCH_SBOL)
8801 	    PerlIO_printf(Perl_debug_log, "(SBOL)");
8802 	if (r->extflags & RXf_ANCH_GPOS)
8803 	    PerlIO_printf(Perl_debug_log, "(GPOS)");
8804 	PerlIO_putc(Perl_debug_log, ' ');
8805     }
8806     if (r->extflags & RXf_GPOS_SEEN)
8807 	PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
8808     if (r->intflags & PREGf_SKIP)
8809 	PerlIO_printf(Perl_debug_log, "plus ");
8810     if (r->intflags & PREGf_IMPLICIT)
8811 	PerlIO_printf(Perl_debug_log, "implicit ");
8812     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
8813     if (r->extflags & RXf_EVAL_SEEN)
8814 	PerlIO_printf(Perl_debug_log, "with eval ");
8815     PerlIO_printf(Perl_debug_log, "\n");
8816     DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
8817 #else
8818     PERL_UNUSED_CONTEXT;
8819     PERL_UNUSED_ARG(r);
8820 #endif	/* DEBUGGING */
8821 }
8822 
8823 /*
8824 - regprop - printable representation of opcode
8825 */
8826 void
8827 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
8828 {
8829 #ifdef DEBUGGING
8830     dVAR;
8831     register int k;
8832     RXi_GET_DECL(prog,progi);
8833     GET_RE_DEBUG_FLAGS_DECL;
8834 
8835 
8836     sv_setpvn(sv, "", 0);
8837 
8838     if (OP(o) > REGNODE_MAX)		/* regnode.type is unsigned */
8839 	/* It would be nice to FAIL() here, but this may be called from
8840 	   regexec.c, and it would be hard to supply pRExC_state. */
8841 	Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
8842     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
8843 
8844     k = PL_regkind[OP(o)];
8845 
8846     if (k == EXACT) {
8847 	sv_catpvs(sv, " ");
8848 	/* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
8849 	 * is a crude hack but it may be the best for now since
8850 	 * we have no flag "this EXACTish node was UTF-8"
8851 	 * --jhi */
8852 	pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
8853 		  PERL_PV_ESCAPE_UNI_DETECT |
8854 		  PERL_PV_PRETTY_ELLIPSES   |
8855 		  PERL_PV_PRETTY_LTGT       |
8856 		  PERL_PV_PRETTY_NOCLEAR
8857 		  );
8858     } else if (k == TRIE) {
8859 	/* print the details of the trie in dumpuntil instead, as
8860 	 * progi->data isn't available here */
8861         const char op = OP(o);
8862         const U32 n = ARG(o);
8863         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
8864                (reg_ac_data *)progi->data->data[n] :
8865                NULL;
8866         const reg_trie_data * const trie
8867 	    = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
8868 
8869         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
8870         DEBUG_TRIE_COMPILE_r(
8871             Perl_sv_catpvf(aTHX_ sv,
8872                 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
8873                 (UV)trie->startstate,
8874                 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
8875                 (UV)trie->wordcount,
8876                 (UV)trie->minlen,
8877                 (UV)trie->maxlen,
8878                 (UV)TRIE_CHARCOUNT(trie),
8879                 (UV)trie->uniquecharcount
8880             )
8881         );
8882         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
8883             int i;
8884             int rangestart = -1;
8885             U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
8886             sv_catpvs(sv, "[");
8887             for (i = 0; i <= 256; i++) {
8888                 if (i < 256 && BITMAP_TEST(bitmap,i)) {
8889                     if (rangestart == -1)
8890                         rangestart = i;
8891                 } else if (rangestart != -1) {
8892                     if (i <= rangestart + 3)
8893                         for (; rangestart < i; rangestart++)
8894                             put_byte(sv, rangestart);
8895                     else {
8896                         put_byte(sv, rangestart);
8897                         sv_catpvs(sv, "-");
8898                         put_byte(sv, i - 1);
8899                     }
8900                     rangestart = -1;
8901                 }
8902             }
8903             sv_catpvs(sv, "]");
8904         }
8905 
8906     } else if (k == CURLY) {
8907 	if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
8908 	    Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
8909 	Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
8910     }
8911     else if (k == WHILEM && o->flags)			/* Ordinal/of */
8912 	Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
8913     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
8914 	Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));	/* Parenth number */
8915 	if ( prog->paren_names ) {
8916             if ( k != REF || OP(o) < NREF) {
8917 	        AV *list= (AV *)progi->data->data[progi->name_list_idx];
8918 	        SV **name= av_fetch(list, ARG(o), 0 );
8919 	        if (name)
8920 	            Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
8921             }
8922             else {
8923                 AV *list= (AV *)progi->data->data[ progi->name_list_idx ];
8924                 SV *sv_dat=(SV*)progi->data->data[ ARG( o ) ];
8925                 I32 *nums=(I32*)SvPVX(sv_dat);
8926                 SV **name= av_fetch(list, nums[0], 0 );
8927                 I32 n;
8928                 if (name) {
8929                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
8930                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
8931 			   	    (n ? "," : ""), (IV)nums[n]);
8932                     }
8933                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
8934                 }
8935             }
8936         }
8937     } else if (k == GOSUB)
8938 	Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o));	/* Paren and offset */
8939     else if (k == VERB) {
8940         if (!o->flags)
8941             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
8942                 SVfARG((SV*)progi->data->data[ ARG( o ) ]));
8943     } else if (k == LOGICAL)
8944 	Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);	/* 2: embedded, otherwise 1 */
8945     else if (k == FOLDCHAR)
8946 	Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
8947     else if (k == ANYOF) {
8948 	int i, rangestart = -1;
8949 	const U8 flags = ANYOF_FLAGS(o);
8950 
8951 	/* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
8952 	static const char * const anyofs[] = {
8953 	    "\\w",
8954 	    "\\W",
8955 	    "\\s",
8956 	    "\\S",
8957 	    "\\d",
8958 	    "\\D",
8959 	    "[:alnum:]",
8960 	    "[:^alnum:]",
8961 	    "[:alpha:]",
8962 	    "[:^alpha:]",
8963 	    "[:ascii:]",
8964 	    "[:^ascii:]",
8965 	    "[:ctrl:]",
8966 	    "[:^ctrl:]",
8967 	    "[:graph:]",
8968 	    "[:^graph:]",
8969 	    "[:lower:]",
8970 	    "[:^lower:]",
8971 	    "[:print:]",
8972 	    "[:^print:]",
8973 	    "[:punct:]",
8974 	    "[:^punct:]",
8975 	    "[:upper:]",
8976 	    "[:^upper:]",
8977 	    "[:xdigit:]",
8978 	    "[:^xdigit:]",
8979 	    "[:space:]",
8980 	    "[:^space:]",
8981 	    "[:blank:]",
8982 	    "[:^blank:]"
8983 	};
8984 
8985 	if (flags & ANYOF_LOCALE)
8986 	    sv_catpvs(sv, "{loc}");
8987 	if (flags & ANYOF_FOLD)
8988 	    sv_catpvs(sv, "{i}");
8989 	Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
8990 	if (flags & ANYOF_INVERT)
8991 	    sv_catpvs(sv, "^");
8992 	for (i = 0; i <= 256; i++) {
8993 	    if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
8994 		if (rangestart == -1)
8995 		    rangestart = i;
8996 	    } else if (rangestart != -1) {
8997 		if (i <= rangestart + 3)
8998 		    for (; rangestart < i; rangestart++)
8999 			put_byte(sv, rangestart);
9000 		else {
9001 		    put_byte(sv, rangestart);
9002 		    sv_catpvs(sv, "-");
9003 		    put_byte(sv, i - 1);
9004 		}
9005 		rangestart = -1;
9006 	    }
9007 	}
9008 
9009 	if (o->flags & ANYOF_CLASS)
9010 	    for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
9011 		if (ANYOF_CLASS_TEST(o,i))
9012 		    sv_catpv(sv, anyofs[i]);
9013 
9014 	if (flags & ANYOF_UNICODE)
9015 	    sv_catpvs(sv, "{unicode}");
9016 	else if (flags & ANYOF_UNICODE_ALL)
9017 	    sv_catpvs(sv, "{unicode_all}");
9018 
9019 	{
9020 	    SV *lv;
9021 	    SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
9022 
9023 	    if (lv) {
9024 		if (sw) {
9025 		    U8 s[UTF8_MAXBYTES_CASE+1];
9026 
9027 		    for (i = 0; i <= 256; i++) { /* just the first 256 */
9028 			uvchr_to_utf8(s, i);
9029 
9030 			if (i < 256 && swash_fetch(sw, s, TRUE)) {
9031 			    if (rangestart == -1)
9032 				rangestart = i;
9033 			} else if (rangestart != -1) {
9034 			    if (i <= rangestart + 3)
9035 				for (; rangestart < i; rangestart++) {
9036 				    const U8 * const e = uvchr_to_utf8(s,rangestart);
9037 				    U8 *p;
9038 				    for(p = s; p < e; p++)
9039 					put_byte(sv, *p);
9040 				}
9041 			    else {
9042 				const U8 *e = uvchr_to_utf8(s,rangestart);
9043 				U8 *p;
9044 				for (p = s; p < e; p++)
9045 				    put_byte(sv, *p);
9046 				sv_catpvs(sv, "-");
9047 				e = uvchr_to_utf8(s, i-1);
9048 				for (p = s; p < e; p++)
9049 				    put_byte(sv, *p);
9050 				}
9051 				rangestart = -1;
9052 			    }
9053 			}
9054 
9055 		    sv_catpvs(sv, "..."); /* et cetera */
9056 		}
9057 
9058 		{
9059 		    char *s = savesvpv(lv);
9060 		    char * const origs = s;
9061 
9062 		    while (*s && *s != '\n')
9063 			s++;
9064 
9065 		    if (*s == '\n') {
9066 			const char * const t = ++s;
9067 
9068 			while (*s) {
9069 			    if (*s == '\n')
9070 				*s = ' ';
9071 			    s++;
9072 			}
9073 			if (s[-1] == ' ')
9074 			    s[-1] = 0;
9075 
9076 			sv_catpv(sv, t);
9077 		    }
9078 
9079 		    Safefree(origs);
9080 		}
9081 	    }
9082 	}
9083 
9084 	Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
9085     }
9086     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
9087 	Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
9088 #else
9089     PERL_UNUSED_CONTEXT;
9090     PERL_UNUSED_ARG(sv);
9091     PERL_UNUSED_ARG(o);
9092     PERL_UNUSED_ARG(prog);
9093 #endif	/* DEBUGGING */
9094 }
9095 
9096 SV *
9097 Perl_re_intuit_string(pTHX_ REGEXP * const prog)
9098 {				/* Assume that RE_INTUIT is set */
9099     dVAR;
9100     GET_RE_DEBUG_FLAGS_DECL;
9101     PERL_UNUSED_CONTEXT;
9102 
9103     DEBUG_COMPILE_r(
9104 	{
9105 	    const char * const s = SvPV_nolen_const(prog->check_substr
9106 		      ? prog->check_substr : prog->check_utf8);
9107 
9108 	    if (!PL_colorset) reginitcolors();
9109 	    PerlIO_printf(Perl_debug_log,
9110 		      "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
9111 		      PL_colors[4],
9112 		      prog->check_substr ? "" : "utf8 ",
9113 		      PL_colors[5],PL_colors[0],
9114 		      s,
9115 		      PL_colors[1],
9116 		      (strlen(s) > 60 ? "..." : ""));
9117 	} );
9118 
9119     return prog->check_substr ? prog->check_substr : prog->check_utf8;
9120 }
9121 
9122 /*
9123    pregfree()
9124 
9125    handles refcounting and freeing the perl core regexp structure. When
9126    it is necessary to actually free the structure the first thing it
9127    does is call the 'free' method of the regexp_engine associated to to
9128    the regexp, allowing the handling of the void *pprivate; member
9129    first. (This routine is not overridable by extensions, which is why
9130    the extensions free is called first.)
9131 
9132    See regdupe and regdupe_internal if you change anything here.
9133 */
9134 #ifndef PERL_IN_XSUB_RE
9135 void
9136 Perl_pregfree(pTHX_ struct regexp *r)
9137 {
9138     dVAR;
9139     GET_RE_DEBUG_FLAGS_DECL;
9140 
9141     if (!r || (--r->refcnt > 0))
9142 	return;
9143     if (r->mother_re) {
9144         ReREFCNT_dec(r->mother_re);
9145     } else {
9146         CALLREGFREE_PVT(r); /* free the private data */
9147         if (r->paren_names)
9148             SvREFCNT_dec(r->paren_names);
9149         Safefree(r->wrapped);
9150     }
9151     if (r->substrs) {
9152         if (r->anchored_substr)
9153             SvREFCNT_dec(r->anchored_substr);
9154         if (r->anchored_utf8)
9155             SvREFCNT_dec(r->anchored_utf8);
9156         if (r->float_substr)
9157             SvREFCNT_dec(r->float_substr);
9158         if (r->float_utf8)
9159             SvREFCNT_dec(r->float_utf8);
9160 	Safefree(r->substrs);
9161     }
9162     RX_MATCH_COPY_FREE(r);
9163 #ifdef PERL_OLD_COPY_ON_WRITE
9164     if (r->saved_copy)
9165         SvREFCNT_dec(r->saved_copy);
9166 #endif
9167     Safefree(r->swap);
9168     Safefree(r->offs);
9169     Safefree(r);
9170 }
9171 
9172 /*  reg_temp_copy()
9173 
9174     This is a hacky workaround to the structural issue of match results
9175     being stored in the regexp structure which is in turn stored in
9176     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
9177     could be PL_curpm in multiple contexts, and could require multiple
9178     result sets being associated with the pattern simultaneously, such
9179     as when doing a recursive match with (??{$qr})
9180 
9181     The solution is to make a lightweight copy of the regexp structure
9182     when a qr// is returned from the code executed by (??{$qr}) this
9183     lightweight copy doesnt actually own any of its data except for
9184     the starp/end and the actual regexp structure itself.
9185 
9186 */
9187 
9188 
9189 regexp *
9190 Perl_reg_temp_copy (pTHX_ struct regexp *r) {
9191     regexp *ret;
9192     register const I32 npar = r->nparens+1;
9193     (void)ReREFCNT_inc(r);
9194     Newx(ret, 1, regexp);
9195     StructCopy(r, ret, regexp);
9196     Newx(ret->offs, npar, regexp_paren_pair);
9197     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9198     ret->refcnt = 1;
9199     if (r->substrs) {
9200         Newx(ret->substrs, 1, struct reg_substr_data);
9201 	StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9202 
9203 	SvREFCNT_inc_void(ret->anchored_substr);
9204 	SvREFCNT_inc_void(ret->anchored_utf8);
9205 	SvREFCNT_inc_void(ret->float_substr);
9206 	SvREFCNT_inc_void(ret->float_utf8);
9207 
9208 	/* check_substr and check_utf8, if non-NULL, point to either their
9209 	   anchored or float namesakes, and don't hold a second reference.  */
9210     }
9211     RX_MATCH_COPIED_off(ret);
9212 #ifdef PERL_OLD_COPY_ON_WRITE
9213     ret->saved_copy = NULL;
9214 #endif
9215     ret->mother_re = r;
9216     ret->swap = NULL;
9217 
9218     return ret;
9219 }
9220 #endif
9221 
9222 /* regfree_internal()
9223 
9224    Free the private data in a regexp. This is overloadable by
9225    extensions. Perl takes care of the regexp structure in pregfree(),
9226    this covers the *pprivate pointer which technically perldoesnt
9227    know about, however of course we have to handle the
9228    regexp_internal structure when no extension is in use.
9229 
9230    Note this is called before freeing anything in the regexp
9231    structure.
9232  */
9233 
9234 void
9235 Perl_regfree_internal(pTHX_ REGEXP * const r)
9236 {
9237     dVAR;
9238     RXi_GET_DECL(r,ri);
9239     GET_RE_DEBUG_FLAGS_DECL;
9240 
9241     DEBUG_COMPILE_r({
9242 	if (!PL_colorset)
9243 	    reginitcolors();
9244 	{
9245 	    SV *dsv= sv_newmortal();
9246             RE_PV_QUOTED_DECL(s, (r->extflags & RXf_UTF8),
9247                 dsv, r->precomp, r->prelen, 60);
9248             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
9249                 PL_colors[4],PL_colors[5],s);
9250         }
9251     });
9252 #ifdef RE_TRACK_PATTERN_OFFSETS
9253     if (ri->u.offsets)
9254         Safefree(ri->u.offsets);             /* 20010421 MJD */
9255 #endif
9256     if (ri->data) {
9257 	int n = ri->data->count;
9258 	PAD* new_comppad = NULL;
9259 	PAD* old_comppad;
9260 	PADOFFSET refcnt;
9261 
9262 	while (--n >= 0) {
9263           /* If you add a ->what type here, update the comment in regcomp.h */
9264 	    switch (ri->data->what[n]) {
9265 	    case 's':
9266 	    case 'S':
9267 	    case 'u':
9268 		SvREFCNT_dec((SV*)ri->data->data[n]);
9269 		break;
9270 	    case 'f':
9271 		Safefree(ri->data->data[n]);
9272 		break;
9273 	    case 'p':
9274 		new_comppad = (AV*)ri->data->data[n];
9275 		break;
9276 	    case 'o':
9277 		if (new_comppad == NULL)
9278 		    Perl_croak(aTHX_ "panic: pregfree comppad");
9279 		PAD_SAVE_LOCAL(old_comppad,
9280 		    /* Watch out for global destruction's random ordering. */
9281 		    (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
9282 		);
9283 		OP_REFCNT_LOCK;
9284 		refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
9285 		OP_REFCNT_UNLOCK;
9286 		if (!refcnt)
9287                     op_free((OP_4tree*)ri->data->data[n]);
9288 
9289 		PAD_RESTORE_LOCAL(old_comppad);
9290 		SvREFCNT_dec((SV*)new_comppad);
9291 		new_comppad = NULL;
9292 		break;
9293 	    case 'n':
9294 	        break;
9295             case 'T':
9296                 { /* Aho Corasick add-on structure for a trie node.
9297                      Used in stclass optimization only */
9298                     U32 refcount;
9299                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
9300                     OP_REFCNT_LOCK;
9301                     refcount = --aho->refcount;
9302                     OP_REFCNT_UNLOCK;
9303                     if ( !refcount ) {
9304                         PerlMemShared_free(aho->states);
9305                         PerlMemShared_free(aho->fail);
9306 			 /* do this last!!!! */
9307                         PerlMemShared_free(ri->data->data[n]);
9308                         PerlMemShared_free(ri->regstclass);
9309                     }
9310                 }
9311                 break;
9312 	    case 't':
9313 	        {
9314 	            /* trie structure. */
9315 	            U32 refcount;
9316 	            reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
9317                     OP_REFCNT_LOCK;
9318                     refcount = --trie->refcount;
9319                     OP_REFCNT_UNLOCK;
9320                     if ( !refcount ) {
9321                         PerlMemShared_free(trie->charmap);
9322                         PerlMemShared_free(trie->states);
9323                         PerlMemShared_free(trie->trans);
9324                         if (trie->bitmap)
9325                             PerlMemShared_free(trie->bitmap);
9326                         if (trie->wordlen)
9327                             PerlMemShared_free(trie->wordlen);
9328                         if (trie->jump)
9329                             PerlMemShared_free(trie->jump);
9330                         if (trie->nextword)
9331                             PerlMemShared_free(trie->nextword);
9332                         /* do this last!!!! */
9333                         PerlMemShared_free(ri->data->data[n]);
9334 		    }
9335 		}
9336 		break;
9337 	    default:
9338 		Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
9339 	    }
9340 	}
9341 	Safefree(ri->data->what);
9342 	Safefree(ri->data);
9343     }
9344 
9345     Safefree(ri);
9346 }
9347 
9348 #define sv_dup_inc(s,t)	SvREFCNT_inc(sv_dup(s,t))
9349 #define av_dup_inc(s,t)	(AV*)SvREFCNT_inc(sv_dup((SV*)s,t))
9350 #define hv_dup_inc(s,t)	(HV*)SvREFCNT_inc(sv_dup((SV*)s,t))
9351 #define SAVEPVN(p,n)	((p) ? savepvn(p,n) : NULL)
9352 
9353 /*
9354    re_dup - duplicate a regexp.
9355 
9356    This routine is expected to clone a given regexp structure. It is not
9357    compiler under USE_ITHREADS.
9358 
9359    After all of the core data stored in struct regexp is duplicated
9360    the regexp_engine.dupe method is used to copy any private data
9361    stored in the *pprivate pointer. This allows extensions to handle
9362    any duplication it needs to do.
9363 
9364    See pregfree() and regfree_internal() if you change anything here.
9365 */
9366 #if defined(USE_ITHREADS)
9367 #ifndef PERL_IN_XSUB_RE
9368 regexp *
9369 Perl_re_dup(pTHX_ const regexp *r, CLONE_PARAMS *param)
9370 {
9371     dVAR;
9372     regexp *ret;
9373     I32 npar;
9374 
9375     if (!r)
9376 	return (REGEXP *)NULL;
9377 
9378     if ((ret = (REGEXP *)ptr_table_fetch(PL_ptr_table, r)))
9379 	return ret;
9380 
9381 
9382     npar = r->nparens+1;
9383     Newx(ret, 1, regexp);
9384     StructCopy(r, ret, regexp);
9385     Newx(ret->offs, npar, regexp_paren_pair);
9386     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9387     if(ret->swap) {
9388         /* no need to copy these */
9389         Newx(ret->swap, npar, regexp_paren_pair);
9390     }
9391 
9392     if (ret->substrs) {
9393 	/* Do it this way to avoid reading from *r after the StructCopy().
9394 	   That way, if any of the sv_dup_inc()s dislodge *r from the L1
9395 	   cache, it doesn't matter.  */
9396 	const bool anchored = r->check_substr == r->anchored_substr;
9397         Newx(ret->substrs, 1, struct reg_substr_data);
9398 	StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9399 
9400 	ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
9401 	ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
9402 	ret->float_substr = sv_dup_inc(ret->float_substr, param);
9403 	ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
9404 
9405 	/* check_substr and check_utf8, if non-NULL, point to either their
9406 	   anchored or float namesakes, and don't hold a second reference.  */
9407 
9408 	if (ret->check_substr) {
9409 	    if (anchored) {
9410 		assert(r->check_utf8 == r->anchored_utf8);
9411 		ret->check_substr = ret->anchored_substr;
9412 		ret->check_utf8 = ret->anchored_utf8;
9413 	    } else {
9414 		assert(r->check_substr == r->float_substr);
9415 		assert(r->check_utf8 == r->float_utf8);
9416 		ret->check_substr = ret->float_substr;
9417 		ret->check_utf8 = ret->float_utf8;
9418 	    }
9419 	}
9420     }
9421 
9422     ret->wrapped        = SAVEPVN(ret->wrapped, ret->wraplen+1);
9423     ret->precomp        = ret->wrapped + (ret->precomp - ret->wrapped);
9424     ret->paren_names    = hv_dup_inc(ret->paren_names, param);
9425 
9426     if (ret->pprivate)
9427 	RXi_SET(ret,CALLREGDUPE_PVT(ret,param));
9428 
9429     if (RX_MATCH_COPIED(ret))
9430 	ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
9431     else
9432 	ret->subbeg = NULL;
9433 #ifdef PERL_OLD_COPY_ON_WRITE
9434     ret->saved_copy = NULL;
9435 #endif
9436 
9437     ret->mother_re      = NULL;
9438     ret->gofs = 0;
9439     ret->seen_evals = 0;
9440 
9441     ptr_table_store(PL_ptr_table, r, ret);
9442     return ret;
9443 }
9444 #endif /* PERL_IN_XSUB_RE */
9445 
9446 /*
9447    regdupe_internal()
9448 
9449    This is the internal complement to regdupe() which is used to copy
9450    the structure pointed to by the *pprivate pointer in the regexp.
9451    This is the core version of the extension overridable cloning hook.
9452    The regexp structure being duplicated will be copied by perl prior
9453    to this and will be provided as the regexp *r argument, however
9454    with the /old/ structures pprivate pointer value. Thus this routine
9455    may override any copying normally done by perl.
9456 
9457    It returns a pointer to the new regexp_internal structure.
9458 */
9459 
9460 void *
9461 Perl_regdupe_internal(pTHX_ REGEXP * const r, CLONE_PARAMS *param)
9462 {
9463     dVAR;
9464     regexp_internal *reti;
9465     int len, npar;
9466     RXi_GET_DECL(r,ri);
9467 
9468     npar = r->nparens+1;
9469     len = ProgLen(ri);
9470 
9471     Newxc(reti, sizeof(regexp_internal) + (len+1)*sizeof(regnode), char, regexp_internal);
9472     Copy(ri->program, reti->program, len+1, regnode);
9473 
9474 
9475     reti->regstclass = NULL;
9476 
9477     if (ri->data) {
9478 	struct reg_data *d;
9479         const int count = ri->data->count;
9480 	int i;
9481 
9482 	Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
9483 		char, struct reg_data);
9484 	Newx(d->what, count, U8);
9485 
9486 	d->count = count;
9487 	for (i = 0; i < count; i++) {
9488 	    d->what[i] = ri->data->what[i];
9489 	    switch (d->what[i]) {
9490 	        /* legal options are one of: sSfpontTu
9491 	           see also regcomp.h and pregfree() */
9492 	    case 's':
9493 	    case 'S':
9494 	    case 'p': /* actually an AV, but the dup function is identical.  */
9495 	    case 'u': /* actually an HV, but the dup function is identical.  */
9496 		d->data[i] = sv_dup_inc((SV *)ri->data->data[i], param);
9497 		break;
9498 	    case 'f':
9499 		/* This is cheating. */
9500 		Newx(d->data[i], 1, struct regnode_charclass_class);
9501 		StructCopy(ri->data->data[i], d->data[i],
9502 			    struct regnode_charclass_class);
9503 		reti->regstclass = (regnode*)d->data[i];
9504 		break;
9505 	    case 'o':
9506 		/* Compiled op trees are readonly and in shared memory,
9507 		   and can thus be shared without duplication. */
9508 		OP_REFCNT_LOCK;
9509 		d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
9510 		OP_REFCNT_UNLOCK;
9511 		break;
9512 	    case 'T':
9513 		/* Trie stclasses are readonly and can thus be shared
9514 		 * without duplication. We free the stclass in pregfree
9515 		 * when the corresponding reg_ac_data struct is freed.
9516 		 */
9517 		reti->regstclass= ri->regstclass;
9518 		/* Fall through */
9519 	    case 't':
9520 		OP_REFCNT_LOCK;
9521 		((reg_trie_data*)ri->data->data[i])->refcount++;
9522 		OP_REFCNT_UNLOCK;
9523 		/* Fall through */
9524 	    case 'n':
9525 		d->data[i] = ri->data->data[i];
9526 		break;
9527             default:
9528 		Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
9529 	    }
9530 	}
9531 
9532 	reti->data = d;
9533     }
9534     else
9535 	reti->data = NULL;
9536 
9537     reti->name_list_idx = ri->name_list_idx;
9538 
9539 #ifdef RE_TRACK_PATTERN_OFFSETS
9540     if (ri->u.offsets) {
9541         Newx(reti->u.offsets, 2*len+1, U32);
9542         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
9543     }
9544 #else
9545     SetProgLen(reti,len);
9546 #endif
9547 
9548     return (void*)reti;
9549 }
9550 
9551 #endif    /* USE_ITHREADS */
9552 
9553 /*
9554    reg_stringify()
9555 
9556    converts a regexp embedded in a MAGIC struct to its stringified form,
9557    caching the converted form in the struct and returns the cached
9558    string.
9559 
9560    If lp is nonnull then it is used to return the length of the
9561    resulting string
9562 
9563    If flags is nonnull and the returned string contains UTF8 then
9564    (*flags & 1) will be true.
9565 
9566    If haseval is nonnull then it is used to return whether the pattern
9567    contains evals.
9568 
9569    Normally called via macro:
9570 
9571         CALLREG_STRINGIFY(mg,&len,&utf8);
9572 
9573    And internally with
9574 
9575         CALLREG_AS_STR(mg,&lp,&flags,&haseval)
9576 
9577    See sv_2pv_flags() in sv.c for an example of internal usage.
9578 
9579  */
9580 #ifndef PERL_IN_XSUB_RE
9581 
9582 char *
9583 Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) {
9584     dVAR;
9585     const regexp * const re = (regexp *)mg->mg_obj;
9586     if (haseval)
9587         *haseval = re->seen_evals;
9588     if (flags)
9589 	*flags = ((re->extflags & RXf_UTF8) ? 1 : 0);
9590     if (lp)
9591 	*lp = re->wraplen;
9592     return re->wrapped;
9593 }
9594 
9595 /*
9596  - regnext - dig the "next" pointer out of a node
9597  */
9598 regnode *
9599 Perl_regnext(pTHX_ register regnode *p)
9600 {
9601     dVAR;
9602     register I32 offset;
9603 
9604     if (!p)
9605 	return(NULL);
9606 
9607     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
9608     if (offset == 0)
9609 	return(NULL);
9610 
9611     return(p+offset);
9612 }
9613 #endif
9614 
9615 STATIC void
9616 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
9617 {
9618     va_list args;
9619     STRLEN l1 = strlen(pat1);
9620     STRLEN l2 = strlen(pat2);
9621     char buf[512];
9622     SV *msv;
9623     const char *message;
9624 
9625     if (l1 > 510)
9626 	l1 = 510;
9627     if (l1 + l2 > 510)
9628 	l2 = 510 - l1;
9629     Copy(pat1, buf, l1 , char);
9630     Copy(pat2, buf + l1, l2 , char);
9631     buf[l1 + l2] = '\n';
9632     buf[l1 + l2 + 1] = '\0';
9633 #ifdef I_STDARG
9634     /* ANSI variant takes additional second argument */
9635     va_start(args, pat2);
9636 #else
9637     va_start(args);
9638 #endif
9639     msv = vmess(buf, &args);
9640     va_end(args);
9641     message = SvPV_const(msv,l1);
9642     if (l1 > 512)
9643 	l1 = 512;
9644     Copy(message, buf, l1 , char);
9645     buf[l1-1] = '\0';			/* Overwrite \n */
9646     Perl_croak(aTHX_ "%s", buf);
9647 }
9648 
9649 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
9650 
9651 #ifndef PERL_IN_XSUB_RE
9652 void
9653 Perl_save_re_context(pTHX)
9654 {
9655     dVAR;
9656 
9657     struct re_save_state *state;
9658 
9659     SAVEVPTR(PL_curcop);
9660     SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
9661 
9662     state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
9663     PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
9664     SSPUSHINT(SAVEt_RE_STATE);
9665 
9666     Copy(&PL_reg_state, state, 1, struct re_save_state);
9667 
9668     PL_reg_start_tmp = 0;
9669     PL_reg_start_tmpl = 0;
9670     PL_reg_oldsaved = NULL;
9671     PL_reg_oldsavedlen = 0;
9672     PL_reg_maxiter = 0;
9673     PL_reg_leftiter = 0;
9674     PL_reg_poscache = NULL;
9675     PL_reg_poscache_size = 0;
9676 #ifdef PERL_OLD_COPY_ON_WRITE
9677     PL_nrs = NULL;
9678 #endif
9679 
9680     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
9681     if (PL_curpm) {
9682 	const REGEXP * const rx = PM_GETRE(PL_curpm);
9683 	if (rx) {
9684 	    U32 i;
9685 	    for (i = 1; i <= rx->nparens; i++) {
9686 		char digits[TYPE_CHARS(long)];
9687 		const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
9688 		GV *const *const gvp
9689 		    = (GV**)hv_fetch(PL_defstash, digits, len, 0);
9690 
9691 		if (gvp) {
9692 		    GV * const gv = *gvp;
9693 		    if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
9694 			save_scalar(gv);
9695 		}
9696 	    }
9697 	}
9698     }
9699 }
9700 #endif
9701 
9702 static void
9703 clear_re(pTHX_ void *r)
9704 {
9705     dVAR;
9706     ReREFCNT_dec((regexp *)r);
9707 }
9708 
9709 #ifdef DEBUGGING
9710 
9711 STATIC void
9712 S_put_byte(pTHX_ SV *sv, int c)
9713 {
9714     /* Our definition of isPRINT() ignores locales, so only bytes that are
9715        not part of UTF-8 are considered printable. I assume that the same
9716        holds for UTF-EBCDIC.
9717        Also, code point 255 is not printable in either (it's E0 in EBCDIC,
9718        which Wikipedia says:
9719 
9720        EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
9721        ones (binary 1111 1111, hexadecimal FF). It is similar, but not
9722        identical, to the ASCII delete (DEL) or rubout control character.
9723        ) So the old condition can be simplified to !isPRINT(c)  */
9724     if (!isPRINT(c))
9725 	Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
9726     else {
9727 	const char string = c;
9728 	if (c == '-' || c == ']' || c == '\\' || c == '^')
9729 	    sv_catpvs(sv, "\\");
9730 	sv_catpvn(sv, &string, 1);
9731     }
9732 }
9733 
9734 
9735 #define CLEAR_OPTSTART \
9736     if (optstart) STMT_START { \
9737 	    DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
9738 	    optstart=NULL; \
9739     } STMT_END
9740 
9741 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
9742 
9743 STATIC const regnode *
9744 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
9745 	    const regnode *last, const regnode *plast,
9746 	    SV* sv, I32 indent, U32 depth)
9747 {
9748     dVAR;
9749     register U8 op = PSEUDO;	/* Arbitrary non-END op. */
9750     register const regnode *next;
9751     const regnode *optstart= NULL;
9752 
9753     RXi_GET_DECL(r,ri);
9754     GET_RE_DEBUG_FLAGS_DECL;
9755 
9756 #ifdef DEBUG_DUMPUNTIL
9757     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
9758         last ? last-start : 0,plast ? plast-start : 0);
9759 #endif
9760 
9761     if (plast && plast < last)
9762         last= plast;
9763 
9764     while (PL_regkind[op] != END && (!last || node < last)) {
9765 	/* While that wasn't END last time... */
9766 	NODE_ALIGN(node);
9767 	op = OP(node);
9768 	if (op == CLOSE || op == WHILEM)
9769 	    indent--;
9770 	next = regnext((regnode *)node);
9771 
9772 	/* Where, what. */
9773 	if (OP(node) == OPTIMIZED) {
9774 	    if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
9775 	        optstart = node;
9776 	    else
9777 		goto after_print;
9778 	} else
9779 	    CLEAR_OPTSTART;
9780 
9781 	regprop(r, sv, node);
9782 	PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
9783 		      (int)(2*indent + 1), "", SvPVX_const(sv));
9784 
9785         if (OP(node) != OPTIMIZED) {
9786             if (next == NULL)		/* Next ptr. */
9787                 PerlIO_printf(Perl_debug_log, " (0)");
9788             else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
9789                 PerlIO_printf(Perl_debug_log, " (FAIL)");
9790             else
9791                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
9792             (void)PerlIO_putc(Perl_debug_log, '\n');
9793         }
9794 
9795       after_print:
9796 	if (PL_regkind[(U8)op] == BRANCHJ) {
9797 	    assert(next);
9798 	    {
9799                 register const regnode *nnode = (OP(next) == LONGJMP
9800 					     ? regnext((regnode *)next)
9801 					     : next);
9802                 if (last && nnode > last)
9803                     nnode = last;
9804                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
9805 	    }
9806 	}
9807 	else if (PL_regkind[(U8)op] == BRANCH) {
9808 	    assert(next);
9809 	    DUMPUNTIL(NEXTOPER(node), next);
9810 	}
9811 	else if ( PL_regkind[(U8)op]  == TRIE ) {
9812 	    const regnode *this_trie = node;
9813 	    const char op = OP(node);
9814             const U32 n = ARG(node);
9815 	    const reg_ac_data * const ac = op>=AHOCORASICK ?
9816                (reg_ac_data *)ri->data->data[n] :
9817                NULL;
9818 	    const reg_trie_data * const trie =
9819 	        (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
9820 #ifdef DEBUGGING
9821 	    AV *const trie_words = (AV *) ri->data->data[n + TRIE_WORDS_OFFSET];
9822 #endif
9823 	    const regnode *nextbranch= NULL;
9824 	    I32 word_idx;
9825             sv_setpvn(sv, "", 0);
9826 	    for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
9827 		SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
9828 
9829                 PerlIO_printf(Perl_debug_log, "%*s%s ",
9830                    (int)(2*(indent+3)), "",
9831                     elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
9832 	                    PL_colors[0], PL_colors[1],
9833 	                    (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
9834 	                    PERL_PV_PRETTY_ELLIPSES    |
9835 	                    PERL_PV_PRETTY_LTGT
9836                             )
9837                             : "???"
9838                 );
9839                 if (trie->jump) {
9840                     U16 dist= trie->jump[word_idx+1];
9841 		    PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
9842 				  (UV)((dist ? this_trie + dist : next) - start));
9843                     if (dist) {
9844                         if (!nextbranch)
9845                             nextbranch= this_trie + trie->jump[0];
9846 			DUMPUNTIL(this_trie + dist, nextbranch);
9847                     }
9848                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
9849                         nextbranch= regnext((regnode *)nextbranch);
9850                 } else {
9851                     PerlIO_printf(Perl_debug_log, "\n");
9852 		}
9853 	    }
9854 	    if (last && next > last)
9855 	        node= last;
9856 	    else
9857 	        node= next;
9858 	}
9859 	else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
9860 	    DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
9861                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
9862 	}
9863 	else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
9864 	    assert(next);
9865 	    DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
9866 	}
9867 	else if ( op == PLUS || op == STAR) {
9868 	    DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
9869 	}
9870 	else if (op == ANYOF) {
9871 	    /* arglen 1 + class block */
9872 	    node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
9873 		    ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
9874 	    node = NEXTOPER(node);
9875 	}
9876 	else if (PL_regkind[(U8)op] == EXACT) {
9877             /* Literal string, where present. */
9878 	    node += NODE_SZ_STR(node) - 1;
9879 	    node = NEXTOPER(node);
9880 	}
9881 	else {
9882 	    node = NEXTOPER(node);
9883 	    node += regarglen[(U8)op];
9884 	}
9885 	if (op == CURLYX || op == OPEN)
9886 	    indent++;
9887     }
9888     CLEAR_OPTSTART;
9889 #ifdef DEBUG_DUMPUNTIL
9890     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
9891 #endif
9892     return node;
9893 }
9894 
9895 #endif	/* DEBUGGING */
9896 
9897 /*
9898  * Local variables:
9899  * c-indentation-style: bsd
9900  * c-basic-offset: 4
9901  * indent-tabs-mode: t
9902  * End:
9903  *
9904  * ex: set ts=8 sts=4 sw=4 noet:
9905  */
9906