xref: /openbsd-src/gnu/usr.bin/perl/regcomp.c (revision 5054e3e78af0749a9bb00ba9a024b3ee2d90290f)
1 /*    regcomp.c
2  */
3 
4 /*
5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
6  *
7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
8  */
9 
10 /* This file contains functions for compiling a regular expression.  See
11  * also regexec.c which funnily enough, contains functions for executing
12  * a regular expression.
13  *
14  * This file is also copied at build time to ext/re/re_comp.c, where
15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16  * This causes the main functions to be compiled under new names and with
17  * debugging support added, which makes "use re 'debug'" work.
18  */
19 
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21  * confused with the original package (see point 3 below).  Thanks, Henry!
22  */
23 
24 /* Additional note: this code is very heavily munged from Henry's version
25  * in places.  In some spots I've traded clarity for efficiency, so don't
26  * blame Henry for some of the lack of readability.
27  */
28 
29 /* The names of the functions have been changed from regcomp and
30  * regexec to  pregcomp and pregexec in order to avoid conflicts
31  * with the POSIX routines of the same names.
32 */
33 
34 #ifdef PERL_EXT_RE_BUILD
35 #include "re_top.h"
36 #endif
37 
38 /*
39  * pregcomp and pregexec -- regsub and regerror are not used in perl
40  *
41  *	Copyright (c) 1986 by University of Toronto.
42  *	Written by Henry Spencer.  Not derived from licensed software.
43  *
44  *	Permission is granted to anyone to use this software for any
45  *	purpose on any computer system, and to redistribute it freely,
46  *	subject to the following restrictions:
47  *
48  *	1. The author is not responsible for the consequences of use of
49  *		this software, no matter how awful, even if they arise
50  *		from defects in it.
51  *
52  *	2. The origin of this software must not be misrepresented, either
53  *		by explicit claim or by omission.
54  *
55  *	3. Altered versions must be plainly marked as such, and must not
56  *		be misrepresented as being the original software.
57  *
58  *
59  ****    Alterations to Henry's code are...
60  ****
61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63  ****    by Larry Wall and others
64  ****
65  ****    You may distribute under the terms of either the GNU General Public
66  ****    License or the Artistic License, as specified in the README file.
67 
68  *
69  * Beware that some of this code is subtly aware of the way operator
70  * precedence is structured in regular expressions.  Serious changes in
71  * regular-expression syntax might require a total rethink.
72  */
73 #include "EXTERN.h"
74 #define PERL_IN_REGCOMP_C
75 #include "perl.h"
76 
77 #ifndef PERL_IN_XSUB_RE
78 #  include "INTERN.h"
79 #endif
80 
81 #define REG_COMP_C
82 #ifdef PERL_IN_XSUB_RE
83 #  include "re_comp.h"
84 #else
85 #  include "regcomp.h"
86 #endif
87 
88 #ifdef op
89 #undef op
90 #endif /* op */
91 
92 #ifdef MSDOS
93 #  if defined(BUGGY_MSC6)
94  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
95 #    pragma optimize("a",off)
96  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
97 #    pragma optimize("w",on )
98 #  endif /* BUGGY_MSC6 */
99 #endif /* MSDOS */
100 
101 #ifndef STATIC
102 #define	STATIC	static
103 #endif
104 
105 typedef struct RExC_state_t {
106     U32		flags;			/* are we folding, multilining? */
107     char	*precomp;		/* uncompiled string. */
108     regexp	*rx;                    /* perl core regexp structure */
109     regexp_internal	*rxi;           /* internal data for regexp object pprivate field */
110     char	*start;			/* Start of input for compile */
111     char	*end;			/* End of input for compile */
112     char	*parse;			/* Input-scan pointer. */
113     I32		whilem_seen;		/* number of WHILEM in this expr */
114     regnode	*emit_start;		/* Start of emitted-code area */
115     regnode	*emit_bound;		/* First regnode outside of the allocated space */
116     regnode	*emit;			/* Code-emit pointer; &regdummy = don't = compiling */
117     I32		naughty;		/* How bad is this pattern? */
118     I32		sawback;		/* Did we see \1, ...? */
119     U32		seen;
120     I32		size;			/* Code size. */
121     I32		npar;			/* Capture buffer count, (OPEN). */
122     I32		cpar;			/* Capture buffer count, (CLOSE). */
123     I32		nestroot;		/* root parens we are in - used by accept */
124     I32		extralen;
125     I32		seen_zerolen;
126     I32		seen_evals;
127     regnode	**open_parens;		/* pointers to open parens */
128     regnode	**close_parens;		/* pointers to close parens */
129     regnode	*opend;			/* END node in program */
130     I32		utf8;		/* whether the pattern is utf8 or not */
131     I32		orig_utf8;	/* whether the pattern was originally in utf8 */
132 				/* XXX use this for future optimisation of case
133 				 * where pattern must be upgraded to utf8. */
134     HV		*charnames;		/* cache of named sequences */
135     HV		*paren_names;		/* Paren names */
136 
137     regnode	**recurse;		/* Recurse regops */
138     I32		recurse_count;		/* Number of recurse regops */
139 #if ADD_TO_REGEXEC
140     char 	*starttry;		/* -Dr: where regtry was called. */
141 #define RExC_starttry	(pRExC_state->starttry)
142 #endif
143 #ifdef DEBUGGING
144     const char  *lastparse;
145     I32         lastnum;
146     AV          *paren_name_list;       /* idx -> name */
147 #define RExC_lastparse	(pRExC_state->lastparse)
148 #define RExC_lastnum	(pRExC_state->lastnum)
149 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
150 #endif
151 } RExC_state_t;
152 
153 #define RExC_flags	(pRExC_state->flags)
154 #define RExC_precomp	(pRExC_state->precomp)
155 #define RExC_rx		(pRExC_state->rx)
156 #define RExC_rxi	(pRExC_state->rxi)
157 #define RExC_start	(pRExC_state->start)
158 #define RExC_end	(pRExC_state->end)
159 #define RExC_parse	(pRExC_state->parse)
160 #define RExC_whilem_seen	(pRExC_state->whilem_seen)
161 #ifdef RE_TRACK_PATTERN_OFFSETS
162 #define RExC_offsets	(pRExC_state->rxi->u.offsets) /* I am not like the others */
163 #endif
164 #define RExC_emit	(pRExC_state->emit)
165 #define RExC_emit_start	(pRExC_state->emit_start)
166 #define RExC_emit_bound	(pRExC_state->emit_bound)
167 #define RExC_naughty	(pRExC_state->naughty)
168 #define RExC_sawback	(pRExC_state->sawback)
169 #define RExC_seen	(pRExC_state->seen)
170 #define RExC_size	(pRExC_state->size)
171 #define RExC_npar	(pRExC_state->npar)
172 #define RExC_nestroot   (pRExC_state->nestroot)
173 #define RExC_extralen	(pRExC_state->extralen)
174 #define RExC_seen_zerolen	(pRExC_state->seen_zerolen)
175 #define RExC_seen_evals	(pRExC_state->seen_evals)
176 #define RExC_utf8	(pRExC_state->utf8)
177 #define RExC_orig_utf8	(pRExC_state->orig_utf8)
178 #define RExC_charnames  (pRExC_state->charnames)
179 #define RExC_open_parens	(pRExC_state->open_parens)
180 #define RExC_close_parens	(pRExC_state->close_parens)
181 #define RExC_opend	(pRExC_state->opend)
182 #define RExC_paren_names	(pRExC_state->paren_names)
183 #define RExC_recurse	(pRExC_state->recurse)
184 #define RExC_recurse_count	(pRExC_state->recurse_count)
185 
186 
187 #define	ISMULT1(c)	((c) == '*' || (c) == '+' || (c) == '?')
188 #define	ISMULT2(s)	((*s) == '*' || (*s) == '+' || (*s) == '?' || \
189 	((*s) == '{' && regcurly(s)))
190 
191 #ifdef SPSTART
192 #undef SPSTART		/* dratted cpp namespace... */
193 #endif
194 /*
195  * Flags to be passed up and down.
196  */
197 #define	WORST		0	/* Worst case. */
198 #define	HASWIDTH	0x01	/* Known to match non-null strings. */
199 #define	SIMPLE		0x02	/* Simple enough to be STAR/PLUS operand. */
200 #define	SPSTART		0x04	/* Starts with * or +. */
201 #define TRYAGAIN	0x08	/* Weeded out a declaration. */
202 #define POSTPONED	0x10    /* (?1),(?&name), (??{...}) or similar */
203 
204 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
205 
206 /* whether trie related optimizations are enabled */
207 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
208 #define TRIE_STUDY_OPT
209 #define FULL_TRIE_STUDY
210 #define TRIE_STCLASS
211 #endif
212 
213 
214 
215 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
216 #define PBITVAL(paren) (1 << ((paren) & 7))
217 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
218 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
219 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
220 
221 
222 /* About scan_data_t.
223 
224   During optimisation we recurse through the regexp program performing
225   various inplace (keyhole style) optimisations. In addition study_chunk
226   and scan_commit populate this data structure with information about
227   what strings MUST appear in the pattern. We look for the longest
228   string that must appear for at a fixed location, and we look for the
229   longest string that may appear at a floating location. So for instance
230   in the pattern:
231 
232     /FOO[xX]A.*B[xX]BAR/
233 
234   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
235   strings (because they follow a .* construct). study_chunk will identify
236   both FOO and BAR as being the longest fixed and floating strings respectively.
237 
238   The strings can be composites, for instance
239 
240      /(f)(o)(o)/
241 
242   will result in a composite fixed substring 'foo'.
243 
244   For each string some basic information is maintained:
245 
246   - offset or min_offset
247     This is the position the string must appear at, or not before.
248     It also implicitly (when combined with minlenp) tells us how many
249     character must match before the string we are searching.
250     Likewise when combined with minlenp and the length of the string
251     tells us how many characters must appear after the string we have
252     found.
253 
254   - max_offset
255     Only used for floating strings. This is the rightmost point that
256     the string can appear at. Ifset to I32 max it indicates that the
257     string can occur infinitely far to the right.
258 
259   - minlenp
260     A pointer to the minimum length of the pattern that the string
261     was found inside. This is important as in the case of positive
262     lookahead or positive lookbehind we can have multiple patterns
263     involved. Consider
264 
265     /(?=FOO).*F/
266 
267     The minimum length of the pattern overall is 3, the minimum length
268     of the lookahead part is 3, but the minimum length of the part that
269     will actually match is 1. So 'FOO's minimum length is 3, but the
270     minimum length for the F is 1. This is important as the minimum length
271     is used to determine offsets in front of and behind the string being
272     looked for.  Since strings can be composites this is the length of the
273     pattern at the time it was commited with a scan_commit. Note that
274     the length is calculated by study_chunk, so that the minimum lengths
275     are not known until the full pattern has been compiled, thus the
276     pointer to the value.
277 
278   - lookbehind
279 
280     In the case of lookbehind the string being searched for can be
281     offset past the start point of the final matching string.
282     If this value was just blithely removed from the min_offset it would
283     invalidate some of the calculations for how many chars must match
284     before or after (as they are derived from min_offset and minlen and
285     the length of the string being searched for).
286     When the final pattern is compiled and the data is moved from the
287     scan_data_t structure into the regexp structure the information
288     about lookbehind is factored in, with the information that would
289     have been lost precalculated in the end_shift field for the
290     associated string.
291 
292   The fields pos_min and pos_delta are used to store the minimum offset
293   and the delta to the maximum offset at the current point in the pattern.
294 
295 */
296 
297 typedef struct scan_data_t {
298     /*I32 len_min;      unused */
299     /*I32 len_delta;    unused */
300     I32 pos_min;
301     I32 pos_delta;
302     SV *last_found;
303     I32 last_end;	    /* min value, <0 unless valid. */
304     I32 last_start_min;
305     I32 last_start_max;
306     SV **longest;	    /* Either &l_fixed, or &l_float. */
307     SV *longest_fixed;      /* longest fixed string found in pattern */
308     I32 offset_fixed;       /* offset where it starts */
309     I32 *minlen_fixed;      /* pointer to the minlen relevent to the string */
310     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
311     SV *longest_float;      /* longest floating string found in pattern */
312     I32 offset_float_min;   /* earliest point in string it can appear */
313     I32 offset_float_max;   /* latest point in string it can appear */
314     I32 *minlen_float;      /* pointer to the minlen relevent to the string */
315     I32 lookbehind_float;   /* is the position of the string modified by LB */
316     I32 flags;
317     I32 whilem_c;
318     I32 *last_closep;
319     struct regnode_charclass_class *start_class;
320 } scan_data_t;
321 
322 /*
323  * Forward declarations for pregcomp()'s friends.
324  */
325 
326 static const scan_data_t zero_scan_data =
327   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
328 
329 #define SF_BEFORE_EOL		(SF_BEFORE_SEOL|SF_BEFORE_MEOL)
330 #define SF_BEFORE_SEOL		0x0001
331 #define SF_BEFORE_MEOL		0x0002
332 #define SF_FIX_BEFORE_EOL	(SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
333 #define SF_FL_BEFORE_EOL	(SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
334 
335 #ifdef NO_UNARY_PLUS
336 #  define SF_FIX_SHIFT_EOL	(0+2)
337 #  define SF_FL_SHIFT_EOL		(0+4)
338 #else
339 #  define SF_FIX_SHIFT_EOL	(+2)
340 #  define SF_FL_SHIFT_EOL		(+4)
341 #endif
342 
343 #define SF_FIX_BEFORE_SEOL	(SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
344 #define SF_FIX_BEFORE_MEOL	(SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
345 
346 #define SF_FL_BEFORE_SEOL	(SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
347 #define SF_FL_BEFORE_MEOL	(SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
348 #define SF_IS_INF		0x0040
349 #define SF_HAS_PAR		0x0080
350 #define SF_IN_PAR		0x0100
351 #define SF_HAS_EVAL		0x0200
352 #define SCF_DO_SUBSTR		0x0400
353 #define SCF_DO_STCLASS_AND	0x0800
354 #define SCF_DO_STCLASS_OR	0x1000
355 #define SCF_DO_STCLASS		(SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
356 #define SCF_WHILEM_VISITED_POS	0x2000
357 
358 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
359 #define SCF_SEEN_ACCEPT         0x8000
360 
361 #define UTF (RExC_utf8 != 0)
362 #define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
363 #define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
364 
365 #define OOB_UNICODE		12345678
366 #define OOB_NAMEDCLASS		-1
367 
368 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
369 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
370 
371 
372 /* length of regex to show in messages that don't mark a position within */
373 #define RegexLengthToShowInErrorMessages 127
374 
375 /*
376  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
377  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
378  * op/pragma/warn/regcomp.
379  */
380 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
381 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
382 
383 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
384 
385 /*
386  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
387  * arg. Show regex, up to a maximum length. If it's too long, chop and add
388  * "...".
389  */
390 #define _FAIL(code) STMT_START {					\
391     const char *ellipses = "";						\
392     IV len = RExC_end - RExC_precomp;					\
393 									\
394     if (!SIZE_ONLY)							\
395 	SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);			\
396     if (len > RegexLengthToShowInErrorMessages) {			\
397 	/* chop 10 shorter than the max, to ensure meaning of "..." */	\
398 	len = RegexLengthToShowInErrorMessages - 10;			\
399 	ellipses = "...";						\
400     }									\
401     code;                                                               \
402 } STMT_END
403 
404 #define	FAIL(msg) _FAIL(			    \
405     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",	    \
406 	    msg, (int)len, RExC_precomp, ellipses))
407 
408 #define	FAIL2(msg,arg) _FAIL(			    \
409     Perl_croak(aTHX_ msg " in regex m/%.*s%s/",	    \
410 	    arg, (int)len, RExC_precomp, ellipses))
411 
412 /*
413  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
414  */
415 #define	Simple_vFAIL(m) STMT_START {					\
416     const IV offset = RExC_parse - RExC_precomp;			\
417     Perl_croak(aTHX_ "%s" REPORT_LOCATION,				\
418 	    m, (int)offset, RExC_precomp, RExC_precomp + offset);	\
419 } STMT_END
420 
421 /*
422  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
423  */
424 #define	vFAIL(m) STMT_START {				\
425     if (!SIZE_ONLY)					\
426 	SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);	\
427     Simple_vFAIL(m);					\
428 } STMT_END
429 
430 /*
431  * Like Simple_vFAIL(), but accepts two arguments.
432  */
433 #define	Simple_vFAIL2(m,a1) STMT_START {			\
434     const IV offset = RExC_parse - RExC_precomp;			\
435     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,			\
436 	    (int)offset, RExC_precomp, RExC_precomp + offset);	\
437 } STMT_END
438 
439 /*
440  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
441  */
442 #define	vFAIL2(m,a1) STMT_START {			\
443     if (!SIZE_ONLY)					\
444 	SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);	\
445     Simple_vFAIL2(m, a1);				\
446 } STMT_END
447 
448 
449 /*
450  * Like Simple_vFAIL(), but accepts three arguments.
451  */
452 #define	Simple_vFAIL3(m, a1, a2) STMT_START {			\
453     const IV offset = RExC_parse - RExC_precomp;		\
454     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,		\
455 	    (int)offset, RExC_precomp, RExC_precomp + offset);	\
456 } STMT_END
457 
458 /*
459  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
460  */
461 #define	vFAIL3(m,a1,a2) STMT_START {			\
462     if (!SIZE_ONLY)					\
463 	SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);	\
464     Simple_vFAIL3(m, a1, a2);				\
465 } STMT_END
466 
467 /*
468  * Like Simple_vFAIL(), but accepts four arguments.
469  */
470 #define	Simple_vFAIL4(m, a1, a2, a3) STMT_START {		\
471     const IV offset = RExC_parse - RExC_precomp;		\
472     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,		\
473 	    (int)offset, RExC_precomp, RExC_precomp + offset);	\
474 } STMT_END
475 
476 #define	vWARN(loc,m) STMT_START {					\
477     const IV offset = loc - RExC_precomp;				\
478     Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,	\
479 	    m, (int)offset, RExC_precomp, RExC_precomp + offset);	\
480 } STMT_END
481 
482 #define	vWARNdep(loc,m) STMT_START {					\
483     const IV offset = loc - RExC_precomp;				\
484     Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),		\
485 	    "%s" REPORT_LOCATION,					\
486 	    m, (int)offset, RExC_precomp, RExC_precomp + offset);	\
487 } STMT_END
488 
489 
490 #define	vWARN2(loc, m, a1) STMT_START {					\
491     const IV offset = loc - RExC_precomp;				\
492     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,		\
493 	    a1, (int)offset, RExC_precomp, RExC_precomp + offset);	\
494 } STMT_END
495 
496 #define	vWARN3(loc, m, a1, a2) STMT_START {				\
497     const IV offset = loc - RExC_precomp;				\
498     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,		\
499 	    a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);	\
500 } STMT_END
501 
502 #define	vWARN4(loc, m, a1, a2, a3) STMT_START {				\
503     const IV offset = loc - RExC_precomp;				\
504     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,		\
505 	    a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
506 } STMT_END
507 
508 #define	vWARN5(loc, m, a1, a2, a3, a4) STMT_START {			\
509     const IV offset = loc - RExC_precomp;				\
510     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,		\
511 	    a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
512 } STMT_END
513 
514 
515 /* Allow for side effects in s */
516 #define REGC(c,s) STMT_START {			\
517     if (!SIZE_ONLY) *(s) = (c); else (void)(s);	\
518 } STMT_END
519 
520 /* Macros for recording node offsets.   20001227 mjd@plover.com
521  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
522  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
523  * Element 0 holds the number n.
524  * Position is 1 indexed.
525  */
526 #ifndef RE_TRACK_PATTERN_OFFSETS
527 #define Set_Node_Offset_To_R(node,byte)
528 #define Set_Node_Offset(node,byte)
529 #define Set_Cur_Node_Offset
530 #define Set_Node_Length_To_R(node,len)
531 #define Set_Node_Length(node,len)
532 #define Set_Node_Cur_Length(node)
533 #define Node_Offset(n)
534 #define Node_Length(n)
535 #define Set_Node_Offset_Length(node,offset,len)
536 #define ProgLen(ri) ri->u.proglen
537 #define SetProgLen(ri,x) ri->u.proglen = x
538 #else
539 #define ProgLen(ri) ri->u.offsets[0]
540 #define SetProgLen(ri,x) ri->u.offsets[0] = x
541 #define Set_Node_Offset_To_R(node,byte) STMT_START {			\
542     if (! SIZE_ONLY) {							\
543 	MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",		\
544 		    __LINE__, (int)(node), (int)(byte)));		\
545 	if((node) < 0) {						\
546 	    Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
547 	} else {							\
548 	    RExC_offsets[2*(node)-1] = (byte);				\
549 	}								\
550     }									\
551 } STMT_END
552 
553 #define Set_Node_Offset(node,byte) \
554     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
555 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
556 
557 #define Set_Node_Length_To_R(node,len) STMT_START {			\
558     if (! SIZE_ONLY) {							\
559 	MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",		\
560 		__LINE__, (int)(node), (int)(len)));			\
561 	if((node) < 0) {						\
562 	    Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
563 	} else {							\
564 	    RExC_offsets[2*(node)] = (len);				\
565 	}								\
566     }									\
567 } STMT_END
568 
569 #define Set_Node_Length(node,len) \
570     Set_Node_Length_To_R((node)-RExC_emit_start, len)
571 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
572 #define Set_Node_Cur_Length(node) \
573     Set_Node_Length(node, RExC_parse - parse_start)
574 
575 /* Get offsets and lengths */
576 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
577 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
578 
579 #define Set_Node_Offset_Length(node,offset,len) STMT_START {	\
580     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));	\
581     Set_Node_Length_To_R((node)-RExC_emit_start, (len));	\
582 } STMT_END
583 #endif
584 
585 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
586 #define EXPERIMENTAL_INPLACESCAN
587 #endif /*RE_TRACK_PATTERN_OFFSETS*/
588 
589 #define DEBUG_STUDYDATA(str,data,depth)                              \
590 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
591     PerlIO_printf(Perl_debug_log,                                    \
592         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
593         " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
594         (int)(depth)*2, "",                                          \
595         (IV)((data)->pos_min),                                       \
596         (IV)((data)->pos_delta),                                     \
597         (UV)((data)->flags),                                         \
598         (IV)((data)->whilem_c),                                      \
599         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
600         is_inf ? "INF " : ""                                         \
601     );                                                               \
602     if ((data)->last_found)                                          \
603         PerlIO_printf(Perl_debug_log,                                \
604             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
605             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
606             SvPVX_const((data)->last_found),                         \
607             (IV)((data)->last_end),                                  \
608             (IV)((data)->last_start_min),                            \
609             (IV)((data)->last_start_max),                            \
610             ((data)->longest &&                                      \
611              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
612             SvPVX_const((data)->longest_fixed),                      \
613             (IV)((data)->offset_fixed),                              \
614             ((data)->longest &&                                      \
615              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
616             SvPVX_const((data)->longest_float),                      \
617             (IV)((data)->offset_float_min),                          \
618             (IV)((data)->offset_float_max)                           \
619         );                                                           \
620     PerlIO_printf(Perl_debug_log,"\n");                              \
621 });
622 
623 static void clear_re(pTHX_ void *r);
624 
625 /* Mark that we cannot extend a found fixed substring at this point.
626    Update the longest found anchored substring and the longest found
627    floating substrings if needed. */
628 
629 STATIC void
630 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
631 {
632     const STRLEN l = CHR_SVLEN(data->last_found);
633     const STRLEN old_l = CHR_SVLEN(*data->longest);
634     GET_RE_DEBUG_FLAGS_DECL;
635 
636     PERL_ARGS_ASSERT_SCAN_COMMIT;
637 
638     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
639 	SvSetMagicSV(*data->longest, data->last_found);
640 	if (*data->longest == data->longest_fixed) {
641 	    data->offset_fixed = l ? data->last_start_min : data->pos_min;
642 	    if (data->flags & SF_BEFORE_EOL)
643 		data->flags
644 		    |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
645 	    else
646 		data->flags &= ~SF_FIX_BEFORE_EOL;
647 	    data->minlen_fixed=minlenp;
648 	    data->lookbehind_fixed=0;
649 	}
650 	else { /* *data->longest == data->longest_float */
651 	    data->offset_float_min = l ? data->last_start_min : data->pos_min;
652 	    data->offset_float_max = (l
653 				      ? data->last_start_max
654 				      : data->pos_min + data->pos_delta);
655 	    if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
656 		data->offset_float_max = I32_MAX;
657 	    if (data->flags & SF_BEFORE_EOL)
658 		data->flags
659 		    |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
660 	    else
661 		data->flags &= ~SF_FL_BEFORE_EOL;
662             data->minlen_float=minlenp;
663             data->lookbehind_float=0;
664 	}
665     }
666     SvCUR_set(data->last_found, 0);
667     {
668 	SV * const sv = data->last_found;
669 	if (SvUTF8(sv) && SvMAGICAL(sv)) {
670 	    MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
671 	    if (mg)
672 		mg->mg_len = 0;
673 	}
674     }
675     data->last_end = -1;
676     data->flags &= ~SF_BEFORE_EOL;
677     DEBUG_STUDYDATA("commit: ",data,0);
678 }
679 
680 /* Can match anything (initialization) */
681 STATIC void
682 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
683 {
684     PERL_ARGS_ASSERT_CL_ANYTHING;
685 
686     ANYOF_CLASS_ZERO(cl);
687     ANYOF_BITMAP_SETALL(cl);
688     cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
689     if (LOC)
690 	cl->flags |= ANYOF_LOCALE;
691 }
692 
693 /* Can match anything (initialization) */
694 STATIC int
695 S_cl_is_anything(const struct regnode_charclass_class *cl)
696 {
697     int value;
698 
699     PERL_ARGS_ASSERT_CL_IS_ANYTHING;
700 
701     for (value = 0; value <= ANYOF_MAX; value += 2)
702 	if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
703 	    return 1;
704     if (!(cl->flags & ANYOF_UNICODE_ALL))
705 	return 0;
706     if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
707 	return 0;
708     return 1;
709 }
710 
711 /* Can match anything (initialization) */
712 STATIC void
713 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
714 {
715     PERL_ARGS_ASSERT_CL_INIT;
716 
717     Zero(cl, 1, struct regnode_charclass_class);
718     cl->type = ANYOF;
719     cl_anything(pRExC_state, cl);
720 }
721 
722 STATIC void
723 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
724 {
725     PERL_ARGS_ASSERT_CL_INIT_ZERO;
726 
727     Zero(cl, 1, struct regnode_charclass_class);
728     cl->type = ANYOF;
729     cl_anything(pRExC_state, cl);
730     if (LOC)
731 	cl->flags |= ANYOF_LOCALE;
732 }
733 
734 /* 'And' a given class with another one.  Can create false positives */
735 /* We assume that cl is not inverted */
736 STATIC void
737 S_cl_and(struct regnode_charclass_class *cl,
738 	const struct regnode_charclass_class *and_with)
739 {
740     PERL_ARGS_ASSERT_CL_AND;
741 
742     assert(and_with->type == ANYOF);
743     if (!(and_with->flags & ANYOF_CLASS)
744 	&& !(cl->flags & ANYOF_CLASS)
745 	&& (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
746 	&& !(and_with->flags & ANYOF_FOLD)
747 	&& !(cl->flags & ANYOF_FOLD)) {
748 	int i;
749 
750 	if (and_with->flags & ANYOF_INVERT)
751 	    for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
752 		cl->bitmap[i] &= ~and_with->bitmap[i];
753 	else
754 	    for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
755 		cl->bitmap[i] &= and_with->bitmap[i];
756     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
757     if (!(and_with->flags & ANYOF_EOS))
758 	cl->flags &= ~ANYOF_EOS;
759 
760     if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
761 	!(and_with->flags & ANYOF_INVERT)) {
762 	cl->flags &= ~ANYOF_UNICODE_ALL;
763 	cl->flags |= ANYOF_UNICODE;
764 	ARG_SET(cl, ARG(and_with));
765     }
766     if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
767 	!(and_with->flags & ANYOF_INVERT))
768 	cl->flags &= ~ANYOF_UNICODE_ALL;
769     if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
770 	!(and_with->flags & ANYOF_INVERT))
771 	cl->flags &= ~ANYOF_UNICODE;
772 }
773 
774 /* 'OR' a given class with another one.  Can create false positives */
775 /* We assume that cl is not inverted */
776 STATIC void
777 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
778 {
779     PERL_ARGS_ASSERT_CL_OR;
780 
781     if (or_with->flags & ANYOF_INVERT) {
782 	/* We do not use
783 	 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
784 	 *   <= (B1 | !B2) | (CL1 | !CL2)
785 	 * which is wasteful if CL2 is small, but we ignore CL2:
786 	 *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
787 	 * XXXX Can we handle case-fold?  Unclear:
788 	 *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
789 	 *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
790 	 */
791 	if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
792 	     && !(or_with->flags & ANYOF_FOLD)
793 	     && !(cl->flags & ANYOF_FOLD) ) {
794 	    int i;
795 
796 	    for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
797 		cl->bitmap[i] |= ~or_with->bitmap[i];
798 	} /* XXXX: logic is complicated otherwise */
799 	else {
800 	    cl_anything(pRExC_state, cl);
801 	}
802     } else {
803 	/* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
804 	if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
805 	     && (!(or_with->flags & ANYOF_FOLD)
806 		 || (cl->flags & ANYOF_FOLD)) ) {
807 	    int i;
808 
809 	    /* OR char bitmap and class bitmap separately */
810 	    for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
811 		cl->bitmap[i] |= or_with->bitmap[i];
812 	    if (or_with->flags & ANYOF_CLASS) {
813 		for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
814 		    cl->classflags[i] |= or_with->classflags[i];
815 		cl->flags |= ANYOF_CLASS;
816 	    }
817 	}
818 	else { /* XXXX: logic is complicated, leave it along for a moment. */
819 	    cl_anything(pRExC_state, cl);
820 	}
821     }
822     if (or_with->flags & ANYOF_EOS)
823 	cl->flags |= ANYOF_EOS;
824 
825     if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
826 	ARG(cl) != ARG(or_with)) {
827 	cl->flags |= ANYOF_UNICODE_ALL;
828 	cl->flags &= ~ANYOF_UNICODE;
829     }
830     if (or_with->flags & ANYOF_UNICODE_ALL) {
831 	cl->flags |= ANYOF_UNICODE_ALL;
832 	cl->flags &= ~ANYOF_UNICODE;
833     }
834 }
835 
836 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
837 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
838 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
839 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
840 
841 
842 #ifdef DEBUGGING
843 /*
844    dump_trie(trie,widecharmap,revcharmap)
845    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
846    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
847 
848    These routines dump out a trie in a somewhat readable format.
849    The _interim_ variants are used for debugging the interim
850    tables that are used to generate the final compressed
851    representation which is what dump_trie expects.
852 
853    Part of the reason for their existance is to provide a form
854    of documentation as to how the different representations function.
855 
856 */
857 
858 /*
859   Dumps the final compressed table form of the trie to Perl_debug_log.
860   Used for debugging make_trie().
861 */
862 
863 STATIC void
864 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
865 	    AV *revcharmap, U32 depth)
866 {
867     U32 state;
868     SV *sv=sv_newmortal();
869     int colwidth= widecharmap ? 6 : 4;
870     GET_RE_DEBUG_FLAGS_DECL;
871 
872     PERL_ARGS_ASSERT_DUMP_TRIE;
873 
874     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
875         (int)depth * 2 + 2,"",
876         "Match","Base","Ofs" );
877 
878     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
879 	SV ** const tmp = av_fetch( revcharmap, state, 0);
880         if ( tmp ) {
881             PerlIO_printf( Perl_debug_log, "%*s",
882                 colwidth,
883                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
884 	                    PL_colors[0], PL_colors[1],
885 	                    (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
886 	                    PERL_PV_ESCAPE_FIRSTCHAR
887                 )
888             );
889         }
890     }
891     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
892         (int)depth * 2 + 2,"");
893 
894     for( state = 0 ; state < trie->uniquecharcount ; state++ )
895         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
896     PerlIO_printf( Perl_debug_log, "\n");
897 
898     for( state = 1 ; state < trie->statecount ; state++ ) {
899 	const U32 base = trie->states[ state ].trans.base;
900 
901         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
902 
903         if ( trie->states[ state ].wordnum ) {
904             PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
905         } else {
906             PerlIO_printf( Perl_debug_log, "%6s", "" );
907         }
908 
909         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
910 
911         if ( base ) {
912             U32 ofs = 0;
913 
914             while( ( base + ofs  < trie->uniquecharcount ) ||
915                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
916                      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
917                     ofs++;
918 
919             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
920 
921             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
922                 if ( ( base + ofs >= trie->uniquecharcount ) &&
923                      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
924                      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
925                 {
926                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
927                     colwidth,
928                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
929                 } else {
930                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
931                 }
932             }
933 
934             PerlIO_printf( Perl_debug_log, "]");
935 
936         }
937         PerlIO_printf( Perl_debug_log, "\n" );
938     }
939 }
940 /*
941   Dumps a fully constructed but uncompressed trie in list form.
942   List tries normally only are used for construction when the number of
943   possible chars (trie->uniquecharcount) is very high.
944   Used for debugging make_trie().
945 */
946 STATIC void
947 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
948 			 HV *widecharmap, AV *revcharmap, U32 next_alloc,
949 			 U32 depth)
950 {
951     U32 state;
952     SV *sv=sv_newmortal();
953     int colwidth= widecharmap ? 6 : 4;
954     GET_RE_DEBUG_FLAGS_DECL;
955 
956     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
957 
958     /* print out the table precompression.  */
959     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
960         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
961         "------:-----+-----------------\n" );
962 
963     for( state=1 ; state < next_alloc ; state ++ ) {
964         U16 charid;
965 
966         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
967             (int)depth * 2 + 2,"", (UV)state  );
968         if ( ! trie->states[ state ].wordnum ) {
969             PerlIO_printf( Perl_debug_log, "%5s| ","");
970         } else {
971             PerlIO_printf( Perl_debug_log, "W%4x| ",
972                 trie->states[ state ].wordnum
973             );
974         }
975         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
976 	    SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
977 	    if ( tmp ) {
978                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
979                     colwidth,
980                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
981 	                    PL_colors[0], PL_colors[1],
982 	                    (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
983 	                    PERL_PV_ESCAPE_FIRSTCHAR
984                     ) ,
985                     TRIE_LIST_ITEM(state,charid).forid,
986                     (UV)TRIE_LIST_ITEM(state,charid).newstate
987                 );
988                 if (!(charid % 10))
989                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
990                         (int)((depth * 2) + 14), "");
991             }
992         }
993         PerlIO_printf( Perl_debug_log, "\n");
994     }
995 }
996 
997 /*
998   Dumps a fully constructed but uncompressed trie in table form.
999   This is the normal DFA style state transition table, with a few
1000   twists to facilitate compression later.
1001   Used for debugging make_trie().
1002 */
1003 STATIC void
1004 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1005 			  HV *widecharmap, AV *revcharmap, U32 next_alloc,
1006 			  U32 depth)
1007 {
1008     U32 state;
1009     U16 charid;
1010     SV *sv=sv_newmortal();
1011     int colwidth= widecharmap ? 6 : 4;
1012     GET_RE_DEBUG_FLAGS_DECL;
1013 
1014     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1015 
1016     /*
1017        print out the table precompression so that we can do a visual check
1018        that they are identical.
1019      */
1020 
1021     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1022 
1023     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1024 	SV ** const tmp = av_fetch( revcharmap, charid, 0);
1025         if ( tmp ) {
1026             PerlIO_printf( Perl_debug_log, "%*s",
1027                 colwidth,
1028                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1029 	                    PL_colors[0], PL_colors[1],
1030 	                    (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1031 	                    PERL_PV_ESCAPE_FIRSTCHAR
1032                 )
1033             );
1034         }
1035     }
1036 
1037     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1038 
1039     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1040         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1041     }
1042 
1043     PerlIO_printf( Perl_debug_log, "\n" );
1044 
1045     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1046 
1047         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1048             (int)depth * 2 + 2,"",
1049             (UV)TRIE_NODENUM( state ) );
1050 
1051         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1052             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1053             if (v)
1054                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1055             else
1056                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1057         }
1058         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1059             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1060         } else {
1061             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1062             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1063         }
1064     }
1065 }
1066 
1067 #endif
1068 
1069 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1070   startbranch: the first branch in the whole branch sequence
1071   first      : start branch of sequence of branch-exact nodes.
1072 	       May be the same as startbranch
1073   last       : Thing following the last branch.
1074 	       May be the same as tail.
1075   tail       : item following the branch sequence
1076   count      : words in the sequence
1077   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1078   depth      : indent depth
1079 
1080 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1081 
1082 A trie is an N'ary tree where the branches are determined by digital
1083 decomposition of the key. IE, at the root node you look up the 1st character and
1084 follow that branch repeat until you find the end of the branches. Nodes can be
1085 marked as "accepting" meaning they represent a complete word. Eg:
1086 
1087   /he|she|his|hers/
1088 
1089 would convert into the following structure. Numbers represent states, letters
1090 following numbers represent valid transitions on the letter from that state, if
1091 the number is in square brackets it represents an accepting state, otherwise it
1092 will be in parenthesis.
1093 
1094       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1095       |    |
1096       |   (2)
1097       |    |
1098      (1)   +-i->(6)-+-s->[7]
1099       |
1100       +-s->(3)-+-h->(4)-+-e->[5]
1101 
1102       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1103 
1104 This shows that when matching against the string 'hers' we will begin at state 1
1105 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1106 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1107 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1108 single traverse. We store a mapping from accepting to state to which word was
1109 matched, and then when we have multiple possibilities we try to complete the
1110 rest of the regex in the order in which they occured in the alternation.
1111 
1112 The only prior NFA like behaviour that would be changed by the TRIE support is
1113 the silent ignoring of duplicate alternations which are of the form:
1114 
1115  / (DUPE|DUPE) X? (?{ ... }) Y /x
1116 
1117 Thus EVAL blocks follwing a trie may be called a different number of times with
1118 and without the optimisation. With the optimisations dupes will be silently
1119 ignored. This inconsistant behaviour of EVAL type nodes is well established as
1120 the following demonstrates:
1121 
1122  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1123 
1124 which prints out 'word' three times, but
1125 
1126  'words'=~/(word|word|word)(?{ print $1 })S/
1127 
1128 which doesnt print it out at all. This is due to other optimisations kicking in.
1129 
1130 Example of what happens on a structural level:
1131 
1132 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1133 
1134    1: CURLYM[1] {1,32767}(18)
1135    5:   BRANCH(8)
1136    6:     EXACT <ac>(16)
1137    8:   BRANCH(11)
1138    9:     EXACT <ad>(16)
1139   11:   BRANCH(14)
1140   12:     EXACT <ab>(16)
1141   16:   SUCCEED(0)
1142   17:   NOTHING(18)
1143   18: END(0)
1144 
1145 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1146 and should turn into:
1147 
1148    1: CURLYM[1] {1,32767}(18)
1149    5:   TRIE(16)
1150 	[Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1151 	  <ac>
1152 	  <ad>
1153 	  <ab>
1154   16:   SUCCEED(0)
1155   17:   NOTHING(18)
1156   18: END(0)
1157 
1158 Cases where tail != last would be like /(?foo|bar)baz/:
1159 
1160    1: BRANCH(4)
1161    2:   EXACT <foo>(8)
1162    4: BRANCH(7)
1163    5:   EXACT <bar>(8)
1164    7: TAIL(8)
1165    8: EXACT <baz>(10)
1166   10: END(0)
1167 
1168 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1169 and would end up looking like:
1170 
1171     1: TRIE(8)
1172       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1173 	<foo>
1174 	<bar>
1175    7: TAIL(8)
1176    8: EXACT <baz>(10)
1177   10: END(0)
1178 
1179     d = uvuni_to_utf8_flags(d, uv, 0);
1180 
1181 is the recommended Unicode-aware way of saying
1182 
1183     *(d++) = uv;
1184 */
1185 
1186 #define TRIE_STORE_REVCHAR                                                 \
1187     STMT_START {                                                           \
1188 	if (UTF) {							   \
1189 	    SV *zlopp = newSV(2);					   \
1190 	    unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);	   \
1191 	    unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1192 	    SvCUR_set(zlopp, kapow - flrbbbbb);				   \
1193 	    SvPOK_on(zlopp);						   \
1194 	    SvUTF8_on(zlopp);						   \
1195 	    av_push(revcharmap, zlopp);					   \
1196 	} else {							   \
1197 	    char ooooff = (char)uvc;					   	   \
1198 	    av_push(revcharmap, newSVpvn(&ooooff, 1));			   \
1199 	}								   \
1200         } STMT_END
1201 
1202 #define TRIE_READ_CHAR STMT_START {                                           \
1203     wordlen++;                                                                \
1204     if ( UTF ) {                                                              \
1205 	if ( folder ) {                                                       \
1206 	    if ( foldlen > 0 ) {                                              \
1207 	       uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags );     \
1208 	       foldlen -= len;                                                \
1209 	       scan += len;                                                   \
1210 	       len = 0;                                                       \
1211 	    } else {                                                          \
1212 		uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1213 		uvc = to_uni_fold( uvc, foldbuf, &foldlen );                  \
1214 		foldlen -= UNISKIP( uvc );                                    \
1215 		scan = foldbuf + UNISKIP( uvc );                              \
1216 	    }                                                                 \
1217 	} else {                                                              \
1218 	    uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1219 	}                                                                     \
1220     } else {                                                                  \
1221 	uvc = (U32)*uc;                                                       \
1222 	len = 1;                                                              \
1223     }                                                                         \
1224 } STMT_END
1225 
1226 
1227 
1228 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1229     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1230 	U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1231 	Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1232     }                                                           \
1233     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1234     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1235     TRIE_LIST_CUR( state )++;                                   \
1236 } STMT_END
1237 
1238 #define TRIE_LIST_NEW(state) STMT_START {                       \
1239     Newxz( trie->states[ state ].trans.list,               \
1240 	4, reg_trie_trans_le );                                 \
1241      TRIE_LIST_CUR( state ) = 1;                                \
1242      TRIE_LIST_LEN( state ) = 4;                                \
1243 } STMT_END
1244 
1245 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1246     U16 dupe= trie->states[ state ].wordnum;                    \
1247     regnode * const noper_next = regnext( noper );              \
1248                                                                 \
1249     if (trie->wordlen)                                          \
1250         trie->wordlen[ curword ] = wordlen;                     \
1251     DEBUG_r({                                                   \
1252         /* store the word for dumping */                        \
1253         SV* tmp;                                                \
1254         if (OP(noper) != NOTHING)                               \
1255             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);	\
1256         else                                                    \
1257             tmp = newSVpvn_utf8( "", 0, UTF );			\
1258         av_push( trie_words, tmp );                             \
1259     });                                                         \
1260                                                                 \
1261     curword++;                                                  \
1262                                                                 \
1263     if ( noper_next < tail ) {                                  \
1264         if (!trie->jump)                                        \
1265             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1266         trie->jump[curword] = (U16)(noper_next - convert);      \
1267         if (!jumper)                                            \
1268             jumper = noper_next;                                \
1269         if (!nextbranch)                                        \
1270             nextbranch= regnext(cur);                           \
1271     }                                                           \
1272                                                                 \
1273     if ( dupe ) {                                               \
1274         /* So it's a dupe. This means we need to maintain a   */\
1275         /* linked-list from the first to the next.            */\
1276         /* we only allocate the nextword buffer when there    */\
1277         /* a dupe, so first time we have to do the allocation */\
1278         if (!trie->nextword)                                    \
1279             trie->nextword = (U16 *)					\
1280 		PerlMemShared_calloc( word_count + 1, sizeof(U16));	\
1281         while ( trie->nextword[dupe] )                          \
1282             dupe= trie->nextword[dupe];                         \
1283         trie->nextword[dupe]= curword;                          \
1284     } else {                                                    \
1285         /* we haven't inserted this word yet.                */ \
1286         trie->states[ state ].wordnum = curword;                \
1287     }                                                           \
1288 } STMT_END
1289 
1290 
1291 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)		\
1292      ( ( base + charid >=  ucharcount					\
1293          && base + charid < ubound					\
1294          && state == trie->trans[ base - ucharcount + charid ].check	\
1295          && trie->trans[ base - ucharcount + charid ].next )		\
1296            ? trie->trans[ base - ucharcount + charid ].next		\
1297            : ( state==1 ? special : 0 )					\
1298       )
1299 
1300 #define MADE_TRIE       1
1301 #define MADE_JUMP_TRIE  2
1302 #define MADE_EXACT_TRIE 4
1303 
1304 STATIC I32
1305 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1306 {
1307     dVAR;
1308     /* first pass, loop through and scan words */
1309     reg_trie_data *trie;
1310     HV *widecharmap = NULL;
1311     AV *revcharmap = newAV();
1312     regnode *cur;
1313     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1314     STRLEN len = 0;
1315     UV uvc = 0;
1316     U16 curword = 0;
1317     U32 next_alloc = 0;
1318     regnode *jumper = NULL;
1319     regnode *nextbranch = NULL;
1320     regnode *convert = NULL;
1321     /* we just use folder as a flag in utf8 */
1322     const U8 * const folder = ( flags == EXACTF
1323                        ? PL_fold
1324                        : ( flags == EXACTFL
1325                            ? PL_fold_locale
1326                            : NULL
1327                          )
1328                      );
1329 
1330 #ifdef DEBUGGING
1331     const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1332     AV *trie_words = NULL;
1333     /* along with revcharmap, this only used during construction but both are
1334      * useful during debugging so we store them in the struct when debugging.
1335      */
1336 #else
1337     const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1338     STRLEN trie_charcount=0;
1339 #endif
1340     SV *re_trie_maxbuff;
1341     GET_RE_DEBUG_FLAGS_DECL;
1342 
1343     PERL_ARGS_ASSERT_MAKE_TRIE;
1344 #ifndef DEBUGGING
1345     PERL_UNUSED_ARG(depth);
1346 #endif
1347 
1348     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1349     trie->refcount = 1;
1350     trie->startstate = 1;
1351     trie->wordcount = word_count;
1352     RExC_rxi->data->data[ data_slot ] = (void*)trie;
1353     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1354     if (!(UTF && folder))
1355 	trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1356     DEBUG_r({
1357         trie_words = newAV();
1358     });
1359 
1360     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1361     if (!SvIOK(re_trie_maxbuff)) {
1362         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1363     }
1364     DEBUG_OPTIMISE_r({
1365                 PerlIO_printf( Perl_debug_log,
1366                   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1367                   (int)depth * 2 + 2, "",
1368                   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1369                   REG_NODE_NUM(last), REG_NODE_NUM(tail),
1370                   (int)depth);
1371     });
1372 
1373    /* Find the node we are going to overwrite */
1374     if ( first == startbranch && OP( last ) != BRANCH ) {
1375         /* whole branch chain */
1376         convert = first;
1377     } else {
1378         /* branch sub-chain */
1379         convert = NEXTOPER( first );
1380     }
1381 
1382     /*  -- First loop and Setup --
1383 
1384        We first traverse the branches and scan each word to determine if it
1385        contains widechars, and how many unique chars there are, this is
1386        important as we have to build a table with at least as many columns as we
1387        have unique chars.
1388 
1389        We use an array of integers to represent the character codes 0..255
1390        (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1391        native representation of the character value as the key and IV's for the
1392        coded index.
1393 
1394        *TODO* If we keep track of how many times each character is used we can
1395        remap the columns so that the table compression later on is more
1396        efficient in terms of memory by ensuring most common value is in the
1397        middle and the least common are on the outside.  IMO this would be better
1398        than a most to least common mapping as theres a decent chance the most
1399        common letter will share a node with the least common, meaning the node
1400        will not be compressable. With a middle is most common approach the worst
1401        case is when we have the least common nodes twice.
1402 
1403      */
1404 
1405     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1406         regnode * const noper = NEXTOPER( cur );
1407         const U8 *uc = (U8*)STRING( noper );
1408         const U8 * const e  = uc + STR_LEN( noper );
1409         STRLEN foldlen = 0;
1410         U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1411         const U8 *scan = (U8*)NULL;
1412         U32 wordlen      = 0;         /* required init */
1413         STRLEN chars = 0;
1414         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1415 
1416         if (OP(noper) == NOTHING) {
1417             trie->minlen= 0;
1418             continue;
1419         }
1420         if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1421             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1422                                           regardless of encoding */
1423 
1424         for ( ; uc < e ; uc += len ) {
1425             TRIE_CHARCOUNT(trie)++;
1426             TRIE_READ_CHAR;
1427             chars++;
1428             if ( uvc < 256 ) {
1429                 if ( !trie->charmap[ uvc ] ) {
1430                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1431                     if ( folder )
1432                         trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1433                     TRIE_STORE_REVCHAR;
1434                 }
1435                 if ( set_bit ) {
1436                     /* store the codepoint in the bitmap, and if its ascii
1437                        also store its folded equivelent. */
1438                     TRIE_BITMAP_SET(trie,uvc);
1439 
1440 		    /* store the folded codepoint */
1441 		    if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1442 
1443 		    if ( !UTF ) {
1444 			/* store first byte of utf8 representation of
1445 			   codepoints in the 127 < uvc < 256 range */
1446 			if (127 < uvc && uvc < 192) {
1447 			    TRIE_BITMAP_SET(trie,194);
1448 			} else if (191 < uvc ) {
1449 			    TRIE_BITMAP_SET(trie,195);
1450 			/* && uvc < 256 -- we know uvc is < 256 already */
1451 			}
1452 		    }
1453                     set_bit = 0; /* We've done our bit :-) */
1454                 }
1455             } else {
1456                 SV** svpp;
1457                 if ( !widecharmap )
1458                     widecharmap = newHV();
1459 
1460                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1461 
1462                 if ( !svpp )
1463                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1464 
1465                 if ( !SvTRUE( *svpp ) ) {
1466                     sv_setiv( *svpp, ++trie->uniquecharcount );
1467                     TRIE_STORE_REVCHAR;
1468                 }
1469             }
1470         }
1471         if( cur == first ) {
1472             trie->minlen=chars;
1473             trie->maxlen=chars;
1474         } else if (chars < trie->minlen) {
1475             trie->minlen=chars;
1476         } else if (chars > trie->maxlen) {
1477             trie->maxlen=chars;
1478         }
1479 
1480     } /* end first pass */
1481     DEBUG_TRIE_COMPILE_r(
1482         PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1483                 (int)depth * 2 + 2,"",
1484                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1485 		(int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1486 		(int)trie->minlen, (int)trie->maxlen )
1487     );
1488     trie->wordlen = (U32 *) PerlMemShared_calloc( word_count, sizeof(U32) );
1489 
1490     /*
1491         We now know what we are dealing with in terms of unique chars and
1492         string sizes so we can calculate how much memory a naive
1493         representation using a flat table  will take. If it's over a reasonable
1494         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1495         conservative but potentially much slower representation using an array
1496         of lists.
1497 
1498         At the end we convert both representations into the same compressed
1499         form that will be used in regexec.c for matching with. The latter
1500         is a form that cannot be used to construct with but has memory
1501         properties similar to the list form and access properties similar
1502         to the table form making it both suitable for fast searches and
1503         small enough that its feasable to store for the duration of a program.
1504 
1505         See the comment in the code where the compressed table is produced
1506         inplace from the flat tabe representation for an explanation of how
1507         the compression works.
1508 
1509     */
1510 
1511 
1512     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1513         /*
1514             Second Pass -- Array Of Lists Representation
1515 
1516             Each state will be represented by a list of charid:state records
1517             (reg_trie_trans_le) the first such element holds the CUR and LEN
1518             points of the allocated array. (See defines above).
1519 
1520             We build the initial structure using the lists, and then convert
1521             it into the compressed table form which allows faster lookups
1522             (but cant be modified once converted).
1523         */
1524 
1525         STRLEN transcount = 1;
1526 
1527         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1528             "%*sCompiling trie using list compiler\n",
1529             (int)depth * 2 + 2, ""));
1530 
1531 	trie->states = (reg_trie_state *)
1532 	    PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1533 				  sizeof(reg_trie_state) );
1534         TRIE_LIST_NEW(1);
1535         next_alloc = 2;
1536 
1537         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1538 
1539 	    regnode * const noper = NEXTOPER( cur );
1540 	    U8 *uc           = (U8*)STRING( noper );
1541 	    const U8 * const e = uc + STR_LEN( noper );
1542 	    U32 state        = 1;         /* required init */
1543 	    U16 charid       = 0;         /* sanity init */
1544 	    U8 *scan         = (U8*)NULL; /* sanity init */
1545 	    STRLEN foldlen   = 0;         /* required init */
1546             U32 wordlen      = 0;         /* required init */
1547 	    U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1548 
1549             if (OP(noper) != NOTHING) {
1550                 for ( ; uc < e ; uc += len ) {
1551 
1552                     TRIE_READ_CHAR;
1553 
1554                     if ( uvc < 256 ) {
1555                         charid = trie->charmap[ uvc ];
1556 		    } else {
1557                         SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1558                         if ( !svpp ) {
1559                             charid = 0;
1560                         } else {
1561                             charid=(U16)SvIV( *svpp );
1562                         }
1563 		    }
1564                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1565                     if ( charid ) {
1566 
1567                         U16 check;
1568                         U32 newstate = 0;
1569 
1570                         charid--;
1571                         if ( !trie->states[ state ].trans.list ) {
1572                             TRIE_LIST_NEW( state );
1573 			}
1574                         for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1575                             if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1576                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1577                                 break;
1578                             }
1579                         }
1580                         if ( ! newstate ) {
1581                             newstate = next_alloc++;
1582                             TRIE_LIST_PUSH( state, charid, newstate );
1583                             transcount++;
1584                         }
1585                         state = newstate;
1586                     } else {
1587                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1588 		    }
1589 		}
1590 	    }
1591             TRIE_HANDLE_WORD(state);
1592 
1593         } /* end second pass */
1594 
1595         /* next alloc is the NEXT state to be allocated */
1596         trie->statecount = next_alloc;
1597         trie->states = (reg_trie_state *)
1598 	    PerlMemShared_realloc( trie->states,
1599 				   next_alloc
1600 				   * sizeof(reg_trie_state) );
1601 
1602         /* and now dump it out before we compress it */
1603         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1604 							 revcharmap, next_alloc,
1605 							 depth+1)
1606         );
1607 
1608         trie->trans = (reg_trie_trans *)
1609 	    PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1610         {
1611             U32 state;
1612             U32 tp = 0;
1613             U32 zp = 0;
1614 
1615 
1616             for( state=1 ; state < next_alloc ; state ++ ) {
1617                 U32 base=0;
1618 
1619                 /*
1620                 DEBUG_TRIE_COMPILE_MORE_r(
1621                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1622                 );
1623                 */
1624 
1625                 if (trie->states[state].trans.list) {
1626                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1627                     U16 maxid=minid;
1628 		    U16 idx;
1629 
1630                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1631 			const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1632 			if ( forid < minid ) {
1633 			    minid=forid;
1634 			} else if ( forid > maxid ) {
1635 			    maxid=forid;
1636 			}
1637                     }
1638                     if ( transcount < tp + maxid - minid + 1) {
1639                         transcount *= 2;
1640 			trie->trans = (reg_trie_trans *)
1641 			    PerlMemShared_realloc( trie->trans,
1642 						     transcount
1643 						     * sizeof(reg_trie_trans) );
1644                         Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1645                     }
1646                     base = trie->uniquecharcount + tp - minid;
1647                     if ( maxid == minid ) {
1648                         U32 set = 0;
1649                         for ( ; zp < tp ; zp++ ) {
1650                             if ( ! trie->trans[ zp ].next ) {
1651                                 base = trie->uniquecharcount + zp - minid;
1652                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1653                                 trie->trans[ zp ].check = state;
1654                                 set = 1;
1655                                 break;
1656                             }
1657                         }
1658                         if ( !set ) {
1659                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1660                             trie->trans[ tp ].check = state;
1661                             tp++;
1662                             zp = tp;
1663                         }
1664                     } else {
1665                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1666                             const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1667                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1668                             trie->trans[ tid ].check = state;
1669                         }
1670                         tp += ( maxid - minid + 1 );
1671                     }
1672                     Safefree(trie->states[ state ].trans.list);
1673                 }
1674                 /*
1675                 DEBUG_TRIE_COMPILE_MORE_r(
1676                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1677                 );
1678                 */
1679                 trie->states[ state ].trans.base=base;
1680             }
1681             trie->lasttrans = tp + 1;
1682         }
1683     } else {
1684         /*
1685            Second Pass -- Flat Table Representation.
1686 
1687            we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1688            We know that we will need Charcount+1 trans at most to store the data
1689            (one row per char at worst case) So we preallocate both structures
1690            assuming worst case.
1691 
1692            We then construct the trie using only the .next slots of the entry
1693            structs.
1694 
1695            We use the .check field of the first entry of the node  temporarily to
1696            make compression both faster and easier by keeping track of how many non
1697            zero fields are in the node.
1698 
1699            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1700            transition.
1701 
1702            There are two terms at use here: state as a TRIE_NODEIDX() which is a
1703            number representing the first entry of the node, and state as a
1704            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1705            TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1706            are 2 entrys per node. eg:
1707 
1708              A B       A B
1709           1. 2 4    1. 3 7
1710           2. 0 3    3. 0 5
1711           3. 0 0    5. 0 0
1712           4. 0 0    7. 0 0
1713 
1714            The table is internally in the right hand, idx form. However as we also
1715            have to deal with the states array which is indexed by nodenum we have to
1716            use TRIE_NODENUM() to convert.
1717 
1718         */
1719         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1720             "%*sCompiling trie using table compiler\n",
1721             (int)depth * 2 + 2, ""));
1722 
1723 	trie->trans = (reg_trie_trans *)
1724 	    PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1725 				  * trie->uniquecharcount + 1,
1726 				  sizeof(reg_trie_trans) );
1727         trie->states = (reg_trie_state *)
1728 	    PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1729 				  sizeof(reg_trie_state) );
1730         next_alloc = trie->uniquecharcount + 1;
1731 
1732 
1733         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1734 
1735 	    regnode * const noper   = NEXTOPER( cur );
1736 	    const U8 *uc     = (U8*)STRING( noper );
1737 	    const U8 * const e = uc + STR_LEN( noper );
1738 
1739             U32 state        = 1;         /* required init */
1740 
1741             U16 charid       = 0;         /* sanity init */
1742             U32 accept_state = 0;         /* sanity init */
1743             U8 *scan         = (U8*)NULL; /* sanity init */
1744 
1745             STRLEN foldlen   = 0;         /* required init */
1746             U32 wordlen      = 0;         /* required init */
1747             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1748 
1749             if ( OP(noper) != NOTHING ) {
1750                 for ( ; uc < e ; uc += len ) {
1751 
1752                     TRIE_READ_CHAR;
1753 
1754                     if ( uvc < 256 ) {
1755                         charid = trie->charmap[ uvc ];
1756                     } else {
1757                         SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1758                         charid = svpp ? (U16)SvIV(*svpp) : 0;
1759                     }
1760                     if ( charid ) {
1761                         charid--;
1762                         if ( !trie->trans[ state + charid ].next ) {
1763                             trie->trans[ state + charid ].next = next_alloc;
1764                             trie->trans[ state ].check++;
1765                             next_alloc += trie->uniquecharcount;
1766                         }
1767                         state = trie->trans[ state + charid ].next;
1768                     } else {
1769                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1770                     }
1771                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1772                 }
1773             }
1774             accept_state = TRIE_NODENUM( state );
1775             TRIE_HANDLE_WORD(accept_state);
1776 
1777         } /* end second pass */
1778 
1779         /* and now dump it out before we compress it */
1780         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1781 							  revcharmap,
1782 							  next_alloc, depth+1));
1783 
1784         {
1785         /*
1786            * Inplace compress the table.*
1787 
1788            For sparse data sets the table constructed by the trie algorithm will
1789            be mostly 0/FAIL transitions or to put it another way mostly empty.
1790            (Note that leaf nodes will not contain any transitions.)
1791 
1792            This algorithm compresses the tables by eliminating most such
1793            transitions, at the cost of a modest bit of extra work during lookup:
1794 
1795            - Each states[] entry contains a .base field which indicates the
1796            index in the state[] array wheres its transition data is stored.
1797 
1798            - If .base is 0 there are no  valid transitions from that node.
1799 
1800            - If .base is nonzero then charid is added to it to find an entry in
1801            the trans array.
1802 
1803            -If trans[states[state].base+charid].check!=state then the
1804            transition is taken to be a 0/Fail transition. Thus if there are fail
1805            transitions at the front of the node then the .base offset will point
1806            somewhere inside the previous nodes data (or maybe even into a node
1807            even earlier), but the .check field determines if the transition is
1808            valid.
1809 
1810            XXX - wrong maybe?
1811            The following process inplace converts the table to the compressed
1812            table: We first do not compress the root node 1,and mark its all its
1813            .check pointers as 1 and set its .base pointer as 1 as well. This
1814            allows to do a DFA construction from the compressed table later, and
1815            ensures that any .base pointers we calculate later are greater than
1816            0.
1817 
1818            - We set 'pos' to indicate the first entry of the second node.
1819 
1820            - We then iterate over the columns of the node, finding the first and
1821            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1822            and set the .check pointers accordingly, and advance pos
1823            appropriately and repreat for the next node. Note that when we copy
1824            the next pointers we have to convert them from the original
1825            NODEIDX form to NODENUM form as the former is not valid post
1826            compression.
1827 
1828            - If a node has no transitions used we mark its base as 0 and do not
1829            advance the pos pointer.
1830 
1831            - If a node only has one transition we use a second pointer into the
1832            structure to fill in allocated fail transitions from other states.
1833            This pointer is independent of the main pointer and scans forward
1834            looking for null transitions that are allocated to a state. When it
1835            finds one it writes the single transition into the "hole".  If the
1836            pointer doesnt find one the single transition is appended as normal.
1837 
1838            - Once compressed we can Renew/realloc the structures to release the
1839            excess space.
1840 
1841            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1842            specifically Fig 3.47 and the associated pseudocode.
1843 
1844            demq
1845         */
1846         const U32 laststate = TRIE_NODENUM( next_alloc );
1847 	U32 state, charid;
1848         U32 pos = 0, zp=0;
1849         trie->statecount = laststate;
1850 
1851         for ( state = 1 ; state < laststate ; state++ ) {
1852             U8 flag = 0;
1853 	    const U32 stateidx = TRIE_NODEIDX( state );
1854 	    const U32 o_used = trie->trans[ stateidx ].check;
1855 	    U32 used = trie->trans[ stateidx ].check;
1856             trie->trans[ stateidx ].check = 0;
1857 
1858             for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1859                 if ( flag || trie->trans[ stateidx + charid ].next ) {
1860                     if ( trie->trans[ stateidx + charid ].next ) {
1861                         if (o_used == 1) {
1862                             for ( ; zp < pos ; zp++ ) {
1863                                 if ( ! trie->trans[ zp ].next ) {
1864                                     break;
1865                                 }
1866                             }
1867                             trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1868                             trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1869                             trie->trans[ zp ].check = state;
1870                             if ( ++zp > pos ) pos = zp;
1871                             break;
1872                         }
1873                         used--;
1874                     }
1875                     if ( !flag ) {
1876                         flag = 1;
1877                         trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1878                     }
1879                     trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1880                     trie->trans[ pos ].check = state;
1881                     pos++;
1882                 }
1883             }
1884         }
1885         trie->lasttrans = pos + 1;
1886         trie->states = (reg_trie_state *)
1887 	    PerlMemShared_realloc( trie->states, laststate
1888 				   * sizeof(reg_trie_state) );
1889         DEBUG_TRIE_COMPILE_MORE_r(
1890                 PerlIO_printf( Perl_debug_log,
1891 		    "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1892 		    (int)depth * 2 + 2,"",
1893                     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1894 		    (IV)next_alloc,
1895 		    (IV)pos,
1896                     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1897             );
1898 
1899         } /* end table compress */
1900     }
1901     DEBUG_TRIE_COMPILE_MORE_r(
1902             PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1903                 (int)depth * 2 + 2, "",
1904                 (UV)trie->statecount,
1905                 (UV)trie->lasttrans)
1906     );
1907     /* resize the trans array to remove unused space */
1908     trie->trans = (reg_trie_trans *)
1909 	PerlMemShared_realloc( trie->trans, trie->lasttrans
1910 			       * sizeof(reg_trie_trans) );
1911 
1912     /* and now dump out the compressed format */
1913     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
1914 
1915     {   /* Modify the program and insert the new TRIE node*/
1916         U8 nodetype =(U8)(flags & 0xFF);
1917         char *str=NULL;
1918 
1919 #ifdef DEBUGGING
1920         regnode *optimize = NULL;
1921 #ifdef RE_TRACK_PATTERN_OFFSETS
1922 
1923         U32 mjd_offset = 0;
1924         U32 mjd_nodelen = 0;
1925 #endif /* RE_TRACK_PATTERN_OFFSETS */
1926 #endif /* DEBUGGING */
1927         /*
1928            This means we convert either the first branch or the first Exact,
1929            depending on whether the thing following (in 'last') is a branch
1930            or not and whther first is the startbranch (ie is it a sub part of
1931            the alternation or is it the whole thing.)
1932            Assuming its a sub part we conver the EXACT otherwise we convert
1933            the whole branch sequence, including the first.
1934          */
1935         /* Find the node we are going to overwrite */
1936         if ( first != startbranch || OP( last ) == BRANCH ) {
1937             /* branch sub-chain */
1938             NEXT_OFF( first ) = (U16)(last - first);
1939 #ifdef RE_TRACK_PATTERN_OFFSETS
1940             DEBUG_r({
1941                 mjd_offset= Node_Offset((convert));
1942                 mjd_nodelen= Node_Length((convert));
1943             });
1944 #endif
1945             /* whole branch chain */
1946         }
1947 #ifdef RE_TRACK_PATTERN_OFFSETS
1948         else {
1949             DEBUG_r({
1950                 const  regnode *nop = NEXTOPER( convert );
1951                 mjd_offset= Node_Offset((nop));
1952                 mjd_nodelen= Node_Length((nop));
1953             });
1954         }
1955         DEBUG_OPTIMISE_r(
1956             PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1957                 (int)depth * 2 + 2, "",
1958                 (UV)mjd_offset, (UV)mjd_nodelen)
1959         );
1960 #endif
1961         /* But first we check to see if there is a common prefix we can
1962            split out as an EXACT and put in front of the TRIE node.  */
1963         trie->startstate= 1;
1964         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
1965             U32 state;
1966             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
1967                 U32 ofs = 0;
1968                 I32 idx = -1;
1969                 U32 count = 0;
1970                 const U32 base = trie->states[ state ].trans.base;
1971 
1972                 if ( trie->states[state].wordnum )
1973                         count = 1;
1974 
1975                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1976                     if ( ( base + ofs >= trie->uniquecharcount ) &&
1977                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1978                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1979                     {
1980                         if ( ++count > 1 ) {
1981                             SV **tmp = av_fetch( revcharmap, ofs, 0);
1982 			    const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
1983                             if ( state == 1 ) break;
1984                             if ( count == 2 ) {
1985                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1986                                 DEBUG_OPTIMISE_r(
1987                                     PerlIO_printf(Perl_debug_log,
1988 					"%*sNew Start State=%"UVuf" Class: [",
1989                                         (int)depth * 2 + 2, "",
1990                                         (UV)state));
1991 				if (idx >= 0) {
1992 				    SV ** const tmp = av_fetch( revcharmap, idx, 0);
1993 				    const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
1994 
1995                                     TRIE_BITMAP_SET(trie,*ch);
1996                                     if ( folder )
1997                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
1998                                     DEBUG_OPTIMISE_r(
1999                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2000                                     );
2001 				}
2002 			    }
2003 			    TRIE_BITMAP_SET(trie,*ch);
2004 			    if ( folder )
2005 				TRIE_BITMAP_SET(trie,folder[ *ch ]);
2006 			    DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2007 			}
2008                         idx = ofs;
2009 		    }
2010                 }
2011                 if ( count == 1 ) {
2012                     SV **tmp = av_fetch( revcharmap, idx, 0);
2013                     STRLEN len;
2014                     char *ch = SvPV( *tmp, len );
2015                     DEBUG_OPTIMISE_r({
2016                         SV *sv=sv_newmortal();
2017                         PerlIO_printf( Perl_debug_log,
2018 			    "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2019                             (int)depth * 2 + 2, "",
2020                             (UV)state, (UV)idx,
2021                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2022 	                        PL_colors[0], PL_colors[1],
2023 	                        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2024 	                        PERL_PV_ESCAPE_FIRSTCHAR
2025                             )
2026                         );
2027                     });
2028                     if ( state==1 ) {
2029                         OP( convert ) = nodetype;
2030                         str=STRING(convert);
2031                         STR_LEN(convert)=0;
2032                     }
2033                     STR_LEN(convert) += len;
2034                     while (len--)
2035                         *str++ = *ch++;
2036 		} else {
2037 #ifdef DEBUGGING
2038 		    if (state>1)
2039 			DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2040 #endif
2041 		    break;
2042 		}
2043 	    }
2044             if (str) {
2045                 regnode *n = convert+NODE_SZ_STR(convert);
2046                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2047                 trie->startstate = state;
2048                 trie->minlen -= (state - 1);
2049                 trie->maxlen -= (state - 1);
2050 #ifdef DEBUGGING
2051                /* At least the UNICOS C compiler choked on this
2052                 * being argument to DEBUG_r(), so let's just have
2053                 * it right here. */
2054                if (
2055 #ifdef PERL_EXT_RE_BUILD
2056                    1
2057 #else
2058                    DEBUG_r_TEST
2059 #endif
2060                    ) {
2061                    regnode *fix = convert;
2062                    U32 word = trie->wordcount;
2063                    mjd_nodelen++;
2064                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2065                    while( ++fix < n ) {
2066                        Set_Node_Offset_Length(fix, 0, 0);
2067                    }
2068                    while (word--) {
2069                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2070                        if (tmp) {
2071                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2072                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2073                            else
2074                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2075                        }
2076                    }
2077                }
2078 #endif
2079                 if (trie->maxlen) {
2080                     convert = n;
2081 		} else {
2082                     NEXT_OFF(convert) = (U16)(tail - convert);
2083                     DEBUG_r(optimize= n);
2084                 }
2085             }
2086         }
2087         if (!jumper)
2088             jumper = last;
2089         if ( trie->maxlen ) {
2090 	    NEXT_OFF( convert ) = (U16)(tail - convert);
2091 	    ARG_SET( convert, data_slot );
2092 	    /* Store the offset to the first unabsorbed branch in
2093 	       jump[0], which is otherwise unused by the jump logic.
2094 	       We use this when dumping a trie and during optimisation. */
2095 	    if (trie->jump)
2096 	        trie->jump[0] = (U16)(nextbranch - convert);
2097 
2098             /* XXXX */
2099             if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
2100                  ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2101             {
2102                 OP( convert ) = TRIEC;
2103                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2104                 PerlMemShared_free(trie->bitmap);
2105                 trie->bitmap= NULL;
2106             } else
2107                 OP( convert ) = TRIE;
2108 
2109             /* store the type in the flags */
2110             convert->flags = nodetype;
2111             DEBUG_r({
2112             optimize = convert
2113                       + NODE_STEP_REGNODE
2114                       + regarglen[ OP( convert ) ];
2115             });
2116             /* XXX We really should free up the resource in trie now,
2117                    as we won't use them - (which resources?) dmq */
2118         }
2119         /* needed for dumping*/
2120         DEBUG_r(if (optimize) {
2121             regnode *opt = convert;
2122 
2123             while ( ++opt < optimize) {
2124                 Set_Node_Offset_Length(opt,0,0);
2125             }
2126             /*
2127                 Try to clean up some of the debris left after the
2128                 optimisation.
2129              */
2130             while( optimize < jumper ) {
2131                 mjd_nodelen += Node_Length((optimize));
2132                 OP( optimize ) = OPTIMIZED;
2133                 Set_Node_Offset_Length(optimize,0,0);
2134                 optimize++;
2135             }
2136             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2137         });
2138     } /* end node insert */
2139     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2140 #ifdef DEBUGGING
2141     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2142     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2143 #else
2144     SvREFCNT_dec(revcharmap);
2145 #endif
2146     return trie->jump
2147            ? MADE_JUMP_TRIE
2148            : trie->startstate>1
2149              ? MADE_EXACT_TRIE
2150              : MADE_TRIE;
2151 }
2152 
2153 STATIC void
2154 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2155 {
2156 /* The Trie is constructed and compressed now so we can build a fail array now if its needed
2157 
2158    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2159    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2160    ISBN 0-201-10088-6
2161 
2162    We find the fail state for each state in the trie, this state is the longest proper
2163    suffix of the current states 'word' that is also a proper prefix of another word in our
2164    trie. State 1 represents the word '' and is the thus the default fail state. This allows
2165    the DFA not to have to restart after its tried and failed a word at a given point, it
2166    simply continues as though it had been matching the other word in the first place.
2167    Consider
2168       'abcdgu'=~/abcdefg|cdgu/
2169    When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2170    fail, which would bring use to the state representing 'd' in the second word where we would
2171    try 'g' and succeed, prodceding to match 'cdgu'.
2172  */
2173  /* add a fail transition */
2174     const U32 trie_offset = ARG(source);
2175     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2176     U32 *q;
2177     const U32 ucharcount = trie->uniquecharcount;
2178     const U32 numstates = trie->statecount;
2179     const U32 ubound = trie->lasttrans + ucharcount;
2180     U32 q_read = 0;
2181     U32 q_write = 0;
2182     U32 charid;
2183     U32 base = trie->states[ 1 ].trans.base;
2184     U32 *fail;
2185     reg_ac_data *aho;
2186     const U32 data_slot = add_data( pRExC_state, 1, "T" );
2187     GET_RE_DEBUG_FLAGS_DECL;
2188 
2189     PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2190 #ifndef DEBUGGING
2191     PERL_UNUSED_ARG(depth);
2192 #endif
2193 
2194 
2195     ARG_SET( stclass, data_slot );
2196     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2197     RExC_rxi->data->data[ data_slot ] = (void*)aho;
2198     aho->trie=trie_offset;
2199     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2200     Copy( trie->states, aho->states, numstates, reg_trie_state );
2201     Newxz( q, numstates, U32);
2202     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2203     aho->refcount = 1;
2204     fail = aho->fail;
2205     /* initialize fail[0..1] to be 1 so that we always have
2206        a valid final fail state */
2207     fail[ 0 ] = fail[ 1 ] = 1;
2208 
2209     for ( charid = 0; charid < ucharcount ; charid++ ) {
2210 	const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2211 	if ( newstate ) {
2212             q[ q_write ] = newstate;
2213             /* set to point at the root */
2214             fail[ q[ q_write++ ] ]=1;
2215         }
2216     }
2217     while ( q_read < q_write) {
2218 	const U32 cur = q[ q_read++ % numstates ];
2219         base = trie->states[ cur ].trans.base;
2220 
2221         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2222 	    const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2223 	    if (ch_state) {
2224                 U32 fail_state = cur;
2225                 U32 fail_base;
2226                 do {
2227                     fail_state = fail[ fail_state ];
2228                     fail_base = aho->states[ fail_state ].trans.base;
2229                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2230 
2231                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2232                 fail[ ch_state ] = fail_state;
2233                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2234                 {
2235                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2236                 }
2237                 q[ q_write++ % numstates] = ch_state;
2238             }
2239         }
2240     }
2241     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2242        when we fail in state 1, this allows us to use the
2243        charclass scan to find a valid start char. This is based on the principle
2244        that theres a good chance the string being searched contains lots of stuff
2245        that cant be a start char.
2246      */
2247     fail[ 0 ] = fail[ 1 ] = 0;
2248     DEBUG_TRIE_COMPILE_r({
2249         PerlIO_printf(Perl_debug_log,
2250 		      "%*sStclass Failtable (%"UVuf" states): 0",
2251 		      (int)(depth * 2), "", (UV)numstates
2252         );
2253         for( q_read=1; q_read<numstates; q_read++ ) {
2254             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2255         }
2256         PerlIO_printf(Perl_debug_log, "\n");
2257     });
2258     Safefree(q);
2259     /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2260 }
2261 
2262 
2263 /*
2264  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2265  * These need to be revisited when a newer toolchain becomes available.
2266  */
2267 #if defined(__sparc64__) && defined(__GNUC__)
2268 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2269 #       undef  SPARC64_GCC_WORKAROUND
2270 #       define SPARC64_GCC_WORKAROUND 1
2271 #   endif
2272 #endif
2273 
2274 #define DEBUG_PEEP(str,scan,depth) \
2275     DEBUG_OPTIMISE_r({if (scan){ \
2276        SV * const mysv=sv_newmortal(); \
2277        regnode *Next = regnext(scan); \
2278        regprop(RExC_rx, mysv, scan); \
2279        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2280        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2281        Next ? (REG_NODE_NUM(Next)) : 0 ); \
2282    }});
2283 
2284 
2285 
2286 
2287 
2288 #define JOIN_EXACT(scan,min,flags) \
2289     if (PL_regkind[OP(scan)] == EXACT) \
2290         join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2291 
2292 STATIC U32
2293 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2294     /* Merge several consecutive EXACTish nodes into one. */
2295     regnode *n = regnext(scan);
2296     U32 stringok = 1;
2297     regnode *next = scan + NODE_SZ_STR(scan);
2298     U32 merged = 0;
2299     U32 stopnow = 0;
2300 #ifdef DEBUGGING
2301     regnode *stop = scan;
2302     GET_RE_DEBUG_FLAGS_DECL;
2303 #else
2304     PERL_UNUSED_ARG(depth);
2305 #endif
2306 
2307     PERL_ARGS_ASSERT_JOIN_EXACT;
2308 #ifndef EXPERIMENTAL_INPLACESCAN
2309     PERL_UNUSED_ARG(flags);
2310     PERL_UNUSED_ARG(val);
2311 #endif
2312     DEBUG_PEEP("join",scan,depth);
2313 
2314     /* Skip NOTHING, merge EXACT*. */
2315     while (n &&
2316            ( PL_regkind[OP(n)] == NOTHING ||
2317              (stringok && (OP(n) == OP(scan))))
2318            && NEXT_OFF(n)
2319            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2320 
2321         if (OP(n) == TAIL || n > next)
2322             stringok = 0;
2323         if (PL_regkind[OP(n)] == NOTHING) {
2324             DEBUG_PEEP("skip:",n,depth);
2325             NEXT_OFF(scan) += NEXT_OFF(n);
2326             next = n + NODE_STEP_REGNODE;
2327 #ifdef DEBUGGING
2328             if (stringok)
2329                 stop = n;
2330 #endif
2331             n = regnext(n);
2332         }
2333         else if (stringok) {
2334             const unsigned int oldl = STR_LEN(scan);
2335             regnode * const nnext = regnext(n);
2336 
2337             DEBUG_PEEP("merg",n,depth);
2338 
2339             merged++;
2340             if (oldl + STR_LEN(n) > U8_MAX)
2341                 break;
2342             NEXT_OFF(scan) += NEXT_OFF(n);
2343             STR_LEN(scan) += STR_LEN(n);
2344             next = n + NODE_SZ_STR(n);
2345             /* Now we can overwrite *n : */
2346             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2347 #ifdef DEBUGGING
2348             stop = next - 1;
2349 #endif
2350             n = nnext;
2351             if (stopnow) break;
2352         }
2353 
2354 #ifdef EXPERIMENTAL_INPLACESCAN
2355 	if (flags && !NEXT_OFF(n)) {
2356 	    DEBUG_PEEP("atch", val, depth);
2357 	    if (reg_off_by_arg[OP(n)]) {
2358 		ARG_SET(n, val - n);
2359 	    }
2360 	    else {
2361 		NEXT_OFF(n) = val - n;
2362 	    }
2363 	    stopnow = 1;
2364 	}
2365 #endif
2366     }
2367 
2368     if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2369     /*
2370     Two problematic code points in Unicode casefolding of EXACT nodes:
2371 
2372     U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2373     U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2374 
2375     which casefold to
2376 
2377     Unicode                      UTF-8
2378 
2379     U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
2380     U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
2381 
2382     This means that in case-insensitive matching (or "loose matching",
2383     as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2384     length of the above casefolded versions) can match a target string
2385     of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2386     This would rather mess up the minimum length computation.
2387 
2388     What we'll do is to look for the tail four bytes, and then peek
2389     at the preceding two bytes to see whether we need to decrease
2390     the minimum length by four (six minus two).
2391 
2392     Thanks to the design of UTF-8, there cannot be false matches:
2393     A sequence of valid UTF-8 bytes cannot be a subsequence of
2394     another valid sequence of UTF-8 bytes.
2395 
2396     */
2397          char * const s0 = STRING(scan), *s, *t;
2398          char * const s1 = s0 + STR_LEN(scan) - 1;
2399          char * const s2 = s1 - 4;
2400 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2401 	 const char t0[] = "\xaf\x49\xaf\x42";
2402 #else
2403          const char t0[] = "\xcc\x88\xcc\x81";
2404 #endif
2405          const char * const t1 = t0 + 3;
2406 
2407          for (s = s0 + 2;
2408               s < s2 && (t = ninstr(s, s1, t0, t1));
2409               s = t + 4) {
2410 #ifdef EBCDIC
2411 	      if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2412 		  ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2413 #else
2414               if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2415                   ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2416 #endif
2417                    *min -= 4;
2418          }
2419     }
2420 
2421 #ifdef DEBUGGING
2422     /* Allow dumping */
2423     n = scan + NODE_SZ_STR(scan);
2424     while (n <= stop) {
2425         if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2426             OP(n) = OPTIMIZED;
2427             NEXT_OFF(n) = 0;
2428         }
2429         n++;
2430     }
2431 #endif
2432     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2433     return stopnow;
2434 }
2435 
2436 /* REx optimizer.  Converts nodes into quickier variants "in place".
2437    Finds fixed substrings.  */
2438 
2439 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2440    to the position after last scanned or to NULL. */
2441 
2442 #define INIT_AND_WITHP \
2443     assert(!and_withp); \
2444     Newx(and_withp,1,struct regnode_charclass_class); \
2445     SAVEFREEPV(and_withp)
2446 
2447 /* this is a chain of data about sub patterns we are processing that
2448    need to be handled seperately/specially in study_chunk. Its so
2449    we can simulate recursion without losing state.  */
2450 struct scan_frame;
2451 typedef struct scan_frame {
2452     regnode *last;  /* last node to process in this frame */
2453     regnode *next;  /* next node to process when last is reached */
2454     struct scan_frame *prev; /*previous frame*/
2455     I32 stop; /* what stopparen do we use */
2456 } scan_frame;
2457 
2458 
2459 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2460 
2461 #define CASE_SYNST_FNC(nAmE)                                       \
2462 case nAmE:                                                         \
2463     if (flags & SCF_DO_STCLASS_AND) {                              \
2464 	    for (value = 0; value < 256; value++)                  \
2465 		if (!is_ ## nAmE ## _cp(value))                       \
2466 		    ANYOF_BITMAP_CLEAR(data->start_class, value);  \
2467     }                                                              \
2468     else {                                                         \
2469 	    for (value = 0; value < 256; value++)                  \
2470 		if (is_ ## nAmE ## _cp(value))                        \
2471 		    ANYOF_BITMAP_SET(data->start_class, value);	   \
2472     }                                                              \
2473     break;                                                         \
2474 case N ## nAmE:                                                    \
2475     if (flags & SCF_DO_STCLASS_AND) {                              \
2476 	    for (value = 0; value < 256; value++)                   \
2477 		if (is_ ## nAmE ## _cp(value))                         \
2478 		    ANYOF_BITMAP_CLEAR(data->start_class, value);   \
2479     }                                                               \
2480     else {                                                          \
2481 	    for (value = 0; value < 256; value++)                   \
2482 		if (!is_ ## nAmE ## _cp(value))                        \
2483 		    ANYOF_BITMAP_SET(data->start_class, value);	    \
2484     }                                                               \
2485     break
2486 
2487 
2488 
2489 STATIC I32
2490 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2491                         I32 *minlenp, I32 *deltap,
2492 			regnode *last,
2493 			scan_data_t *data,
2494 			I32 stopparen,
2495 			U8* recursed,
2496 			struct regnode_charclass_class *and_withp,
2497 			U32 flags, U32 depth)
2498 			/* scanp: Start here (read-write). */
2499 			/* deltap: Write maxlen-minlen here. */
2500 			/* last: Stop before this one. */
2501 			/* data: string data about the pattern */
2502 			/* stopparen: treat close N as END */
2503 			/* recursed: which subroutines have we recursed into */
2504 			/* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2505 {
2506     dVAR;
2507     I32 min = 0, pars = 0, code;
2508     regnode *scan = *scanp, *next;
2509     I32 delta = 0;
2510     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2511     int is_inf_internal = 0;		/* The studied chunk is infinite */
2512     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2513     scan_data_t data_fake;
2514     SV *re_trie_maxbuff = NULL;
2515     regnode *first_non_open = scan;
2516     I32 stopmin = I32_MAX;
2517     scan_frame *frame = NULL;
2518     GET_RE_DEBUG_FLAGS_DECL;
2519 
2520     PERL_ARGS_ASSERT_STUDY_CHUNK;
2521 
2522 #ifdef DEBUGGING
2523     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2524 #endif
2525 
2526     if ( depth == 0 ) {
2527         while (first_non_open && OP(first_non_open) == OPEN)
2528             first_non_open=regnext(first_non_open);
2529     }
2530 
2531 
2532   fake_study_recurse:
2533     while ( scan && OP(scan) != END && scan < last ){
2534 	/* Peephole optimizer: */
2535 	DEBUG_STUDYDATA("Peep:", data,depth);
2536 	DEBUG_PEEP("Peep",scan,depth);
2537         JOIN_EXACT(scan,&min,0);
2538 
2539 	/* Follow the next-chain of the current node and optimize
2540 	   away all the NOTHINGs from it.  */
2541 	if (OP(scan) != CURLYX) {
2542 	    const int max = (reg_off_by_arg[OP(scan)]
2543 		       ? I32_MAX
2544 		       /* I32 may be smaller than U16 on CRAYs! */
2545 		       : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2546 	    int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2547 	    int noff;
2548 	    regnode *n = scan;
2549 
2550 	    /* Skip NOTHING and LONGJMP. */
2551 	    while ((n = regnext(n))
2552 		   && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2553 		       || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2554 		   && off + noff < max)
2555 		off += noff;
2556 	    if (reg_off_by_arg[OP(scan)])
2557 		ARG(scan) = off;
2558 	    else
2559 		NEXT_OFF(scan) = off;
2560 	}
2561 
2562 
2563 
2564 	/* The principal pseudo-switch.  Cannot be a switch, since we
2565 	   look into several different things.  */
2566 	if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2567 		   || OP(scan) == IFTHEN) {
2568 	    next = regnext(scan);
2569 	    code = OP(scan);
2570 	    /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2571 
2572 	    if (OP(next) == code || code == IFTHEN) {
2573 	        /* NOTE - There is similar code to this block below for handling
2574 	           TRIE nodes on a re-study.  If you change stuff here check there
2575 	           too. */
2576 		I32 max1 = 0, min1 = I32_MAX, num = 0;
2577 		struct regnode_charclass_class accum;
2578 		regnode * const startbranch=scan;
2579 
2580 		if (flags & SCF_DO_SUBSTR)
2581 		    SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2582 		if (flags & SCF_DO_STCLASS)
2583 		    cl_init_zero(pRExC_state, &accum);
2584 
2585 		while (OP(scan) == code) {
2586 		    I32 deltanext, minnext, f = 0, fake;
2587 		    struct regnode_charclass_class this_class;
2588 
2589 		    num++;
2590 		    data_fake.flags = 0;
2591 		    if (data) {
2592 			data_fake.whilem_c = data->whilem_c;
2593 			data_fake.last_closep = data->last_closep;
2594 		    }
2595 		    else
2596 			data_fake.last_closep = &fake;
2597 
2598 		    data_fake.pos_delta = delta;
2599 		    next = regnext(scan);
2600 		    scan = NEXTOPER(scan);
2601 		    if (code != BRANCH)
2602 			scan = NEXTOPER(scan);
2603 		    if (flags & SCF_DO_STCLASS) {
2604 			cl_init(pRExC_state, &this_class);
2605 			data_fake.start_class = &this_class;
2606 			f = SCF_DO_STCLASS_AND;
2607 		    }
2608 		    if (flags & SCF_WHILEM_VISITED_POS)
2609 			f |= SCF_WHILEM_VISITED_POS;
2610 
2611 		    /* we suppose the run is continuous, last=next...*/
2612 		    minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2613 					  next, &data_fake,
2614 					  stopparen, recursed, NULL, f,depth+1);
2615 		    if (min1 > minnext)
2616 			min1 = minnext;
2617 		    if (max1 < minnext + deltanext)
2618 			max1 = minnext + deltanext;
2619 		    if (deltanext == I32_MAX)
2620 			is_inf = is_inf_internal = 1;
2621 		    scan = next;
2622 		    if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2623 			pars++;
2624 	            if (data_fake.flags & SCF_SEEN_ACCEPT) {
2625 	                if ( stopmin > minnext)
2626 	                    stopmin = min + min1;
2627 	                flags &= ~SCF_DO_SUBSTR;
2628 	                if (data)
2629 	                    data->flags |= SCF_SEEN_ACCEPT;
2630 	            }
2631 		    if (data) {
2632 			if (data_fake.flags & SF_HAS_EVAL)
2633 			    data->flags |= SF_HAS_EVAL;
2634 			data->whilem_c = data_fake.whilem_c;
2635 		    }
2636 		    if (flags & SCF_DO_STCLASS)
2637 			cl_or(pRExC_state, &accum, &this_class);
2638 		}
2639 		if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2640 		    min1 = 0;
2641 		if (flags & SCF_DO_SUBSTR) {
2642 		    data->pos_min += min1;
2643 		    data->pos_delta += max1 - min1;
2644 		    if (max1 != min1 || is_inf)
2645 			data->longest = &(data->longest_float);
2646 		}
2647 		min += min1;
2648 		delta += max1 - min1;
2649 		if (flags & SCF_DO_STCLASS_OR) {
2650 		    cl_or(pRExC_state, data->start_class, &accum);
2651 		    if (min1) {
2652 			cl_and(data->start_class, and_withp);
2653 			flags &= ~SCF_DO_STCLASS;
2654 		    }
2655 		}
2656 		else if (flags & SCF_DO_STCLASS_AND) {
2657 		    if (min1) {
2658 			cl_and(data->start_class, &accum);
2659 			flags &= ~SCF_DO_STCLASS;
2660 		    }
2661 		    else {
2662 			/* Switch to OR mode: cache the old value of
2663 			 * data->start_class */
2664 			INIT_AND_WITHP;
2665 			StructCopy(data->start_class, and_withp,
2666 				   struct regnode_charclass_class);
2667 			flags &= ~SCF_DO_STCLASS_AND;
2668 			StructCopy(&accum, data->start_class,
2669 				   struct regnode_charclass_class);
2670 			flags |= SCF_DO_STCLASS_OR;
2671 			data->start_class->flags |= ANYOF_EOS;
2672 		    }
2673 		}
2674 
2675                 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2676 		/* demq.
2677 
2678 		   Assuming this was/is a branch we are dealing with: 'scan' now
2679 		   points at the item that follows the branch sequence, whatever
2680 		   it is. We now start at the beginning of the sequence and look
2681 		   for subsequences of
2682 
2683 		   BRANCH->EXACT=>x1
2684 		   BRANCH->EXACT=>x2
2685 		   tail
2686 
2687 		   which would be constructed from a pattern like /A|LIST|OF|WORDS/
2688 
2689 		   If we can find such a subseqence we need to turn the first
2690 		   element into a trie and then add the subsequent branch exact
2691 		   strings to the trie.
2692 
2693 		   We have two cases
2694 
2695 		     1. patterns where the whole set of branch can be converted.
2696 
2697 		     2. patterns where only a subset can be converted.
2698 
2699 		   In case 1 we can replace the whole set with a single regop
2700 		   for the trie. In case 2 we need to keep the start and end
2701 		   branchs so
2702 
2703 		     'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2704 		     becomes BRANCH TRIE; BRANCH X;
2705 
2706 		  There is an additional case, that being where there is a
2707 		  common prefix, which gets split out into an EXACT like node
2708 		  preceding the TRIE node.
2709 
2710 		  If x(1..n)==tail then we can do a simple trie, if not we make
2711 		  a "jump" trie, such that when we match the appropriate word
2712 		  we "jump" to the appopriate tail node. Essentailly we turn
2713 		  a nested if into a case structure of sorts.
2714 
2715 		*/
2716 
2717 		    int made=0;
2718 		    if (!re_trie_maxbuff) {
2719 			re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2720 			if (!SvIOK(re_trie_maxbuff))
2721 			    sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2722 		    }
2723                     if ( SvIV(re_trie_maxbuff)>=0  ) {
2724                         regnode *cur;
2725                         regnode *first = (regnode *)NULL;
2726                         regnode *last = (regnode *)NULL;
2727                         regnode *tail = scan;
2728                         U8 optype = 0;
2729                         U32 count=0;
2730 
2731 #ifdef DEBUGGING
2732                         SV * const mysv = sv_newmortal();       /* for dumping */
2733 #endif
2734                         /* var tail is used because there may be a TAIL
2735                            regop in the way. Ie, the exacts will point to the
2736                            thing following the TAIL, but the last branch will
2737                            point at the TAIL. So we advance tail. If we
2738                            have nested (?:) we may have to move through several
2739                            tails.
2740                          */
2741 
2742                         while ( OP( tail ) == TAIL ) {
2743                             /* this is the TAIL generated by (?:) */
2744                             tail = regnext( tail );
2745                         }
2746 
2747 
2748                         DEBUG_OPTIMISE_r({
2749                             regprop(RExC_rx, mysv, tail );
2750                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2751                                 (int)depth * 2 + 2, "",
2752                                 "Looking for TRIE'able sequences. Tail node is: ",
2753                                 SvPV_nolen_const( mysv )
2754                             );
2755                         });
2756 
2757                         /*
2758 
2759                            step through the branches, cur represents each
2760                            branch, noper is the first thing to be matched
2761                            as part of that branch and noper_next is the
2762                            regnext() of that node. if noper is an EXACT
2763                            and noper_next is the same as scan (our current
2764                            position in the regex) then the EXACT branch is
2765                            a possible optimization target. Once we have
2766                            two or more consequetive such branches we can
2767                            create a trie of the EXACT's contents and stich
2768                            it in place. If the sequence represents all of
2769                            the branches we eliminate the whole thing and
2770                            replace it with a single TRIE. If it is a
2771                            subsequence then we need to stitch it in. This
2772                            means the first branch has to remain, and needs
2773                            to be repointed at the item on the branch chain
2774                            following the last branch optimized. This could
2775                            be either a BRANCH, in which case the
2776                            subsequence is internal, or it could be the
2777                            item following the branch sequence in which
2778                            case the subsequence is at the end.
2779 
2780                         */
2781 
2782                         /* dont use tail as the end marker for this traverse */
2783                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2784                             regnode * const noper = NEXTOPER( cur );
2785 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2786                             regnode * const noper_next = regnext( noper );
2787 #endif
2788 
2789                             DEBUG_OPTIMISE_r({
2790                                 regprop(RExC_rx, mysv, cur);
2791                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2792                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2793 
2794                                 regprop(RExC_rx, mysv, noper);
2795                                 PerlIO_printf( Perl_debug_log, " -> %s",
2796                                     SvPV_nolen_const(mysv));
2797 
2798                                 if ( noper_next ) {
2799                                   regprop(RExC_rx, mysv, noper_next );
2800                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2801                                     SvPV_nolen_const(mysv));
2802                                 }
2803                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2804                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2805                             });
2806                             if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2807                                          : PL_regkind[ OP( noper ) ] == EXACT )
2808                                   || OP(noper) == NOTHING )
2809 #ifdef NOJUMPTRIE
2810                                   && noper_next == tail
2811 #endif
2812                                   && count < U16_MAX)
2813                             {
2814                                 count++;
2815                                 if ( !first || optype == NOTHING ) {
2816                                     if (!first) first = cur;
2817                                     optype = OP( noper );
2818                                 } else {
2819                                     last = cur;
2820                                 }
2821                             } else {
2822 /*
2823     Currently we do not believe that the trie logic can
2824     handle case insensitive matching properly when the
2825     pattern is not unicode (thus forcing unicode semantics).
2826 
2827     If/when this is fixed the following define can be swapped
2828     in below to fully enable trie logic.
2829 
2830 #define TRIE_TYPE_IS_SAFE 1
2831 
2832 */
2833 #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2834 
2835                                 if ( last && TRIE_TYPE_IS_SAFE ) {
2836                                     make_trie( pRExC_state,
2837                                             startbranch, first, cur, tail, count,
2838                                             optype, depth+1 );
2839                                 }
2840                                 if ( PL_regkind[ OP( noper ) ] == EXACT
2841 #ifdef NOJUMPTRIE
2842                                      && noper_next == tail
2843 #endif
2844                                 ){
2845                                     count = 1;
2846                                     first = cur;
2847                                     optype = OP( noper );
2848                                 } else {
2849                                     count = 0;
2850                                     first = NULL;
2851                                     optype = 0;
2852                                 }
2853                                 last = NULL;
2854                             }
2855                         }
2856                         DEBUG_OPTIMISE_r({
2857                             regprop(RExC_rx, mysv, cur);
2858                             PerlIO_printf( Perl_debug_log,
2859                               "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2860                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2861 
2862                         });
2863 
2864                         if ( last && TRIE_TYPE_IS_SAFE ) {
2865                             made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2866 #ifdef TRIE_STUDY_OPT
2867                             if ( ((made == MADE_EXACT_TRIE &&
2868                                  startbranch == first)
2869                                  || ( first_non_open == first )) &&
2870                                  depth==0 ) {
2871                                 flags |= SCF_TRIE_RESTUDY;
2872                                 if ( startbranch == first
2873                                      && scan == tail )
2874                                 {
2875                                     RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2876                                 }
2877                             }
2878 #endif
2879                         }
2880                     }
2881 
2882                 } /* do trie */
2883 
2884 	    }
2885 	    else if ( code == BRANCHJ ) {  /* single branch is optimized. */
2886 		scan = NEXTOPER(NEXTOPER(scan));
2887 	    } else			/* single branch is optimized. */
2888 		scan = NEXTOPER(scan);
2889 	    continue;
2890 	} else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2891 	    scan_frame *newframe = NULL;
2892 	    I32 paren;
2893 	    regnode *start;
2894 	    regnode *end;
2895 
2896 	    if (OP(scan) != SUSPEND) {
2897 	    /* set the pointer */
2898 	        if (OP(scan) == GOSUB) {
2899 	            paren = ARG(scan);
2900 	            RExC_recurse[ARG2L(scan)] = scan;
2901                     start = RExC_open_parens[paren-1];
2902                     end   = RExC_close_parens[paren-1];
2903                 } else {
2904                     paren = 0;
2905                     start = RExC_rxi->program + 1;
2906                     end   = RExC_opend;
2907                 }
2908                 if (!recursed) {
2909                     Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2910                     SAVEFREEPV(recursed);
2911                 }
2912                 if (!PAREN_TEST(recursed,paren+1)) {
2913 		    PAREN_SET(recursed,paren+1);
2914                     Newx(newframe,1,scan_frame);
2915                 } else {
2916                     if (flags & SCF_DO_SUBSTR) {
2917                         SCAN_COMMIT(pRExC_state,data,minlenp);
2918                         data->longest = &(data->longest_float);
2919                     }
2920                     is_inf = is_inf_internal = 1;
2921                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2922                         cl_anything(pRExC_state, data->start_class);
2923                     flags &= ~SCF_DO_STCLASS;
2924 	        }
2925             } else {
2926 	        Newx(newframe,1,scan_frame);
2927 	        paren = stopparen;
2928 	        start = scan+2;
2929 	        end = regnext(scan);
2930 	    }
2931 	    if (newframe) {
2932                 assert(start);
2933                 assert(end);
2934 	        SAVEFREEPV(newframe);
2935 	        newframe->next = regnext(scan);
2936 	        newframe->last = last;
2937 	        newframe->stop = stopparen;
2938 	        newframe->prev = frame;
2939 
2940 	        frame = newframe;
2941 	        scan =  start;
2942 	        stopparen = paren;
2943 	        last = end;
2944 
2945 	        continue;
2946 	    }
2947 	}
2948 	else if (OP(scan) == EXACT) {
2949 	    I32 l = STR_LEN(scan);
2950 	    UV uc;
2951 	    if (UTF) {
2952 		const U8 * const s = (U8*)STRING(scan);
2953 		l = utf8_length(s, s + l);
2954 		uc = utf8_to_uvchr(s, NULL);
2955 	    } else {
2956 		uc = *((U8*)STRING(scan));
2957 	    }
2958 	    min += l;
2959 	    if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2960 		/* The code below prefers earlier match for fixed
2961 		   offset, later match for variable offset.  */
2962 		if (data->last_end == -1) { /* Update the start info. */
2963 		    data->last_start_min = data->pos_min;
2964  		    data->last_start_max = is_inf
2965  			? I32_MAX : data->pos_min + data->pos_delta;
2966 		}
2967 		sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2968 		if (UTF)
2969 		    SvUTF8_on(data->last_found);
2970 		{
2971 		    SV * const sv = data->last_found;
2972 		    MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2973 			mg_find(sv, PERL_MAGIC_utf8) : NULL;
2974 		    if (mg && mg->mg_len >= 0)
2975 			mg->mg_len += utf8_length((U8*)STRING(scan),
2976 						  (U8*)STRING(scan)+STR_LEN(scan));
2977 		}
2978 		data->last_end = data->pos_min + l;
2979 		data->pos_min += l; /* As in the first entry. */
2980 		data->flags &= ~SF_BEFORE_EOL;
2981 	    }
2982 	    if (flags & SCF_DO_STCLASS_AND) {
2983 		/* Check whether it is compatible with what we know already! */
2984 		int compat = 1;
2985 
2986 		if (uc >= 0x100 ||
2987 		    (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2988 		    && !ANYOF_BITMAP_TEST(data->start_class, uc)
2989 		    && (!(data->start_class->flags & ANYOF_FOLD)
2990 			|| !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2991                     )
2992 		    compat = 0;
2993 		ANYOF_CLASS_ZERO(data->start_class);
2994 		ANYOF_BITMAP_ZERO(data->start_class);
2995 		if (compat)
2996 		    ANYOF_BITMAP_SET(data->start_class, uc);
2997 		data->start_class->flags &= ~ANYOF_EOS;
2998 		if (uc < 0x100)
2999 		  data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3000 	    }
3001 	    else if (flags & SCF_DO_STCLASS_OR) {
3002 		/* false positive possible if the class is case-folded */
3003 		if (uc < 0x100)
3004 		    ANYOF_BITMAP_SET(data->start_class, uc);
3005 		else
3006 		    data->start_class->flags |= ANYOF_UNICODE_ALL;
3007 		data->start_class->flags &= ~ANYOF_EOS;
3008 		cl_and(data->start_class, and_withp);
3009 	    }
3010 	    flags &= ~SCF_DO_STCLASS;
3011 	}
3012 	else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3013 	    I32 l = STR_LEN(scan);
3014 	    UV uc = *((U8*)STRING(scan));
3015 
3016 	    /* Search for fixed substrings supports EXACT only. */
3017 	    if (flags & SCF_DO_SUBSTR) {
3018 		assert(data);
3019 		SCAN_COMMIT(pRExC_state, data, minlenp);
3020 	    }
3021 	    if (UTF) {
3022 		const U8 * const s = (U8 *)STRING(scan);
3023 		l = utf8_length(s, s + l);
3024 		uc = utf8_to_uvchr(s, NULL);
3025 	    }
3026 	    min += l;
3027 	    if (flags & SCF_DO_SUBSTR)
3028 		data->pos_min += l;
3029 	    if (flags & SCF_DO_STCLASS_AND) {
3030 		/* Check whether it is compatible with what we know already! */
3031 		int compat = 1;
3032 
3033 		if (uc >= 0x100 ||
3034 		    (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3035 		    && !ANYOF_BITMAP_TEST(data->start_class, uc)
3036 		     && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3037 		    compat = 0;
3038 		ANYOF_CLASS_ZERO(data->start_class);
3039 		ANYOF_BITMAP_ZERO(data->start_class);
3040 		if (compat) {
3041 		    ANYOF_BITMAP_SET(data->start_class, uc);
3042 		    data->start_class->flags &= ~ANYOF_EOS;
3043 		    data->start_class->flags |= ANYOF_FOLD;
3044 		    if (OP(scan) == EXACTFL)
3045 			data->start_class->flags |= ANYOF_LOCALE;
3046 		}
3047 	    }
3048 	    else if (flags & SCF_DO_STCLASS_OR) {
3049 		if (data->start_class->flags & ANYOF_FOLD) {
3050 		    /* false positive possible if the class is case-folded.
3051 		       Assume that the locale settings are the same... */
3052 		    if (uc < 0x100)
3053 			ANYOF_BITMAP_SET(data->start_class, uc);
3054 		    data->start_class->flags &= ~ANYOF_EOS;
3055 		}
3056 		cl_and(data->start_class, and_withp);
3057 	    }
3058 	    flags &= ~SCF_DO_STCLASS;
3059 	}
3060 	else if (strchr((const char*)PL_varies,OP(scan))) {
3061 	    I32 mincount, maxcount, minnext, deltanext, fl = 0;
3062 	    I32 f = flags, pos_before = 0;
3063 	    regnode * const oscan = scan;
3064 	    struct regnode_charclass_class this_class;
3065 	    struct regnode_charclass_class *oclass = NULL;
3066 	    I32 next_is_eval = 0;
3067 
3068 	    switch (PL_regkind[OP(scan)]) {
3069 	    case WHILEM:		/* End of (?:...)* . */
3070 		scan = NEXTOPER(scan);
3071 		goto finish;
3072 	    case PLUS:
3073 		if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3074 		    next = NEXTOPER(scan);
3075 		    if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3076 			mincount = 1;
3077 			maxcount = REG_INFTY;
3078 			next = regnext(scan);
3079 			scan = NEXTOPER(scan);
3080 			goto do_curly;
3081 		    }
3082 		}
3083 		if (flags & SCF_DO_SUBSTR)
3084 		    data->pos_min++;
3085 		min++;
3086 		/* Fall through. */
3087 	    case STAR:
3088 		if (flags & SCF_DO_STCLASS) {
3089 		    mincount = 0;
3090 		    maxcount = REG_INFTY;
3091 		    next = regnext(scan);
3092 		    scan = NEXTOPER(scan);
3093 		    goto do_curly;
3094 		}
3095 		is_inf = is_inf_internal = 1;
3096 		scan = regnext(scan);
3097 		if (flags & SCF_DO_SUBSTR) {
3098 		    SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3099 		    data->longest = &(data->longest_float);
3100 		}
3101 		goto optimize_curly_tail;
3102 	    case CURLY:
3103 	        if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3104 	            && (scan->flags == stopparen))
3105 		{
3106 		    mincount = 1;
3107 		    maxcount = 1;
3108 		} else {
3109 		    mincount = ARG1(scan);
3110 		    maxcount = ARG2(scan);
3111 		}
3112 		next = regnext(scan);
3113 		if (OP(scan) == CURLYX) {
3114 		    I32 lp = (data ? *(data->last_closep) : 0);
3115 		    scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3116 		}
3117 		scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3118 		next_is_eval = (OP(scan) == EVAL);
3119 	      do_curly:
3120 		if (flags & SCF_DO_SUBSTR) {
3121 		    if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3122 		    pos_before = data->pos_min;
3123 		}
3124 		if (data) {
3125 		    fl = data->flags;
3126 		    data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3127 		    if (is_inf)
3128 			data->flags |= SF_IS_INF;
3129 		}
3130 		if (flags & SCF_DO_STCLASS) {
3131 		    cl_init(pRExC_state, &this_class);
3132 		    oclass = data->start_class;
3133 		    data->start_class = &this_class;
3134 		    f |= SCF_DO_STCLASS_AND;
3135 		    f &= ~SCF_DO_STCLASS_OR;
3136 		}
3137 		/* These are the cases when once a subexpression
3138 		   fails at a particular position, it cannot succeed
3139 		   even after backtracking at the enclosing scope.
3140 
3141 		   XXXX what if minimal match and we are at the
3142 		        initial run of {n,m}? */
3143 		if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3144 		    f &= ~SCF_WHILEM_VISITED_POS;
3145 
3146 		/* This will finish on WHILEM, setting scan, or on NULL: */
3147 		minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3148 		                      last, data, stopparen, recursed, NULL,
3149 				      (mincount == 0
3150 					? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3151 
3152 		if (flags & SCF_DO_STCLASS)
3153 		    data->start_class = oclass;
3154 		if (mincount == 0 || minnext == 0) {
3155 		    if (flags & SCF_DO_STCLASS_OR) {
3156 			cl_or(pRExC_state, data->start_class, &this_class);
3157 		    }
3158 		    else if (flags & SCF_DO_STCLASS_AND) {
3159 			/* Switch to OR mode: cache the old value of
3160 			 * data->start_class */
3161 			INIT_AND_WITHP;
3162 			StructCopy(data->start_class, and_withp,
3163 				   struct regnode_charclass_class);
3164 			flags &= ~SCF_DO_STCLASS_AND;
3165 			StructCopy(&this_class, data->start_class,
3166 				   struct regnode_charclass_class);
3167 			flags |= SCF_DO_STCLASS_OR;
3168 			data->start_class->flags |= ANYOF_EOS;
3169 		    }
3170 		} else {		/* Non-zero len */
3171 		    if (flags & SCF_DO_STCLASS_OR) {
3172 			cl_or(pRExC_state, data->start_class, &this_class);
3173 			cl_and(data->start_class, and_withp);
3174 		    }
3175 		    else if (flags & SCF_DO_STCLASS_AND)
3176 			cl_and(data->start_class, &this_class);
3177 		    flags &= ~SCF_DO_STCLASS;
3178 		}
3179 		if (!scan) 		/* It was not CURLYX, but CURLY. */
3180 		    scan = next;
3181 		if ( /* ? quantifier ok, except for (?{ ... }) */
3182 		    (next_is_eval || !(mincount == 0 && maxcount == 1))
3183 		    && (minnext == 0) && (deltanext == 0)
3184 		    && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3185 		    && maxcount <= REG_INFTY/3 /* Complement check for big count */
3186 		    && ckWARN(WARN_REGEXP))
3187 		{
3188 		    vWARN(RExC_parse,
3189 			  "Quantifier unexpected on zero-length expression");
3190 		}
3191 
3192 		min += minnext * mincount;
3193 		is_inf_internal |= ((maxcount == REG_INFTY
3194 				     && (minnext + deltanext) > 0)
3195 				    || deltanext == I32_MAX);
3196 		is_inf |= is_inf_internal;
3197 		delta += (minnext + deltanext) * maxcount - minnext * mincount;
3198 
3199 		/* Try powerful optimization CURLYX => CURLYN. */
3200 		if (  OP(oscan) == CURLYX && data
3201 		      && data->flags & SF_IN_PAR
3202 		      && !(data->flags & SF_HAS_EVAL)
3203 		      && !deltanext && minnext == 1 ) {
3204 		    /* Try to optimize to CURLYN.  */
3205 		    regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3206 		    regnode * const nxt1 = nxt;
3207 #ifdef DEBUGGING
3208 		    regnode *nxt2;
3209 #endif
3210 
3211 		    /* Skip open. */
3212 		    nxt = regnext(nxt);
3213 		    if (!strchr((const char*)PL_simple,OP(nxt))
3214 			&& !(PL_regkind[OP(nxt)] == EXACT
3215 			     && STR_LEN(nxt) == 1))
3216 			goto nogo;
3217 #ifdef DEBUGGING
3218 		    nxt2 = nxt;
3219 #endif
3220 		    nxt = regnext(nxt);
3221 		    if (OP(nxt) != CLOSE)
3222 			goto nogo;
3223 		    if (RExC_open_parens) {
3224 			RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3225 			RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3226 		    }
3227 		    /* Now we know that nxt2 is the only contents: */
3228 		    oscan->flags = (U8)ARG(nxt);
3229 		    OP(oscan) = CURLYN;
3230 		    OP(nxt1) = NOTHING;	/* was OPEN. */
3231 
3232 #ifdef DEBUGGING
3233 		    OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3234 		    NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3235 		    NEXT_OFF(nxt2) = 0;	/* just for consistancy with CURLY. */
3236 		    OP(nxt) = OPTIMIZED;	/* was CLOSE. */
3237 		    OP(nxt + 1) = OPTIMIZED; /* was count. */
3238 		    NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
3239 #endif
3240 		}
3241 	      nogo:
3242 
3243 		/* Try optimization CURLYX => CURLYM. */
3244 		if (  OP(oscan) == CURLYX && data
3245 		      && !(data->flags & SF_HAS_PAR)
3246 		      && !(data->flags & SF_HAS_EVAL)
3247 		      && !deltanext	/* atom is fixed width */
3248 		      && minnext != 0	/* CURLYM can't handle zero width */
3249 		) {
3250 		    /* XXXX How to optimize if data == 0? */
3251 		    /* Optimize to a simpler form.  */
3252 		    regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3253 		    regnode *nxt2;
3254 
3255 		    OP(oscan) = CURLYM;
3256 		    while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3257 			    && (OP(nxt2) != WHILEM))
3258 			nxt = nxt2;
3259 		    OP(nxt2)  = SUCCEED; /* Whas WHILEM */
3260 		    /* Need to optimize away parenths. */
3261 		    if (data->flags & SF_IN_PAR) {
3262 			/* Set the parenth number.  */
3263 			regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3264 
3265 			if (OP(nxt) != CLOSE)
3266 			    FAIL("Panic opt close");
3267 			oscan->flags = (U8)ARG(nxt);
3268 			if (RExC_open_parens) {
3269 			    RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3270 			    RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3271 			}
3272 			OP(nxt1) = OPTIMIZED;	/* was OPEN. */
3273 			OP(nxt) = OPTIMIZED;	/* was CLOSE. */
3274 
3275 #ifdef DEBUGGING
3276 			OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3277 			OP(nxt + 1) = OPTIMIZED; /* was count. */
3278 			NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3279 			NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
3280 #endif
3281 #if 0
3282 			while ( nxt1 && (OP(nxt1) != WHILEM)) {
3283 			    regnode *nnxt = regnext(nxt1);
3284 
3285 			    if (nnxt == nxt) {
3286 				if (reg_off_by_arg[OP(nxt1)])
3287 				    ARG_SET(nxt1, nxt2 - nxt1);
3288 				else if (nxt2 - nxt1 < U16_MAX)
3289 				    NEXT_OFF(nxt1) = nxt2 - nxt1;
3290 				else
3291 				    OP(nxt) = NOTHING;	/* Cannot beautify */
3292 			    }
3293 			    nxt1 = nnxt;
3294 			}
3295 #endif
3296 			/* Optimize again: */
3297 			study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3298 				    NULL, stopparen, recursed, NULL, 0,depth+1);
3299 		    }
3300 		    else
3301 			oscan->flags = 0;
3302 		}
3303 		else if ((OP(oscan) == CURLYX)
3304 			 && (flags & SCF_WHILEM_VISITED_POS)
3305 			 /* See the comment on a similar expression above.
3306 			    However, this time it not a subexpression
3307 			    we care about, but the expression itself. */
3308 			 && (maxcount == REG_INFTY)
3309 			 && data && ++data->whilem_c < 16) {
3310 		    /* This stays as CURLYX, we can put the count/of pair. */
3311 		    /* Find WHILEM (as in regexec.c) */
3312 		    regnode *nxt = oscan + NEXT_OFF(oscan);
3313 
3314 		    if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3315 			nxt += ARG(nxt);
3316 		    PREVOPER(nxt)->flags = (U8)(data->whilem_c
3317 			| (RExC_whilem_seen << 4)); /* On WHILEM */
3318 		}
3319 		if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3320 		    pars++;
3321 		if (flags & SCF_DO_SUBSTR) {
3322 		    SV *last_str = NULL;
3323 		    int counted = mincount != 0;
3324 
3325 		    if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3326 #if defined(SPARC64_GCC_WORKAROUND)
3327 			I32 b = 0;
3328 			STRLEN l = 0;
3329 			const char *s = NULL;
3330 			I32 old = 0;
3331 
3332 			if (pos_before >= data->last_start_min)
3333 			    b = pos_before;
3334 			else
3335 			    b = data->last_start_min;
3336 
3337 			l = 0;
3338 			s = SvPV_const(data->last_found, l);
3339 			old = b - data->last_start_min;
3340 
3341 #else
3342 			I32 b = pos_before >= data->last_start_min
3343 			    ? pos_before : data->last_start_min;
3344 			STRLEN l;
3345 			const char * const s = SvPV_const(data->last_found, l);
3346 			I32 old = b - data->last_start_min;
3347 #endif
3348 
3349 			if (UTF)
3350 			    old = utf8_hop((U8*)s, old) - (U8*)s;
3351 
3352 			l -= old;
3353 			/* Get the added string: */
3354 			last_str = newSVpvn_utf8(s  + old, l, UTF);
3355 			if (deltanext == 0 && pos_before == b) {
3356 			    /* What was added is a constant string */
3357 			    if (mincount > 1) {
3358 				SvGROW(last_str, (mincount * l) + 1);
3359 				repeatcpy(SvPVX(last_str) + l,
3360 					  SvPVX_const(last_str), l, mincount - 1);
3361 				SvCUR_set(last_str, SvCUR(last_str) * mincount);
3362 				/* Add additional parts. */
3363 				SvCUR_set(data->last_found,
3364 					  SvCUR(data->last_found) - l);
3365 				sv_catsv(data->last_found, last_str);
3366 				{
3367 				    SV * sv = data->last_found;
3368 				    MAGIC *mg =
3369 					SvUTF8(sv) && SvMAGICAL(sv) ?
3370 					mg_find(sv, PERL_MAGIC_utf8) : NULL;
3371 				    if (mg && mg->mg_len >= 0)
3372 					mg->mg_len += CHR_SVLEN(last_str) - l;
3373 				}
3374 				data->last_end += l * (mincount - 1);
3375 			    }
3376 			} else {
3377 			    /* start offset must point into the last copy */
3378 			    data->last_start_min += minnext * (mincount - 1);
3379 			    data->last_start_max += is_inf ? I32_MAX
3380 				: (maxcount - 1) * (minnext + data->pos_delta);
3381 			}
3382 		    }
3383 		    /* It is counted once already... */
3384 		    data->pos_min += minnext * (mincount - counted);
3385 		    data->pos_delta += - counted * deltanext +
3386 			(minnext + deltanext) * maxcount - minnext * mincount;
3387 		    if (mincount != maxcount) {
3388 			 /* Cannot extend fixed substrings found inside
3389 			    the group.  */
3390 			SCAN_COMMIT(pRExC_state,data,minlenp);
3391 			if (mincount && last_str) {
3392 			    SV * const sv = data->last_found;
3393 			    MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3394 				mg_find(sv, PERL_MAGIC_utf8) : NULL;
3395 
3396 			    if (mg)
3397 				mg->mg_len = -1;
3398 			    sv_setsv(sv, last_str);
3399 			    data->last_end = data->pos_min;
3400 			    data->last_start_min =
3401 				data->pos_min - CHR_SVLEN(last_str);
3402 			    data->last_start_max = is_inf
3403 				? I32_MAX
3404 				: data->pos_min + data->pos_delta
3405 				- CHR_SVLEN(last_str);
3406 			}
3407 			data->longest = &(data->longest_float);
3408 		    }
3409 		    SvREFCNT_dec(last_str);
3410 		}
3411 		if (data && (fl & SF_HAS_EVAL))
3412 		    data->flags |= SF_HAS_EVAL;
3413 	      optimize_curly_tail:
3414 		if (OP(oscan) != CURLYX) {
3415 		    while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3416 			   && NEXT_OFF(next))
3417 			NEXT_OFF(oscan) += NEXT_OFF(next);
3418 		}
3419 		continue;
3420 	    default:			/* REF and CLUMP only? */
3421 		if (flags & SCF_DO_SUBSTR) {
3422 		    SCAN_COMMIT(pRExC_state,data,minlenp);	/* Cannot expect anything... */
3423 		    data->longest = &(data->longest_float);
3424 		}
3425 		is_inf = is_inf_internal = 1;
3426 		if (flags & SCF_DO_STCLASS_OR)
3427 		    cl_anything(pRExC_state, data->start_class);
3428 		flags &= ~SCF_DO_STCLASS;
3429 		break;
3430 	    }
3431 	}
3432 	else if (OP(scan) == LNBREAK) {
3433 	    if (flags & SCF_DO_STCLASS) {
3434 		int value = 0;
3435 		data->start_class->flags &= ~ANYOF_EOS;	/* No match on empty */
3436     	        if (flags & SCF_DO_STCLASS_AND) {
3437                     for (value = 0; value < 256; value++)
3438                         if (!is_VERTWS_cp(value))
3439                             ANYOF_BITMAP_CLEAR(data->start_class, value);
3440                 }
3441                 else {
3442                     for (value = 0; value < 256; value++)
3443                         if (is_VERTWS_cp(value))
3444                             ANYOF_BITMAP_SET(data->start_class, value);
3445                 }
3446                 if (flags & SCF_DO_STCLASS_OR)
3447 		    cl_and(data->start_class, and_withp);
3448 		flags &= ~SCF_DO_STCLASS;
3449             }
3450 	    min += 1;
3451 	    delta += 1;
3452             if (flags & SCF_DO_SUBSTR) {
3453     	        SCAN_COMMIT(pRExC_state,data,minlenp);	/* Cannot expect anything... */
3454     	        data->pos_min += 1;
3455 	        data->pos_delta += 1;
3456 		data->longest = &(data->longest_float);
3457     	    }
3458 
3459 	}
3460 	else if (OP(scan) == FOLDCHAR) {
3461 	    int d = ARG(scan)==0xDF ? 1 : 2;
3462 	    flags &= ~SCF_DO_STCLASS;
3463             min += 1;
3464             delta += d;
3465             if (flags & SCF_DO_SUBSTR) {
3466 	        SCAN_COMMIT(pRExC_state,data,minlenp);	/* Cannot expect anything... */
3467 	        data->pos_min += 1;
3468 	        data->pos_delta += d;
3469 		data->longest = &(data->longest_float);
3470 	    }
3471 	}
3472 	else if (strchr((const char*)PL_simple,OP(scan))) {
3473 	    int value = 0;
3474 
3475 	    if (flags & SCF_DO_SUBSTR) {
3476 		SCAN_COMMIT(pRExC_state,data,minlenp);
3477 		data->pos_min++;
3478 	    }
3479 	    min++;
3480 	    if (flags & SCF_DO_STCLASS) {
3481 		data->start_class->flags &= ~ANYOF_EOS;	/* No match on empty */
3482 
3483 		/* Some of the logic below assumes that switching
3484 		   locale on will only add false positives. */
3485 		switch (PL_regkind[OP(scan)]) {
3486 		case SANY:
3487 		default:
3488 		  do_default:
3489 		    /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3490 		    if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3491 			cl_anything(pRExC_state, data->start_class);
3492 		    break;
3493 		case REG_ANY:
3494 		    if (OP(scan) == SANY)
3495 			goto do_default;
3496 		    if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3497 			value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3498 				 || (data->start_class->flags & ANYOF_CLASS));
3499 			cl_anything(pRExC_state, data->start_class);
3500 		    }
3501 		    if (flags & SCF_DO_STCLASS_AND || !value)
3502 			ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3503 		    break;
3504 		case ANYOF:
3505 		    if (flags & SCF_DO_STCLASS_AND)
3506 			cl_and(data->start_class,
3507 			       (struct regnode_charclass_class*)scan);
3508 		    else
3509 			cl_or(pRExC_state, data->start_class,
3510 			      (struct regnode_charclass_class*)scan);
3511 		    break;
3512 		case ALNUM:
3513 		    if (flags & SCF_DO_STCLASS_AND) {
3514 			if (!(data->start_class->flags & ANYOF_LOCALE)) {
3515 			    ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3516 			    for (value = 0; value < 256; value++)
3517 				if (!isALNUM(value))
3518 				    ANYOF_BITMAP_CLEAR(data->start_class, value);
3519 			}
3520 		    }
3521 		    else {
3522 			if (data->start_class->flags & ANYOF_LOCALE)
3523 			    ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3524 			else {
3525 			    for (value = 0; value < 256; value++)
3526 				if (isALNUM(value))
3527 				    ANYOF_BITMAP_SET(data->start_class, value);
3528 			}
3529 		    }
3530 		    break;
3531 		case ALNUML:
3532 		    if (flags & SCF_DO_STCLASS_AND) {
3533 			if (data->start_class->flags & ANYOF_LOCALE)
3534 			    ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3535 		    }
3536 		    else {
3537 			ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3538 			data->start_class->flags |= ANYOF_LOCALE;
3539 		    }
3540 		    break;
3541 		case NALNUM:
3542 		    if (flags & SCF_DO_STCLASS_AND) {
3543 			if (!(data->start_class->flags & ANYOF_LOCALE)) {
3544 			    ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3545 			    for (value = 0; value < 256; value++)
3546 				if (isALNUM(value))
3547 				    ANYOF_BITMAP_CLEAR(data->start_class, value);
3548 			}
3549 		    }
3550 		    else {
3551 			if (data->start_class->flags & ANYOF_LOCALE)
3552 			    ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3553 			else {
3554 			    for (value = 0; value < 256; value++)
3555 				if (!isALNUM(value))
3556 				    ANYOF_BITMAP_SET(data->start_class, value);
3557 			}
3558 		    }
3559 		    break;
3560 		case NALNUML:
3561 		    if (flags & SCF_DO_STCLASS_AND) {
3562 			if (data->start_class->flags & ANYOF_LOCALE)
3563 			    ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3564 		    }
3565 		    else {
3566 			data->start_class->flags |= ANYOF_LOCALE;
3567 			ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3568 		    }
3569 		    break;
3570 		case SPACE:
3571 		    if (flags & SCF_DO_STCLASS_AND) {
3572 			if (!(data->start_class->flags & ANYOF_LOCALE)) {
3573 			    ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3574 			    for (value = 0; value < 256; value++)
3575 				if (!isSPACE(value))
3576 				    ANYOF_BITMAP_CLEAR(data->start_class, value);
3577 			}
3578 		    }
3579 		    else {
3580 			if (data->start_class->flags & ANYOF_LOCALE)
3581 			    ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3582 			else {
3583 			    for (value = 0; value < 256; value++)
3584 				if (isSPACE(value))
3585 				    ANYOF_BITMAP_SET(data->start_class, value);
3586 			}
3587 		    }
3588 		    break;
3589 		case SPACEL:
3590 		    if (flags & SCF_DO_STCLASS_AND) {
3591 			if (data->start_class->flags & ANYOF_LOCALE)
3592 			    ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3593 		    }
3594 		    else {
3595 			data->start_class->flags |= ANYOF_LOCALE;
3596 			ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3597 		    }
3598 		    break;
3599 		case NSPACE:
3600 		    if (flags & SCF_DO_STCLASS_AND) {
3601 			if (!(data->start_class->flags & ANYOF_LOCALE)) {
3602 			    ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3603 			    for (value = 0; value < 256; value++)
3604 				if (isSPACE(value))
3605 				    ANYOF_BITMAP_CLEAR(data->start_class, value);
3606 			}
3607 		    }
3608 		    else {
3609 			if (data->start_class->flags & ANYOF_LOCALE)
3610 			    ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3611 			else {
3612 			    for (value = 0; value < 256; value++)
3613 				if (!isSPACE(value))
3614 				    ANYOF_BITMAP_SET(data->start_class, value);
3615 			}
3616 		    }
3617 		    break;
3618 		case NSPACEL:
3619 		    if (flags & SCF_DO_STCLASS_AND) {
3620 			if (data->start_class->flags & ANYOF_LOCALE) {
3621 			    ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3622 			    for (value = 0; value < 256; value++)
3623 				if (!isSPACE(value))
3624 				    ANYOF_BITMAP_CLEAR(data->start_class, value);
3625 			}
3626 		    }
3627 		    else {
3628 			data->start_class->flags |= ANYOF_LOCALE;
3629 			ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3630 		    }
3631 		    break;
3632 		case DIGIT:
3633 		    if (flags & SCF_DO_STCLASS_AND) {
3634 			ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3635 			for (value = 0; value < 256; value++)
3636 			    if (!isDIGIT(value))
3637 				ANYOF_BITMAP_CLEAR(data->start_class, value);
3638 		    }
3639 		    else {
3640 			if (data->start_class->flags & ANYOF_LOCALE)
3641 			    ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3642 			else {
3643 			    for (value = 0; value < 256; value++)
3644 				if (isDIGIT(value))
3645 				    ANYOF_BITMAP_SET(data->start_class, value);
3646 			}
3647 		    }
3648 		    break;
3649 		case NDIGIT:
3650 		    if (flags & SCF_DO_STCLASS_AND) {
3651 			ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3652 			for (value = 0; value < 256; value++)
3653 			    if (isDIGIT(value))
3654 				ANYOF_BITMAP_CLEAR(data->start_class, value);
3655 		    }
3656 		    else {
3657 			if (data->start_class->flags & ANYOF_LOCALE)
3658 			    ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3659 			else {
3660 			    for (value = 0; value < 256; value++)
3661 				if (!isDIGIT(value))
3662 				    ANYOF_BITMAP_SET(data->start_class, value);
3663 			}
3664 		    }
3665 		    break;
3666 		CASE_SYNST_FNC(VERTWS);
3667 		CASE_SYNST_FNC(HORIZWS);
3668 
3669 		}
3670 		if (flags & SCF_DO_STCLASS_OR)
3671 		    cl_and(data->start_class, and_withp);
3672 		flags &= ~SCF_DO_STCLASS;
3673 	    }
3674 	}
3675 	else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3676 	    data->flags |= (OP(scan) == MEOL
3677 			    ? SF_BEFORE_MEOL
3678 			    : SF_BEFORE_SEOL);
3679 	}
3680 	else if (  PL_regkind[OP(scan)] == BRANCHJ
3681 		 /* Lookbehind, or need to calculate parens/evals/stclass: */
3682 		   && (scan->flags || data || (flags & SCF_DO_STCLASS))
3683 		   && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3684             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3685                 || OP(scan) == UNLESSM )
3686             {
3687                 /* Negative Lookahead/lookbehind
3688                    In this case we can't do fixed string optimisation.
3689                 */
3690 
3691                 I32 deltanext, minnext, fake = 0;
3692                 regnode *nscan;
3693                 struct regnode_charclass_class intrnl;
3694                 int f = 0;
3695 
3696                 data_fake.flags = 0;
3697                 if (data) {
3698                     data_fake.whilem_c = data->whilem_c;
3699                     data_fake.last_closep = data->last_closep;
3700 		}
3701                 else
3702                     data_fake.last_closep = &fake;
3703 		data_fake.pos_delta = delta;
3704                 if ( flags & SCF_DO_STCLASS && !scan->flags
3705                      && OP(scan) == IFMATCH ) { /* Lookahead */
3706                     cl_init(pRExC_state, &intrnl);
3707                     data_fake.start_class = &intrnl;
3708                     f |= SCF_DO_STCLASS_AND;
3709 		}
3710                 if (flags & SCF_WHILEM_VISITED_POS)
3711                     f |= SCF_WHILEM_VISITED_POS;
3712                 next = regnext(scan);
3713                 nscan = NEXTOPER(NEXTOPER(scan));
3714                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3715                     last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3716                 if (scan->flags) {
3717                     if (deltanext) {
3718 			FAIL("Variable length lookbehind not implemented");
3719                     }
3720                     else if (minnext > (I32)U8_MAX) {
3721 			FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3722                     }
3723                     scan->flags = (U8)minnext;
3724                 }
3725                 if (data) {
3726                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3727                         pars++;
3728                     if (data_fake.flags & SF_HAS_EVAL)
3729                         data->flags |= SF_HAS_EVAL;
3730                     data->whilem_c = data_fake.whilem_c;
3731                 }
3732                 if (f & SCF_DO_STCLASS_AND) {
3733 		    if (flags & SCF_DO_STCLASS_OR) {
3734 			/* OR before, AND after: ideally we would recurse with
3735 			 * data_fake to get the AND applied by study of the
3736 			 * remainder of the pattern, and then derecurse;
3737 			 * *** HACK *** for now just treat as "no information".
3738 			 * See [perl #56690].
3739 			 */
3740 			cl_init(pRExC_state, data->start_class);
3741 		    }  else {
3742 			/* AND before and after: combine and continue */
3743 			const int was = (data->start_class->flags & ANYOF_EOS);
3744 
3745 			cl_and(data->start_class, &intrnl);
3746 			if (was)
3747 			    data->start_class->flags |= ANYOF_EOS;
3748 		    }
3749                 }
3750 	    }
3751 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3752             else {
3753                 /* Positive Lookahead/lookbehind
3754                    In this case we can do fixed string optimisation,
3755                    but we must be careful about it. Note in the case of
3756                    lookbehind the positions will be offset by the minimum
3757                    length of the pattern, something we won't know about
3758                    until after the recurse.
3759                 */
3760                 I32 deltanext, fake = 0;
3761                 regnode *nscan;
3762                 struct regnode_charclass_class intrnl;
3763                 int f = 0;
3764                 /* We use SAVEFREEPV so that when the full compile
3765                     is finished perl will clean up the allocated
3766                     minlens when its all done. This was we don't
3767                     have to worry about freeing them when we know
3768                     they wont be used, which would be a pain.
3769                  */
3770                 I32 *minnextp;
3771                 Newx( minnextp, 1, I32 );
3772                 SAVEFREEPV(minnextp);
3773 
3774                 if (data) {
3775                     StructCopy(data, &data_fake, scan_data_t);
3776                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3777                         f |= SCF_DO_SUBSTR;
3778                         if (scan->flags)
3779                             SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
3780                         data_fake.last_found=newSVsv(data->last_found);
3781                     }
3782                 }
3783                 else
3784                     data_fake.last_closep = &fake;
3785                 data_fake.flags = 0;
3786 		data_fake.pos_delta = delta;
3787                 if (is_inf)
3788 	            data_fake.flags |= SF_IS_INF;
3789                 if ( flags & SCF_DO_STCLASS && !scan->flags
3790                      && OP(scan) == IFMATCH ) { /* Lookahead */
3791                     cl_init(pRExC_state, &intrnl);
3792                     data_fake.start_class = &intrnl;
3793                     f |= SCF_DO_STCLASS_AND;
3794                 }
3795                 if (flags & SCF_WHILEM_VISITED_POS)
3796                     f |= SCF_WHILEM_VISITED_POS;
3797                 next = regnext(scan);
3798                 nscan = NEXTOPER(NEXTOPER(scan));
3799 
3800                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3801                     last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3802                 if (scan->flags) {
3803                     if (deltanext) {
3804 			FAIL("Variable length lookbehind not implemented");
3805                     }
3806                     else if (*minnextp > (I32)U8_MAX) {
3807 			FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3808                     }
3809                     scan->flags = (U8)*minnextp;
3810                 }
3811 
3812                 *minnextp += min;
3813 
3814                 if (f & SCF_DO_STCLASS_AND) {
3815                     const int was = (data->start_class->flags & ANYOF_EOS);
3816 
3817                     cl_and(data->start_class, &intrnl);
3818                     if (was)
3819                         data->start_class->flags |= ANYOF_EOS;
3820                 }
3821                 if (data) {
3822                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3823                         pars++;
3824                     if (data_fake.flags & SF_HAS_EVAL)
3825                         data->flags |= SF_HAS_EVAL;
3826                     data->whilem_c = data_fake.whilem_c;
3827                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3828                         if (RExC_rx->minlen<*minnextp)
3829                             RExC_rx->minlen=*minnextp;
3830                         SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
3831                         SvREFCNT_dec(data_fake.last_found);
3832 
3833                         if ( data_fake.minlen_fixed != minlenp )
3834                         {
3835                             data->offset_fixed= data_fake.offset_fixed;
3836                             data->minlen_fixed= data_fake.minlen_fixed;
3837                             data->lookbehind_fixed+= scan->flags;
3838                         }
3839                         if ( data_fake.minlen_float != minlenp )
3840                         {
3841                             data->minlen_float= data_fake.minlen_float;
3842                             data->offset_float_min=data_fake.offset_float_min;
3843                             data->offset_float_max=data_fake.offset_float_max;
3844                             data->lookbehind_float+= scan->flags;
3845                         }
3846                     }
3847                 }
3848 
3849 
3850 	    }
3851 #endif
3852 	}
3853 	else if (OP(scan) == OPEN) {
3854 	    if (stopparen != (I32)ARG(scan))
3855 	        pars++;
3856 	}
3857 	else if (OP(scan) == CLOSE) {
3858 	    if (stopparen == (I32)ARG(scan)) {
3859 	        break;
3860 	    }
3861 	    if ((I32)ARG(scan) == is_par) {
3862 		next = regnext(scan);
3863 
3864 		if ( next && (OP(next) != WHILEM) && next < last)
3865 		    is_par = 0;		/* Disable optimization */
3866 	    }
3867 	    if (data)
3868 		*(data->last_closep) = ARG(scan);
3869 	}
3870 	else if (OP(scan) == EVAL) {
3871 		if (data)
3872 		    data->flags |= SF_HAS_EVAL;
3873 	}
3874 	else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3875 	    if (flags & SCF_DO_SUBSTR) {
3876 		SCAN_COMMIT(pRExC_state,data,minlenp);
3877 		flags &= ~SCF_DO_SUBSTR;
3878 	    }
3879 	    if (data && OP(scan)==ACCEPT) {
3880 	        data->flags |= SCF_SEEN_ACCEPT;
3881 	        if (stopmin > min)
3882 	            stopmin = min;
3883 	    }
3884 	}
3885 	else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3886 	{
3887 		if (flags & SCF_DO_SUBSTR) {
3888 		    SCAN_COMMIT(pRExC_state,data,minlenp);
3889 		    data->longest = &(data->longest_float);
3890 		}
3891 		is_inf = is_inf_internal = 1;
3892 		if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3893 		    cl_anything(pRExC_state, data->start_class);
3894 		flags &= ~SCF_DO_STCLASS;
3895 	}
3896 	else if (OP(scan) == GPOS) {
3897 	    if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
3898 	        !(delta || is_inf || (data && data->pos_delta)))
3899 	    {
3900 	        if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3901 		    RExC_rx->extflags |= RXf_ANCH_GPOS;
3902 	        if (RExC_rx->gofs < (U32)min)
3903 		    RExC_rx->gofs = min;
3904             } else {
3905                 RExC_rx->extflags |= RXf_GPOS_FLOAT;
3906                 RExC_rx->gofs = 0;
3907             }
3908 	}
3909 #ifdef TRIE_STUDY_OPT
3910 #ifdef FULL_TRIE_STUDY
3911         else if (PL_regkind[OP(scan)] == TRIE) {
3912             /* NOTE - There is similar code to this block above for handling
3913                BRANCH nodes on the initial study.  If you change stuff here
3914                check there too. */
3915             regnode *trie_node= scan;
3916             regnode *tail= regnext(scan);
3917             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
3918             I32 max1 = 0, min1 = I32_MAX;
3919             struct regnode_charclass_class accum;
3920 
3921             if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3922                 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
3923             if (flags & SCF_DO_STCLASS)
3924                 cl_init_zero(pRExC_state, &accum);
3925 
3926             if (!trie->jump) {
3927                 min1= trie->minlen;
3928                 max1= trie->maxlen;
3929             } else {
3930                 const regnode *nextbranch= NULL;
3931                 U32 word;
3932 
3933                 for ( word=1 ; word <= trie->wordcount ; word++)
3934                 {
3935                     I32 deltanext=0, minnext=0, f = 0, fake;
3936                     struct regnode_charclass_class this_class;
3937 
3938                     data_fake.flags = 0;
3939                     if (data) {
3940                         data_fake.whilem_c = data->whilem_c;
3941                         data_fake.last_closep = data->last_closep;
3942                     }
3943                     else
3944                         data_fake.last_closep = &fake;
3945 		    data_fake.pos_delta = delta;
3946                     if (flags & SCF_DO_STCLASS) {
3947                         cl_init(pRExC_state, &this_class);
3948                         data_fake.start_class = &this_class;
3949                         f = SCF_DO_STCLASS_AND;
3950                     }
3951                     if (flags & SCF_WHILEM_VISITED_POS)
3952                         f |= SCF_WHILEM_VISITED_POS;
3953 
3954                     if (trie->jump[word]) {
3955                         if (!nextbranch)
3956                             nextbranch = trie_node + trie->jump[0];
3957                         scan= trie_node + trie->jump[word];
3958                         /* We go from the jump point to the branch that follows
3959                            it. Note this means we need the vestigal unused branches
3960                            even though they arent otherwise used.
3961                          */
3962                         minnext = study_chunk(pRExC_state, &scan, minlenp,
3963                             &deltanext, (regnode *)nextbranch, &data_fake,
3964                             stopparen, recursed, NULL, f,depth+1);
3965                     }
3966                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3967                         nextbranch= regnext((regnode*)nextbranch);
3968 
3969                     if (min1 > (I32)(minnext + trie->minlen))
3970                         min1 = minnext + trie->minlen;
3971                     if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3972                         max1 = minnext + deltanext + trie->maxlen;
3973                     if (deltanext == I32_MAX)
3974                         is_inf = is_inf_internal = 1;
3975 
3976                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3977                         pars++;
3978                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
3979                         if ( stopmin > min + min1)
3980 	                    stopmin = min + min1;
3981 	                flags &= ~SCF_DO_SUBSTR;
3982 	                if (data)
3983 	                    data->flags |= SCF_SEEN_ACCEPT;
3984 	            }
3985                     if (data) {
3986                         if (data_fake.flags & SF_HAS_EVAL)
3987                             data->flags |= SF_HAS_EVAL;
3988                         data->whilem_c = data_fake.whilem_c;
3989                     }
3990                     if (flags & SCF_DO_STCLASS)
3991                         cl_or(pRExC_state, &accum, &this_class);
3992                 }
3993             }
3994             if (flags & SCF_DO_SUBSTR) {
3995                 data->pos_min += min1;
3996                 data->pos_delta += max1 - min1;
3997                 if (max1 != min1 || is_inf)
3998                     data->longest = &(data->longest_float);
3999             }
4000             min += min1;
4001             delta += max1 - min1;
4002             if (flags & SCF_DO_STCLASS_OR) {
4003                 cl_or(pRExC_state, data->start_class, &accum);
4004                 if (min1) {
4005                     cl_and(data->start_class, and_withp);
4006                     flags &= ~SCF_DO_STCLASS;
4007                 }
4008             }
4009             else if (flags & SCF_DO_STCLASS_AND) {
4010                 if (min1) {
4011                     cl_and(data->start_class, &accum);
4012                     flags &= ~SCF_DO_STCLASS;
4013                 }
4014                 else {
4015                     /* Switch to OR mode: cache the old value of
4016                      * data->start_class */
4017 		    INIT_AND_WITHP;
4018                     StructCopy(data->start_class, and_withp,
4019                                struct regnode_charclass_class);
4020                     flags &= ~SCF_DO_STCLASS_AND;
4021                     StructCopy(&accum, data->start_class,
4022                                struct regnode_charclass_class);
4023                     flags |= SCF_DO_STCLASS_OR;
4024                     data->start_class->flags |= ANYOF_EOS;
4025                 }
4026             }
4027             scan= tail;
4028             continue;
4029         }
4030 #else
4031 	else if (PL_regkind[OP(scan)] == TRIE) {
4032 	    reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4033 	    U8*bang=NULL;
4034 
4035 	    min += trie->minlen;
4036 	    delta += (trie->maxlen - trie->minlen);
4037 	    flags &= ~SCF_DO_STCLASS; /* xxx */
4038             if (flags & SCF_DO_SUBSTR) {
4039     	        SCAN_COMMIT(pRExC_state,data,minlenp);	/* Cannot expect anything... */
4040     	        data->pos_min += trie->minlen;
4041     	        data->pos_delta += (trie->maxlen - trie->minlen);
4042 		if (trie->maxlen != trie->minlen)
4043 		    data->longest = &(data->longest_float);
4044     	    }
4045     	    if (trie->jump) /* no more substrings -- for now /grr*/
4046     	        flags &= ~SCF_DO_SUBSTR;
4047 	}
4048 #endif /* old or new */
4049 #endif /* TRIE_STUDY_OPT */
4050 
4051 	/* Else: zero-length, ignore. */
4052 	scan = regnext(scan);
4053     }
4054     if (frame) {
4055         last = frame->last;
4056         scan = frame->next;
4057         stopparen = frame->stop;
4058         frame = frame->prev;
4059         goto fake_study_recurse;
4060     }
4061 
4062   finish:
4063     assert(!frame);
4064     DEBUG_STUDYDATA("pre-fin:",data,depth);
4065 
4066     *scanp = scan;
4067     *deltap = is_inf_internal ? I32_MAX : delta;
4068     if (flags & SCF_DO_SUBSTR && is_inf)
4069 	data->pos_delta = I32_MAX - data->pos_min;
4070     if (is_par > (I32)U8_MAX)
4071 	is_par = 0;
4072     if (is_par && pars==1 && data) {
4073 	data->flags |= SF_IN_PAR;
4074 	data->flags &= ~SF_HAS_PAR;
4075     }
4076     else if (pars && data) {
4077 	data->flags |= SF_HAS_PAR;
4078 	data->flags &= ~SF_IN_PAR;
4079     }
4080     if (flags & SCF_DO_STCLASS_OR)
4081 	cl_and(data->start_class, and_withp);
4082     if (flags & SCF_TRIE_RESTUDY)
4083         data->flags |= 	SCF_TRIE_RESTUDY;
4084 
4085     DEBUG_STUDYDATA("post-fin:",data,depth);
4086 
4087     return min < stopmin ? min : stopmin;
4088 }
4089 
4090 STATIC U32
4091 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4092 {
4093     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4094 
4095     PERL_ARGS_ASSERT_ADD_DATA;
4096 
4097     Renewc(RExC_rxi->data,
4098 	   sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4099 	   char, struct reg_data);
4100     if(count)
4101 	Renew(RExC_rxi->data->what, count + n, U8);
4102     else
4103 	Newx(RExC_rxi->data->what, n, U8);
4104     RExC_rxi->data->count = count + n;
4105     Copy(s, RExC_rxi->data->what + count, n, U8);
4106     return count;
4107 }
4108 
4109 /*XXX: todo make this not included in a non debugging perl */
4110 #ifndef PERL_IN_XSUB_RE
4111 void
4112 Perl_reginitcolors(pTHX)
4113 {
4114     dVAR;
4115     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4116     if (s) {
4117 	char *t = savepv(s);
4118 	int i = 0;
4119 	PL_colors[0] = t;
4120 	while (++i < 6) {
4121 	    t = strchr(t, '\t');
4122 	    if (t) {
4123 		*t = '\0';
4124 		PL_colors[i] = ++t;
4125 	    }
4126 	    else
4127 		PL_colors[i] = t = (char *)"";
4128 	}
4129     } else {
4130 	int i = 0;
4131 	while (i < 6)
4132 	    PL_colors[i++] = (char *)"";
4133     }
4134     PL_colorset = 1;
4135 }
4136 #endif
4137 
4138 
4139 #ifdef TRIE_STUDY_OPT
4140 #define CHECK_RESTUDY_GOTO                                  \
4141         if (                                                \
4142               (data.flags & SCF_TRIE_RESTUDY)               \
4143               && ! restudied++                              \
4144         )     goto reStudy
4145 #else
4146 #define CHECK_RESTUDY_GOTO
4147 #endif
4148 
4149 /*
4150  - pregcomp - compile a regular expression into internal code
4151  *
4152  * We can't allocate space until we know how big the compiled form will be,
4153  * but we can't compile it (and thus know how big it is) until we've got a
4154  * place to put the code.  So we cheat:  we compile it twice, once with code
4155  * generation turned off and size counting turned on, and once "for real".
4156  * This also means that we don't allocate space until we are sure that the
4157  * thing really will compile successfully, and we never have to move the
4158  * code and thus invalidate pointers into it.  (Note that it has to be in
4159  * one piece because free() must be able to free it all.) [NB: not true in perl]
4160  *
4161  * Beware that the optimization-preparation code in here knows about some
4162  * of the structure of the compiled regexp.  [I'll say.]
4163  */
4164 
4165 
4166 
4167 #ifndef PERL_IN_XSUB_RE
4168 #define RE_ENGINE_PTR &PL_core_reg_engine
4169 #else
4170 extern const struct regexp_engine my_reg_engine;
4171 #define RE_ENGINE_PTR &my_reg_engine
4172 #endif
4173 
4174 #ifndef PERL_IN_XSUB_RE
4175 REGEXP *
4176 Perl_pregcomp(pTHX_ const SV * const pattern, const U32 flags)
4177 {
4178     dVAR;
4179     HV * const table = GvHV(PL_hintgv);
4180 
4181     PERL_ARGS_ASSERT_PREGCOMP;
4182 
4183     /* Dispatch a request to compile a regexp to correct
4184        regexp engine. */
4185     if (table) {
4186         SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4187         GET_RE_DEBUG_FLAGS_DECL;
4188         if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4189             const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4190             DEBUG_COMPILE_r({
4191                 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4192                     SvIV(*ptr));
4193             });
4194             return CALLREGCOMP_ENG(eng, pattern, flags);
4195         }
4196     }
4197     return Perl_re_compile(aTHX_ pattern, flags);
4198 }
4199 #endif
4200 
4201 REGEXP *
4202 Perl_re_compile(pTHX_ const SV * const pattern, const U32 pm_flags)
4203 {
4204     dVAR;
4205     register REGEXP *r;
4206     register regexp_internal *ri;
4207     STRLEN plen;
4208     char*  exp = SvPV((SV*)pattern, plen);
4209     char* xend = exp + plen;
4210     regnode *scan;
4211     I32 flags;
4212     I32 minlen = 0;
4213     I32 sawplus = 0;
4214     I32 sawopen = 0;
4215     scan_data_t data;
4216     RExC_state_t RExC_state;
4217     RExC_state_t * const pRExC_state = &RExC_state;
4218 #ifdef TRIE_STUDY_OPT
4219     int restudied= 0;
4220     RExC_state_t copyRExC_state;
4221 #endif
4222     GET_RE_DEBUG_FLAGS_DECL;
4223 
4224     PERL_ARGS_ASSERT_RE_COMPILE;
4225 
4226     DEBUG_r(if (!PL_colorset) reginitcolors());
4227 
4228     RExC_utf8 = RExC_orig_utf8 = pm_flags & RXf_UTF8;
4229 
4230     DEBUG_COMPILE_r({
4231         SV *dsv= sv_newmortal();
4232         RE_PV_QUOTED_DECL(s, RExC_utf8,
4233             dsv, exp, plen, 60);
4234         PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4235 		       PL_colors[4],PL_colors[5],s);
4236     });
4237 
4238 redo_first_pass:
4239     RExC_precomp = exp;
4240     RExC_flags = pm_flags;
4241     RExC_sawback = 0;
4242 
4243     RExC_seen = 0;
4244     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4245     RExC_seen_evals = 0;
4246     RExC_extralen = 0;
4247 
4248     /* First pass: determine size, legality. */
4249     RExC_parse = exp;
4250     RExC_start = exp;
4251     RExC_end = xend;
4252     RExC_naughty = 0;
4253     RExC_npar = 1;
4254     RExC_nestroot = 0;
4255     RExC_size = 0L;
4256     RExC_emit = &PL_regdummy;
4257     RExC_whilem_seen = 0;
4258     RExC_charnames = NULL;
4259     RExC_open_parens = NULL;
4260     RExC_close_parens = NULL;
4261     RExC_opend = NULL;
4262     RExC_paren_names = NULL;
4263 #ifdef DEBUGGING
4264     RExC_paren_name_list = NULL;
4265 #endif
4266     RExC_recurse = NULL;
4267     RExC_recurse_count = 0;
4268 
4269 #if 0 /* REGC() is (currently) a NOP at the first pass.
4270        * Clever compilers notice this and complain. --jhi */
4271     REGC((U8)REG_MAGIC, (char*)RExC_emit);
4272 #endif
4273     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4274     if (reg(pRExC_state, 0, &flags,1) == NULL) {
4275 	RExC_precomp = NULL;
4276 	return(NULL);
4277     }
4278     if (RExC_utf8 && !RExC_orig_utf8) {
4279         /* It's possible to write a regexp in ascii that represents Unicode
4280         codepoints outside of the byte range, such as via \x{100}. If we
4281         detect such a sequence we have to convert the entire pattern to utf8
4282         and then recompile, as our sizing calculation will have been based
4283         on 1 byte == 1 character, but we will need to use utf8 to encode
4284         at least some part of the pattern, and therefore must convert the whole
4285         thing.
4286         XXX: somehow figure out how to make this less expensive...
4287         -- dmq */
4288         STRLEN len = plen;
4289         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4290 	    "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4291         exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len);
4292         xend = exp + len;
4293         RExC_orig_utf8 = RExC_utf8;
4294         SAVEFREEPV(exp);
4295         goto redo_first_pass;
4296     }
4297     DEBUG_PARSE_r({
4298         PerlIO_printf(Perl_debug_log,
4299             "Required size %"IVdf" nodes\n"
4300             "Starting second pass (creation)\n",
4301             (IV)RExC_size);
4302         RExC_lastnum=0;
4303         RExC_lastparse=NULL;
4304     });
4305     /* Small enough for pointer-storage convention?
4306        If extralen==0, this means that we will not need long jumps. */
4307     if (RExC_size >= 0x10000L && RExC_extralen)
4308         RExC_size += RExC_extralen;
4309     else
4310 	RExC_extralen = 0;
4311     if (RExC_whilem_seen > 15)
4312 	RExC_whilem_seen = 15;
4313 
4314     /* Allocate space and zero-initialize. Note, the two step process
4315        of zeroing when in debug mode, thus anything assigned has to
4316        happen after that */
4317     Newxz(r, 1, regexp);
4318     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4319 	 char, regexp_internal);
4320     if ( r == NULL || ri == NULL )
4321 	FAIL("Regexp out of space");
4322 #ifdef DEBUGGING
4323     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4324     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4325 #else
4326     /* bulk initialize base fields with 0. */
4327     Zero(ri, sizeof(regexp_internal), char);
4328 #endif
4329 
4330     /* non-zero initialization begins here */
4331     RXi_SET( r, ri );
4332     r->engine= RE_ENGINE_PTR;
4333     r->refcnt = 1;
4334     RX_PRELEN(r) = plen;
4335     r->extflags = pm_flags;
4336     {
4337         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4338 	bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD);
4339 	bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4340 	U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4341 			    >> RXf_PMf_STD_PMMOD_SHIFT);
4342 	const char *fptr = STD_PAT_MODS;        /*"msix"*/
4343 	char *p;
4344         RX_WRAPLEN(r) = plen + has_minus + has_p + has_runon
4345             + (sizeof(STD_PAT_MODS) - 1)
4346             + (sizeof("(?:)") - 1);
4347 
4348         Newx(RX_WRAPPED(r), RX_WRAPLEN(r) + 1, char );
4349         p = RX_WRAPPED(r);
4350         *p++='('; *p++='?';
4351         if (has_p)
4352             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
4353         {
4354             char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1;
4355             char *colon = r + 1;
4356             char ch;
4357 
4358             while((ch = *fptr++)) {
4359                 if(reganch & 1)
4360                     *p++ = ch;
4361                 else
4362                     *r-- = ch;
4363                 reganch >>= 1;
4364             }
4365             if(has_minus) {
4366                 *r = '-';
4367                 p = colon;
4368             }
4369         }
4370 
4371         *p++ = ':';
4372         Copy(RExC_precomp, p, plen, char);
4373         RX_PRECOMP(r) = p;
4374         p += plen;
4375         if (has_runon)
4376             *p++ = '\n';
4377         *p++ = ')';
4378         *p = 0;
4379     }
4380 
4381     r->intflags = 0;
4382     r->nparens = RExC_npar - 1;	/* set early to validate backrefs */
4383 
4384     if (RExC_seen & REG_SEEN_RECURSE) {
4385         Newxz(RExC_open_parens, RExC_npar,regnode *);
4386         SAVEFREEPV(RExC_open_parens);
4387         Newxz(RExC_close_parens,RExC_npar,regnode *);
4388         SAVEFREEPV(RExC_close_parens);
4389     }
4390 
4391     /* Useful during FAIL. */
4392 #ifdef RE_TRACK_PATTERN_OFFSETS
4393     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4394     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4395                           "%s %"UVuf" bytes for offset annotations.\n",
4396                           ri->u.offsets ? "Got" : "Couldn't get",
4397                           (UV)((2*RExC_size+1) * sizeof(U32))));
4398 #endif
4399     SetProgLen(ri,RExC_size);
4400     RExC_rx = r;
4401     RExC_rxi = ri;
4402 
4403     /* Second pass: emit code. */
4404     RExC_flags = pm_flags;	/* don't let top level (?i) bleed */
4405     RExC_parse = exp;
4406     RExC_end = xend;
4407     RExC_naughty = 0;
4408     RExC_npar = 1;
4409     RExC_emit_start = ri->program;
4410     RExC_emit = ri->program;
4411     RExC_emit_bound = ri->program + RExC_size + 1;
4412 
4413     /* Store the count of eval-groups for security checks: */
4414     RExC_rx->seen_evals = RExC_seen_evals;
4415     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4416     if (reg(pRExC_state, 0, &flags,1) == NULL) {
4417 	ReREFCNT_dec(r);
4418 	return(NULL);
4419     }
4420     /* XXXX To minimize changes to RE engine we always allocate
4421        3-units-long substrs field. */
4422     Newx(r->substrs, 1, struct reg_substr_data);
4423     if (RExC_recurse_count) {
4424         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4425         SAVEFREEPV(RExC_recurse);
4426     }
4427 
4428 reStudy:
4429     r->minlen = minlen = sawplus = sawopen = 0;
4430     Zero(r->substrs, 1, struct reg_substr_data);
4431 
4432 #ifdef TRIE_STUDY_OPT
4433     if (!restudied) {
4434         StructCopy(&zero_scan_data, &data, scan_data_t);
4435         copyRExC_state = RExC_state;
4436     } else {
4437         U32 seen=RExC_seen;
4438         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4439 
4440         RExC_state = copyRExC_state;
4441         if (seen & REG_TOP_LEVEL_BRANCHES)
4442             RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4443         else
4444             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4445         if (data.last_found) {
4446             SvREFCNT_dec(data.longest_fixed);
4447 	    SvREFCNT_dec(data.longest_float);
4448 	    SvREFCNT_dec(data.last_found);
4449 	}
4450 	StructCopy(&zero_scan_data, &data, scan_data_t);
4451     }
4452 #else
4453     StructCopy(&zero_scan_data, &data, scan_data_t);
4454 #endif
4455 
4456     /* Dig out information for optimizations. */
4457     r->extflags = RExC_flags; /* was pm_op */
4458     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4459 
4460     if (UTF)
4461         r->extflags |= RXf_UTF8;	/* Unicode in it? */
4462     ri->regstclass = NULL;
4463     if (RExC_naughty >= 10)	/* Probably an expensive pattern. */
4464 	r->intflags |= PREGf_NAUGHTY;
4465     scan = ri->program + 1;		/* First BRANCH. */
4466 
4467     /* testing for BRANCH here tells us whether there is "must appear"
4468        data in the pattern. If there is then we can use it for optimisations */
4469     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
4470 	I32 fake;
4471 	STRLEN longest_float_length, longest_fixed_length;
4472 	struct regnode_charclass_class ch_class; /* pointed to by data */
4473 	int stclass_flag;
4474 	I32 last_close = 0; /* pointed to by data */
4475         regnode *first= scan;
4476         regnode *first_next= regnext(first);
4477 
4478 	/*
4479 	 * Skip introductions and multiplicators >= 1
4480 	 * so that we can extract the 'meat' of the pattern that must
4481 	 * match in the large if() sequence following.
4482 	 * NOTE that EXACT is NOT covered here, as it is normally
4483 	 * picked up by the optimiser separately.
4484 	 *
4485 	 * This is unfortunate as the optimiser isnt handling lookahead
4486 	 * properly currently.
4487 	 *
4488 	 */
4489 	while ((OP(first) == OPEN && (sawopen = 1)) ||
4490 	       /* An OR of *one* alternative - should not happen now. */
4491 	    (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
4492 	    /* for now we can't handle lookbehind IFMATCH*/
4493 	    (OP(first) == IFMATCH && !first->flags) ||
4494 	    (OP(first) == PLUS) ||
4495 	    (OP(first) == MINMOD) ||
4496 	       /* An {n,m} with n>0 */
4497 	    (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4498 	    (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
4499 	{
4500 		/*
4501 		 * the only op that could be a regnode is PLUS, all the rest
4502 		 * will be regnode_1 or regnode_2.
4503 		 *
4504 		 */
4505 		if (OP(first) == PLUS)
4506 		    sawplus = 1;
4507 		else
4508 		    first += regarglen[OP(first)];
4509 
4510 		first = NEXTOPER(first);
4511 		first_next= regnext(first);
4512 	}
4513 
4514 	/* Starting-point info. */
4515       again:
4516         DEBUG_PEEP("first:",first,0);
4517         /* Ignore EXACT as we deal with it later. */
4518 	if (PL_regkind[OP(first)] == EXACT) {
4519 	    if (OP(first) == EXACT)
4520 		NOOP;	/* Empty, get anchored substr later. */
4521 	    else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
4522 		ri->regstclass = first;
4523 	}
4524 #ifdef TRIE_STCLASS
4525 	else if (PL_regkind[OP(first)] == TRIE &&
4526 	        ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4527 	{
4528 	    regnode *trie_op;
4529 	    /* this can happen only on restudy */
4530 	    if ( OP(first) == TRIE ) {
4531                 struct regnode_1 *trieop = (struct regnode_1 *)
4532 		    PerlMemShared_calloc(1, sizeof(struct regnode_1));
4533                 StructCopy(first,trieop,struct regnode_1);
4534                 trie_op=(regnode *)trieop;
4535             } else {
4536                 struct regnode_charclass *trieop = (struct regnode_charclass *)
4537 		    PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4538                 StructCopy(first,trieop,struct regnode_charclass);
4539                 trie_op=(regnode *)trieop;
4540             }
4541             OP(trie_op)+=2;
4542             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4543 	    ri->regstclass = trie_op;
4544 	}
4545 #endif
4546 	else if (strchr((const char*)PL_simple,OP(first)))
4547 	    ri->regstclass = first;
4548 	else if (PL_regkind[OP(first)] == BOUND ||
4549 		 PL_regkind[OP(first)] == NBOUND)
4550 	    ri->regstclass = first;
4551 	else if (PL_regkind[OP(first)] == BOL) {
4552 	    r->extflags |= (OP(first) == MBOL
4553 			   ? RXf_ANCH_MBOL
4554 			   : (OP(first) == SBOL
4555 			      ? RXf_ANCH_SBOL
4556 			      : RXf_ANCH_BOL));
4557 	    first = NEXTOPER(first);
4558 	    goto again;
4559 	}
4560 	else if (OP(first) == GPOS) {
4561 	    r->extflags |= RXf_ANCH_GPOS;
4562 	    first = NEXTOPER(first);
4563 	    goto again;
4564 	}
4565 	else if ((!sawopen || !RExC_sawback) &&
4566 	    (OP(first) == STAR &&
4567 	    PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4568 	    !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4569 	{
4570 	    /* turn .* into ^.* with an implied $*=1 */
4571 	    const int type =
4572 		(OP(NEXTOPER(first)) == REG_ANY)
4573 		    ? RXf_ANCH_MBOL
4574 		    : RXf_ANCH_SBOL;
4575 	    r->extflags |= type;
4576 	    r->intflags |= PREGf_IMPLICIT;
4577 	    first = NEXTOPER(first);
4578 	    goto again;
4579 	}
4580 	if (sawplus && (!sawopen || !RExC_sawback)
4581 	    && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4582 	    /* x+ must match at the 1st pos of run of x's */
4583 	    r->intflags |= PREGf_SKIP;
4584 
4585 	/* Scan is after the zeroth branch, first is atomic matcher. */
4586 #ifdef TRIE_STUDY_OPT
4587 	DEBUG_PARSE_r(
4588 	    if (!restudied)
4589 	        PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4590 			      (IV)(first - scan + 1))
4591         );
4592 #else
4593 	DEBUG_PARSE_r(
4594 	    PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4595 	        (IV)(first - scan + 1))
4596         );
4597 #endif
4598 
4599 
4600 	/*
4601 	* If there's something expensive in the r.e., find the
4602 	* longest literal string that must appear and make it the
4603 	* regmust.  Resolve ties in favor of later strings, since
4604 	* the regstart check works with the beginning of the r.e.
4605 	* and avoiding duplication strengthens checking.  Not a
4606 	* strong reason, but sufficient in the absence of others.
4607 	* [Now we resolve ties in favor of the earlier string if
4608 	* it happens that c_offset_min has been invalidated, since the
4609 	* earlier string may buy us something the later one won't.]
4610 	*/
4611 
4612 	data.longest_fixed = newSVpvs("");
4613 	data.longest_float = newSVpvs("");
4614 	data.last_found = newSVpvs("");
4615 	data.longest = &(data.longest_fixed);
4616 	first = scan;
4617 	if (!ri->regstclass) {
4618 	    cl_init(pRExC_state, &ch_class);
4619 	    data.start_class = &ch_class;
4620 	    stclass_flag = SCF_DO_STCLASS_AND;
4621 	} else				/* XXXX Check for BOUND? */
4622 	    stclass_flag = 0;
4623 	data.last_closep = &last_close;
4624 
4625 	minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
4626             &data, -1, NULL, NULL,
4627             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
4628 
4629 
4630         CHECK_RESTUDY_GOTO;
4631 
4632 
4633 	if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
4634 	     && data.last_start_min == 0 && data.last_end > 0
4635 	     && !RExC_seen_zerolen
4636 	     && !(RExC_seen & REG_SEEN_VERBARG)
4637 	     && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4638 	    r->extflags |= RXf_CHECK_ALL;
4639 	scan_commit(pRExC_state, &data,&minlen,0);
4640 	SvREFCNT_dec(data.last_found);
4641 
4642         /* Note that code very similar to this but for anchored string
4643            follows immediately below, changes may need to be made to both.
4644            Be careful.
4645          */
4646 	longest_float_length = CHR_SVLEN(data.longest_float);
4647 	if (longest_float_length
4648 	    || (data.flags & SF_FL_BEFORE_EOL
4649 		&& (!(data.flags & SF_FL_BEFORE_MEOL)
4650 		    || (RExC_flags & RXf_PMf_MULTILINE))))
4651         {
4652             I32 t,ml;
4653 
4654 	    if (SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
4655 		&& data.offset_fixed == data.offset_float_min
4656 		&& SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4657 		    goto remove_float;		/* As in (a)+. */
4658 
4659             /* copy the information about the longest float from the reg_scan_data
4660                over to the program. */
4661 	    if (SvUTF8(data.longest_float)) {
4662 		r->float_utf8 = data.longest_float;
4663 		r->float_substr = NULL;
4664 	    } else {
4665 		r->float_substr = data.longest_float;
4666 		r->float_utf8 = NULL;
4667 	    }
4668 	    /* float_end_shift is how many chars that must be matched that
4669 	       follow this item. We calculate it ahead of time as once the
4670 	       lookbehind offset is added in we lose the ability to correctly
4671 	       calculate it.*/
4672 	    ml = data.minlen_float ? *(data.minlen_float)
4673 	                           : (I32)longest_float_length;
4674 	    r->float_end_shift = ml - data.offset_float_min
4675 	        - longest_float_length + (SvTAIL(data.longest_float) != 0)
4676 	        + data.lookbehind_float;
4677 	    r->float_min_offset = data.offset_float_min - data.lookbehind_float;
4678 	    r->float_max_offset = data.offset_float_max;
4679 	    if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
4680 	        r->float_max_offset -= data.lookbehind_float;
4681 
4682 	    t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4683 		       && (!(data.flags & SF_FL_BEFORE_MEOL)
4684 			   || (RExC_flags & RXf_PMf_MULTILINE)));
4685 	    fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
4686 	}
4687 	else {
4688 	  remove_float:
4689 	    r->float_substr = r->float_utf8 = NULL;
4690 	    SvREFCNT_dec(data.longest_float);
4691 	    longest_float_length = 0;
4692 	}
4693 
4694         /* Note that code very similar to this but for floating string
4695            is immediately above, changes may need to be made to both.
4696            Be careful.
4697          */
4698 	longest_fixed_length = CHR_SVLEN(data.longest_fixed);
4699 	if (longest_fixed_length
4700 	    || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4701 		&& (!(data.flags & SF_FIX_BEFORE_MEOL)
4702 		    || (RExC_flags & RXf_PMf_MULTILINE))))
4703         {
4704             I32 t,ml;
4705 
4706             /* copy the information about the longest fixed
4707                from the reg_scan_data over to the program. */
4708 	    if (SvUTF8(data.longest_fixed)) {
4709 		r->anchored_utf8 = data.longest_fixed;
4710 		r->anchored_substr = NULL;
4711 	    } else {
4712 		r->anchored_substr = data.longest_fixed;
4713 		r->anchored_utf8 = NULL;
4714 	    }
4715 	    /* fixed_end_shift is how many chars that must be matched that
4716 	       follow this item. We calculate it ahead of time as once the
4717 	       lookbehind offset is added in we lose the ability to correctly
4718 	       calculate it.*/
4719             ml = data.minlen_fixed ? *(data.minlen_fixed)
4720                                    : (I32)longest_fixed_length;
4721             r->anchored_end_shift = ml - data.offset_fixed
4722 	        - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4723 	        + data.lookbehind_fixed;
4724 	    r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4725 
4726 	    t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4727 		 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4728 		     || (RExC_flags & RXf_PMf_MULTILINE)));
4729 	    fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
4730 	}
4731 	else {
4732 	    r->anchored_substr = r->anchored_utf8 = NULL;
4733 	    SvREFCNT_dec(data.longest_fixed);
4734 	    longest_fixed_length = 0;
4735 	}
4736 	if (ri->regstclass
4737 	    && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4738 	    ri->regstclass = NULL;
4739 	if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4740 	    && stclass_flag
4741 	    && !(data.start_class->flags & ANYOF_EOS)
4742 	    && !cl_is_anything(data.start_class))
4743 	{
4744 	    const U32 n = add_data(pRExC_state, 1, "f");
4745 
4746 	    Newx(RExC_rxi->data->data[n], 1,
4747 		struct regnode_charclass_class);
4748 	    StructCopy(data.start_class,
4749 		       (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4750 		       struct regnode_charclass_class);
4751 	    ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4752 	    r->intflags &= ~PREGf_SKIP;	/* Used in find_byclass(). */
4753 	    DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
4754 	              regprop(r, sv, (regnode*)data.start_class);
4755 		      PerlIO_printf(Perl_debug_log,
4756 				    "synthetic stclass \"%s\".\n",
4757 				    SvPVX_const(sv));});
4758 	}
4759 
4760 	/* A temporary algorithm prefers floated substr to fixed one to dig more info. */
4761 	if (longest_fixed_length > longest_float_length) {
4762 	    r->check_end_shift = r->anchored_end_shift;
4763 	    r->check_substr = r->anchored_substr;
4764 	    r->check_utf8 = r->anchored_utf8;
4765 	    r->check_offset_min = r->check_offset_max = r->anchored_offset;
4766 	    if (r->extflags & RXf_ANCH_SINGLE)
4767 		r->extflags |= RXf_NOSCAN;
4768 	}
4769 	else {
4770 	    r->check_end_shift = r->float_end_shift;
4771 	    r->check_substr = r->float_substr;
4772 	    r->check_utf8 = r->float_utf8;
4773 	    r->check_offset_min = r->float_min_offset;
4774 	    r->check_offset_max = r->float_max_offset;
4775 	}
4776 	/* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4777 	   This should be changed ASAP!  */
4778 	if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4779 	    r->extflags |= RXf_USE_INTUIT;
4780 	    if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
4781 		r->extflags |= RXf_INTUIT_TAIL;
4782 	}
4783 	/* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4784 	if ( (STRLEN)minlen < longest_float_length )
4785             minlen= longest_float_length;
4786         if ( (STRLEN)minlen < longest_fixed_length )
4787             minlen= longest_fixed_length;
4788         */
4789     }
4790     else {
4791 	/* Several toplevels. Best we can is to set minlen. */
4792 	I32 fake;
4793 	struct regnode_charclass_class ch_class;
4794 	I32 last_close = 0;
4795 
4796 	DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
4797 
4798 	scan = ri->program + 1;
4799 	cl_init(pRExC_state, &ch_class);
4800 	data.start_class = &ch_class;
4801 	data.last_closep = &last_close;
4802 
4803 
4804 	minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
4805 	    &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
4806 
4807         CHECK_RESTUDY_GOTO;
4808 
4809 	r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
4810 		= r->float_substr = r->float_utf8 = NULL;
4811 	if (!(data.start_class->flags & ANYOF_EOS)
4812 	    && !cl_is_anything(data.start_class))
4813 	{
4814 	    const U32 n = add_data(pRExC_state, 1, "f");
4815 
4816 	    Newx(RExC_rxi->data->data[n], 1,
4817 		struct regnode_charclass_class);
4818 	    StructCopy(data.start_class,
4819 		       (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4820 		       struct regnode_charclass_class);
4821 	    ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4822 	    r->intflags &= ~PREGf_SKIP;	/* Used in find_byclass(). */
4823 	    DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
4824 	              regprop(r, sv, (regnode*)data.start_class);
4825 		      PerlIO_printf(Perl_debug_log,
4826 				    "synthetic stclass \"%s\".\n",
4827 				    SvPVX_const(sv));});
4828 	}
4829     }
4830 
4831     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4832        the "real" pattern. */
4833     DEBUG_OPTIMISE_r({
4834 	PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
4835 		      (IV)minlen, (IV)r->minlen);
4836     });
4837     r->minlenret = minlen;
4838     if (r->minlen < minlen)
4839         r->minlen = minlen;
4840 
4841     if (RExC_seen & REG_SEEN_GPOS)
4842 	r->extflags |= RXf_GPOS_SEEN;
4843     if (RExC_seen & REG_SEEN_LOOKBEHIND)
4844 	r->extflags |= RXf_LOOKBEHIND_SEEN;
4845     if (RExC_seen & REG_SEEN_EVAL)
4846 	r->extflags |= RXf_EVAL_SEEN;
4847     if (RExC_seen & REG_SEEN_CANY)
4848 	r->extflags |= RXf_CANY_SEEN;
4849     if (RExC_seen & REG_SEEN_VERBARG)
4850 	r->intflags |= PREGf_VERBARG_SEEN;
4851     if (RExC_seen & REG_SEEN_CUTGROUP)
4852 	r->intflags |= PREGf_CUTGROUP_SEEN;
4853     if (RExC_paren_names)
4854         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
4855     else
4856         RXp_PAREN_NAMES(r) = NULL;
4857 
4858 #ifdef STUPID_PATTERN_CHECKS
4859     if (RX_PRELEN(r) == 0)
4860         r->extflags |= RXf_NULL;
4861     if (r->extflags & RXf_SPLIT && RX_PRELEN(r) == 1 && RX_PRECOMP(r)[0] == ' ')
4862         /* XXX: this should happen BEFORE we compile */
4863         r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
4864     else if (RX_PRELEN(r) == 3 && memEQ("\\s+", RX_PRECOMP(r), 3))
4865         r->extflags |= RXf_WHITE;
4866     else if (RX_PRELEN(r) == 1 && RXp_PRECOMP(r)[0] == '^')
4867         r->extflags |= RXf_START_ONLY;
4868 #else
4869     if (r->extflags & RXf_SPLIT && RX_PRELEN(r) == 1 && RX_PRECOMP(r)[0] == ' ')
4870             /* XXX: this should happen BEFORE we compile */
4871             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
4872     else {
4873         regnode *first = ri->program + 1;
4874         U8 fop = OP(first);
4875         U8 nop = OP(NEXTOPER(first));
4876 
4877         if (PL_regkind[fop] == NOTHING && nop == END)
4878             r->extflags |= RXf_NULL;
4879         else if (PL_regkind[fop] == BOL && nop == END)
4880             r->extflags |= RXf_START_ONLY;
4881         else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
4882             r->extflags |= RXf_WHITE;
4883     }
4884 #endif
4885 #ifdef DEBUGGING
4886     if (RExC_paren_names) {
4887         ri->name_list_idx = add_data( pRExC_state, 1, "p" );
4888         ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
4889     } else
4890 #endif
4891         ri->name_list_idx = 0;
4892 
4893     if (RExC_recurse_count) {
4894         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4895             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4896             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4897         }
4898     }
4899     Newxz(r->offs, RExC_npar, regexp_paren_pair);
4900     /* assume we don't need to swap parens around before we match */
4901 
4902     DEBUG_DUMP_r({
4903         PerlIO_printf(Perl_debug_log,"Final program:\n");
4904         regdump(r);
4905     });
4906 #ifdef RE_TRACK_PATTERN_OFFSETS
4907     DEBUG_OFFSETS_r(if (ri->u.offsets) {
4908         const U32 len = ri->u.offsets[0];
4909         U32 i;
4910         GET_RE_DEBUG_FLAGS_DECL;
4911         PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
4912         for (i = 1; i <= len; i++) {
4913             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
4914                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
4915                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
4916             }
4917         PerlIO_printf(Perl_debug_log, "\n");
4918     });
4919 #endif
4920     return(r);
4921 }
4922 
4923 #undef RE_ENGINE_PTR
4924 
4925 
4926 SV*
4927 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
4928                     const U32 flags)
4929 {
4930     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
4931 
4932     PERL_UNUSED_ARG(value);
4933 
4934     if (flags & RXapif_FETCH) {
4935         return reg_named_buff_fetch(rx, key, flags);
4936     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
4937         Perl_croak(aTHX_ "%s", PL_no_modify);
4938         return NULL;
4939     } else if (flags & RXapif_EXISTS) {
4940         return reg_named_buff_exists(rx, key, flags)
4941             ? &PL_sv_yes
4942             : &PL_sv_no;
4943     } else if (flags & RXapif_REGNAMES) {
4944         return reg_named_buff_all(rx, flags);
4945     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
4946         return reg_named_buff_scalar(rx, flags);
4947     } else {
4948         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
4949         return NULL;
4950     }
4951 }
4952 
4953 SV*
4954 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
4955                          const U32 flags)
4956 {
4957     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
4958     PERL_UNUSED_ARG(lastkey);
4959 
4960     if (flags & RXapif_FIRSTKEY)
4961         return reg_named_buff_firstkey(rx, flags);
4962     else if (flags & RXapif_NEXTKEY)
4963         return reg_named_buff_nextkey(rx, flags);
4964     else {
4965         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
4966         return NULL;
4967     }
4968 }
4969 
4970 SV*
4971 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const rx, SV * const namesv, const U32 flags)
4972 {
4973     AV *retarray = NULL;
4974     SV *ret;
4975 
4976     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
4977 
4978     if (flags & RXapif_ALL)
4979         retarray=newAV();
4980 
4981     if (rx && RXp_PAREN_NAMES(rx)) {
4982         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
4983         if (he_str) {
4984             IV i;
4985             SV* sv_dat=HeVAL(he_str);
4986             I32 *nums=(I32*)SvPVX(sv_dat);
4987             for ( i=0; i<SvIVX(sv_dat); i++ ) {
4988                 if ((I32)(rx->nparens) >= nums[i]
4989                     && rx->offs[nums[i]].start != -1
4990                     && rx->offs[nums[i]].end != -1)
4991                 {
4992                     ret = newSVpvs("");
4993                     CALLREG_NUMBUF_FETCH(rx,nums[i],ret);
4994                     if (!retarray)
4995                         return ret;
4996                 } else {
4997                     ret = newSVsv(&PL_sv_undef);
4998                 }
4999                 if (retarray)
5000                     av_push(retarray, ret);
5001             }
5002             if (retarray)
5003                 return newRV_noinc(MUTABLE_SV(retarray));
5004         }
5005     }
5006     return NULL;
5007 }
5008 
5009 bool
5010 Perl_reg_named_buff_exists(pTHX_ REGEXP * const rx, SV * const key,
5011                            const U32 flags)
5012 {
5013 
5014     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5015 
5016     if (rx && RXp_PAREN_NAMES(rx)) {
5017         if (flags & RXapif_ALL) {
5018             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5019         } else {
5020 	    SV *sv = CALLREG_NAMED_BUFF_FETCH(rx, key, flags);
5021             if (sv) {
5022 		SvREFCNT_dec(sv);
5023                 return TRUE;
5024             } else {
5025                 return FALSE;
5026             }
5027         }
5028     } else {
5029         return FALSE;
5030     }
5031 }
5032 
5033 SV*
5034 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const rx, const U32 flags)
5035 {
5036 
5037     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5038 
5039     if ( rx && RXp_PAREN_NAMES(rx) ) {
5040 	(void)hv_iterinit(RXp_PAREN_NAMES(rx));
5041 
5042 	return CALLREG_NAMED_BUFF_NEXTKEY(rx, NULL, flags & ~RXapif_FIRSTKEY);
5043     } else {
5044 	return FALSE;
5045     }
5046 }
5047 
5048 SV*
5049 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const rx, const U32 flags)
5050 {
5051     GET_RE_DEBUG_FLAGS_DECL;
5052 
5053     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5054 
5055     if (rx && RXp_PAREN_NAMES(rx)) {
5056         HV *hv = RXp_PAREN_NAMES(rx);
5057         HE *temphe;
5058         while ( (temphe = hv_iternext_flags(hv,0)) ) {
5059             IV i;
5060             IV parno = 0;
5061             SV* sv_dat = HeVAL(temphe);
5062             I32 *nums = (I32*)SvPVX(sv_dat);
5063             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5064                 if ((I32)(rx->lastparen) >= nums[i] &&
5065                     rx->offs[nums[i]].start != -1 &&
5066                     rx->offs[nums[i]].end != -1)
5067                 {
5068                     parno = nums[i];
5069                     break;
5070                 }
5071             }
5072             if (parno || flags & RXapif_ALL) {
5073 		return newSVhek(HeKEY_hek(temphe));
5074             }
5075         }
5076     }
5077     return NULL;
5078 }
5079 
5080 SV*
5081 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const rx, const U32 flags)
5082 {
5083     SV *ret;
5084     AV *av;
5085     I32 length;
5086 
5087     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5088 
5089     if (rx && RXp_PAREN_NAMES(rx)) {
5090         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5091             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5092         } else if (flags & RXapif_ONE) {
5093             ret = CALLREG_NAMED_BUFF_ALL(rx, (flags | RXapif_REGNAMES));
5094             av = MUTABLE_AV(SvRV(ret));
5095             length = av_len(av);
5096 	    SvREFCNT_dec(ret);
5097             return newSViv(length + 1);
5098         } else {
5099             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5100             return NULL;
5101         }
5102     }
5103     return &PL_sv_undef;
5104 }
5105 
5106 SV*
5107 Perl_reg_named_buff_all(pTHX_ REGEXP * const rx, const U32 flags)
5108 {
5109     AV *av = newAV();
5110 
5111     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5112 
5113     if (rx && RXp_PAREN_NAMES(rx)) {
5114         HV *hv= RXp_PAREN_NAMES(rx);
5115         HE *temphe;
5116         (void)hv_iterinit(hv);
5117         while ( (temphe = hv_iternext_flags(hv,0)) ) {
5118             IV i;
5119             IV parno = 0;
5120             SV* sv_dat = HeVAL(temphe);
5121             I32 *nums = (I32*)SvPVX(sv_dat);
5122             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5123                 if ((I32)(rx->lastparen) >= nums[i] &&
5124                     rx->offs[nums[i]].start != -1 &&
5125                     rx->offs[nums[i]].end != -1)
5126                 {
5127                     parno = nums[i];
5128                     break;
5129                 }
5130             }
5131             if (parno || flags & RXapif_ALL) {
5132                 av_push(av, newSVhek(HeKEY_hek(temphe)));
5133             }
5134         }
5135     }
5136 
5137     return newRV_noinc(MUTABLE_SV(av));
5138 }
5139 
5140 void
5141 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const rx, const I32 paren, SV * const sv)
5142 {
5143     char *s = NULL;
5144     I32 i = 0;
5145     I32 s1, t1;
5146 
5147     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5148 
5149     if (!rx->subbeg) {
5150         sv_setsv(sv,&PL_sv_undef);
5151         return;
5152     }
5153     else
5154     if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5155         /* $` */
5156 	i = rx->offs[0].start;
5157 	s = rx->subbeg;
5158     }
5159     else
5160     if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5161         /* $' */
5162 	s = rx->subbeg + rx->offs[0].end;
5163 	i = rx->sublen - rx->offs[0].end;
5164     }
5165     else
5166     if ( 0 <= paren && paren <= (I32)rx->nparens &&
5167         (s1 = rx->offs[paren].start) != -1 &&
5168         (t1 = rx->offs[paren].end) != -1)
5169     {
5170         /* $& $1 ... */
5171         i = t1 - s1;
5172         s = rx->subbeg + s1;
5173     } else {
5174         sv_setsv(sv,&PL_sv_undef);
5175         return;
5176     }
5177     assert(rx->sublen >= (s - rx->subbeg) + i );
5178     if (i >= 0) {
5179         const int oldtainted = PL_tainted;
5180         TAINT_NOT;
5181         sv_setpvn(sv, s, i);
5182         PL_tainted = oldtainted;
5183         if ( (rx->extflags & RXf_CANY_SEEN)
5184             ? (RXp_MATCH_UTF8(rx)
5185                         && (!i || is_utf8_string((U8*)s, i)))
5186             : (RXp_MATCH_UTF8(rx)) )
5187         {
5188             SvUTF8_on(sv);
5189         }
5190         else
5191             SvUTF8_off(sv);
5192         if (PL_tainting) {
5193             if (RXp_MATCH_TAINTED(rx)) {
5194                 if (SvTYPE(sv) >= SVt_PVMG) {
5195                     MAGIC* const mg = SvMAGIC(sv);
5196                     MAGIC* mgt;
5197                     PL_tainted = 1;
5198                     SvMAGIC_set(sv, mg->mg_moremagic);
5199                     SvTAINT(sv);
5200                     if ((mgt = SvMAGIC(sv))) {
5201                         mg->mg_moremagic = mgt;
5202                         SvMAGIC_set(sv, mg);
5203                     }
5204                 } else {
5205                     PL_tainted = 1;
5206                     SvTAINT(sv);
5207                 }
5208             } else
5209                 SvTAINTED_off(sv);
5210         }
5211     } else {
5212         sv_setsv(sv,&PL_sv_undef);
5213         return;
5214     }
5215 }
5216 
5217 void
5218 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5219 							 SV const * const value)
5220 {
5221     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5222 
5223     PERL_UNUSED_ARG(rx);
5224     PERL_UNUSED_ARG(paren);
5225     PERL_UNUSED_ARG(value);
5226 
5227     if (!PL_localizing)
5228         Perl_croak(aTHX_ "%s", PL_no_modify);
5229 }
5230 
5231 I32
5232 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const rx, const SV * const sv,
5233                               const I32 paren)
5234 {
5235     I32 i;
5236     I32 s1, t1;
5237 
5238     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5239 
5240     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5241 	switch (paren) {
5242       /* $` / ${^PREMATCH} */
5243       case RX_BUFF_IDX_PREMATCH:
5244         if (rx->offs[0].start != -1) {
5245 			i = rx->offs[0].start;
5246 			if (i > 0) {
5247 				s1 = 0;
5248 				t1 = i;
5249 				goto getlen;
5250 			}
5251 	    }
5252         return 0;
5253       /* $' / ${^POSTMATCH} */
5254       case RX_BUFF_IDX_POSTMATCH:
5255 	    if (rx->offs[0].end != -1) {
5256 			i = rx->sublen - rx->offs[0].end;
5257 			if (i > 0) {
5258 				s1 = rx->offs[0].end;
5259 				t1 = rx->sublen;
5260 				goto getlen;
5261 			}
5262 	    }
5263         return 0;
5264       /* $& / ${^MATCH}, $1, $2, ... */
5265       default:
5266 	    if (paren <= (I32)rx->nparens &&
5267             (s1 = rx->offs[paren].start) != -1 &&
5268             (t1 = rx->offs[paren].end) != -1)
5269 	    {
5270             i = t1 - s1;
5271             goto getlen;
5272         } else {
5273             if (ckWARN(WARN_UNINITIALIZED))
5274                 report_uninit((SV *)sv);
5275             return 0;
5276         }
5277     }
5278   getlen:
5279     if (i > 0 && RXp_MATCH_UTF8(rx)) {
5280         const char * const s = rx->subbeg + s1;
5281         const U8 *ep;
5282         STRLEN el;
5283 
5284         i = t1 - s1;
5285         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5286 			i = el;
5287     }
5288     return i;
5289 }
5290 
5291 SV*
5292 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
5293 {
5294     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
5295 	PERL_UNUSED_ARG(rx);
5296 	return newSVpvs("Regexp");
5297 }
5298 
5299 /* Scans the name of a named buffer from the pattern.
5300  * If flags is REG_RSN_RETURN_NULL returns null.
5301  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5302  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5303  * to the parsed name as looked up in the RExC_paren_names hash.
5304  * If there is an error throws a vFAIL().. type exception.
5305  */
5306 
5307 #define REG_RSN_RETURN_NULL    0
5308 #define REG_RSN_RETURN_NAME    1
5309 #define REG_RSN_RETURN_DATA    2
5310 
5311 STATIC SV*
5312 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5313 {
5314     char *name_start = RExC_parse;
5315 
5316     PERL_ARGS_ASSERT_REG_SCAN_NAME;
5317 
5318     if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5319 	 /* skip IDFIRST by using do...while */
5320 	if (UTF)
5321 	    do {
5322 		RExC_parse += UTF8SKIP(RExC_parse);
5323 	    } while (isALNUM_utf8((U8*)RExC_parse));
5324 	else
5325 	    do {
5326 		RExC_parse++;
5327 	    } while (isALNUM(*RExC_parse));
5328     }
5329 
5330     if ( flags ) {
5331         SV* sv_name
5332 	    = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5333 			     SVs_TEMP | (UTF ? SVf_UTF8 : 0));
5334         if ( flags == REG_RSN_RETURN_NAME)
5335             return sv_name;
5336         else if (flags==REG_RSN_RETURN_DATA) {
5337             HE *he_str = NULL;
5338             SV *sv_dat = NULL;
5339             if ( ! sv_name )      /* should not happen*/
5340                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5341             if (RExC_paren_names)
5342                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5343             if ( he_str )
5344                 sv_dat = HeVAL(he_str);
5345             if ( ! sv_dat )
5346                 vFAIL("Reference to nonexistent named group");
5347             return sv_dat;
5348         }
5349         else {
5350             Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5351         }
5352         /* NOT REACHED */
5353     }
5354     return NULL;
5355 }
5356 
5357 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
5358     int rem=(int)(RExC_end - RExC_parse);                       \
5359     int cut;                                                    \
5360     int num;                                                    \
5361     int iscut=0;                                                \
5362     if (rem>10) {                                               \
5363         rem=10;                                                 \
5364         iscut=1;                                                \
5365     }                                                           \
5366     cut=10-rem;                                                 \
5367     if (RExC_lastparse!=RExC_parse)                             \
5368         PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
5369             rem, RExC_parse,                                    \
5370             cut + 4,                                            \
5371             iscut ? "..." : "<"                                 \
5372         );                                                      \
5373     else                                                        \
5374         PerlIO_printf(Perl_debug_log,"%16s","");                \
5375                                                                 \
5376     if (SIZE_ONLY)                                              \
5377        num = RExC_size + 1;                                     \
5378     else                                                        \
5379        num=REG_NODE_NUM(RExC_emit);                             \
5380     if (RExC_lastnum!=num)                                      \
5381        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
5382     else                                                        \
5383        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
5384     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
5385         (int)((depth*2)), "",                                   \
5386         (funcname)                                              \
5387     );                                                          \
5388     RExC_lastnum=num;                                           \
5389     RExC_lastparse=RExC_parse;                                  \
5390 })
5391 
5392 
5393 
5394 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
5395     DEBUG_PARSE_MSG((funcname));                            \
5396     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
5397 })
5398 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
5399     DEBUG_PARSE_MSG((funcname));                            \
5400     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
5401 })
5402 /*
5403  - reg - regular expression, i.e. main body or parenthesized thing
5404  *
5405  * Caller must absorb opening parenthesis.
5406  *
5407  * Combining parenthesis handling with the base level of regular expression
5408  * is a trifle forced, but the need to tie the tails of the branches to what
5409  * follows makes it hard to avoid.
5410  */
5411 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
5412 #ifdef DEBUGGING
5413 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
5414 #else
5415 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
5416 #endif
5417 
5418 STATIC regnode *
5419 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
5420     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
5421 {
5422     dVAR;
5423     register regnode *ret;		/* Will be the head of the group. */
5424     register regnode *br;
5425     register regnode *lastbr;
5426     register regnode *ender = NULL;
5427     register I32 parno = 0;
5428     I32 flags;
5429     U32 oregflags = RExC_flags;
5430     bool have_branch = 0;
5431     bool is_open = 0;
5432     I32 freeze_paren = 0;
5433     I32 after_freeze = 0;
5434 
5435     /* for (?g), (?gc), and (?o) warnings; warning
5436        about (?c) will warn about (?g) -- japhy    */
5437 
5438 #define WASTED_O  0x01
5439 #define WASTED_G  0x02
5440 #define WASTED_C  0x04
5441 #define WASTED_GC (0x02|0x04)
5442     I32 wastedflags = 0x00;
5443 
5444     char * parse_start = RExC_parse; /* MJD */
5445     char * const oregcomp_parse = RExC_parse;
5446 
5447     GET_RE_DEBUG_FLAGS_DECL;
5448 
5449     PERL_ARGS_ASSERT_REG;
5450     DEBUG_PARSE("reg ");
5451 
5452     *flagp = 0;				/* Tentatively. */
5453 
5454 
5455     /* Make an OPEN node, if parenthesized. */
5456     if (paren) {
5457         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
5458 	    char *start_verb = RExC_parse;
5459 	    STRLEN verb_len = 0;
5460 	    char *start_arg = NULL;
5461 	    unsigned char op = 0;
5462 	    int argok = 1;
5463 	    int internal_argval = 0; /* internal_argval is only useful if !argok */
5464 	    while ( *RExC_parse && *RExC_parse != ')' ) {
5465 	        if ( *RExC_parse == ':' ) {
5466 	            start_arg = RExC_parse + 1;
5467 	            break;
5468 	        }
5469 	        RExC_parse++;
5470 	    }
5471 	    ++start_verb;
5472 	    verb_len = RExC_parse - start_verb;
5473 	    if ( start_arg ) {
5474 	        RExC_parse++;
5475 	        while ( *RExC_parse && *RExC_parse != ')' )
5476 	            RExC_parse++;
5477 	        if ( *RExC_parse != ')' )
5478 	            vFAIL("Unterminated verb pattern argument");
5479 	        if ( RExC_parse == start_arg )
5480 	            start_arg = NULL;
5481 	    } else {
5482 	        if ( *RExC_parse != ')' )
5483 	            vFAIL("Unterminated verb pattern");
5484 	    }
5485 
5486 	    switch ( *start_verb ) {
5487             case 'A':  /* (*ACCEPT) */
5488                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
5489 		    op = ACCEPT;
5490 		    internal_argval = RExC_nestroot;
5491 		}
5492 		break;
5493             case 'C':  /* (*COMMIT) */
5494                 if ( memEQs(start_verb,verb_len,"COMMIT") )
5495                     op = COMMIT;
5496                 break;
5497             case 'F':  /* (*FAIL) */
5498                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
5499 		    op = OPFAIL;
5500 		    argok = 0;
5501 		}
5502 		break;
5503             case ':':  /* (*:NAME) */
5504 	    case 'M':  /* (*MARK:NAME) */
5505 	        if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
5506                     op = MARKPOINT;
5507                     argok = -1;
5508                 }
5509                 break;
5510             case 'P':  /* (*PRUNE) */
5511                 if ( memEQs(start_verb,verb_len,"PRUNE") )
5512                     op = PRUNE;
5513                 break;
5514             case 'S':   /* (*SKIP) */
5515                 if ( memEQs(start_verb,verb_len,"SKIP") )
5516                     op = SKIP;
5517                 break;
5518             case 'T':  /* (*THEN) */
5519                 /* [19:06] <TimToady> :: is then */
5520                 if ( memEQs(start_verb,verb_len,"THEN") ) {
5521                     op = CUTGROUP;
5522                     RExC_seen |= REG_SEEN_CUTGROUP;
5523                 }
5524                 break;
5525 	    }
5526 	    if ( ! op ) {
5527 	        RExC_parse++;
5528 	        vFAIL3("Unknown verb pattern '%.*s'",
5529 	            verb_len, start_verb);
5530 	    }
5531 	    if ( argok ) {
5532                 if ( start_arg && internal_argval ) {
5533 	            vFAIL3("Verb pattern '%.*s' may not have an argument",
5534 	                verb_len, start_verb);
5535 	        } else if ( argok < 0 && !start_arg ) {
5536                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
5537 	                verb_len, start_verb);
5538 	        } else {
5539 	            ret = reganode(pRExC_state, op, internal_argval);
5540 	            if ( ! internal_argval && ! SIZE_ONLY ) {
5541                         if (start_arg) {
5542                             SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
5543                             ARG(ret) = add_data( pRExC_state, 1, "S" );
5544                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
5545                             ret->flags = 0;
5546                         } else {
5547                             ret->flags = 1;
5548                         }
5549                     }
5550 	        }
5551 	        if (!internal_argval)
5552 	            RExC_seen |= REG_SEEN_VERBARG;
5553 	    } else if ( start_arg ) {
5554 	        vFAIL3("Verb pattern '%.*s' may not have an argument",
5555 	                verb_len, start_verb);
5556 	    } else {
5557 	        ret = reg_node(pRExC_state, op);
5558 	    }
5559 	    nextchar(pRExC_state);
5560 	    return ret;
5561         } else
5562 	if (*RExC_parse == '?') { /* (?...) */
5563 	    bool is_logical = 0;
5564 	    const char * const seqstart = RExC_parse;
5565 
5566 	    RExC_parse++;
5567 	    paren = *RExC_parse++;
5568 	    ret = NULL;			/* For look-ahead/behind. */
5569 	    switch (paren) {
5570 
5571 	    case 'P':	/* (?P...) variants for those used to PCRE/Python */
5572 	        paren = *RExC_parse++;
5573 		if ( paren == '<')         /* (?P<...>) named capture */
5574 		    goto named_capture;
5575                 else if (paren == '>') {   /* (?P>name) named recursion */
5576                     goto named_recursion;
5577                 }
5578                 else if (paren == '=') {   /* (?P=...)  named backref */
5579                     /* this pretty much dupes the code for \k<NAME> in regatom(), if
5580                        you change this make sure you change that */
5581                     char* name_start = RExC_parse;
5582 		    U32 num = 0;
5583                     SV *sv_dat = reg_scan_name(pRExC_state,
5584                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5585                     if (RExC_parse == name_start || *RExC_parse != ')')
5586                         vFAIL2("Sequence %.3s... not terminated",parse_start);
5587 
5588                     if (!SIZE_ONLY) {
5589                         num = add_data( pRExC_state, 1, "S" );
5590                         RExC_rxi->data->data[num]=(void*)sv_dat;
5591                         SvREFCNT_inc_simple_void(sv_dat);
5592                     }
5593                     RExC_sawback = 1;
5594                     ret = reganode(pRExC_state,
5595                     	   (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
5596                     	   num);
5597                     *flagp |= HASWIDTH;
5598 
5599                     Set_Node_Offset(ret, parse_start+1);
5600                     Set_Node_Cur_Length(ret); /* MJD */
5601 
5602                     nextchar(pRExC_state);
5603                     return ret;
5604                 }
5605                 RExC_parse++;
5606 		vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5607 		/*NOTREACHED*/
5608             case '<':           /* (?<...) */
5609 		if (*RExC_parse == '!')
5610 		    paren = ',';
5611 		else if (*RExC_parse != '=')
5612               named_capture:
5613 		{               /* (?<...>) */
5614 		    char *name_start;
5615 		    SV *svname;
5616 		    paren= '>';
5617             case '\'':          /* (?'...') */
5618     		    name_start= RExC_parse;
5619     		    svname = reg_scan_name(pRExC_state,
5620     		        SIZE_ONLY ?  /* reverse test from the others */
5621     		        REG_RSN_RETURN_NAME :
5622     		        REG_RSN_RETURN_NULL);
5623 		    if (RExC_parse == name_start) {
5624 		        RExC_parse++;
5625 		        vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5626 		        /*NOTREACHED*/
5627                     }
5628 		    if (*RExC_parse != paren)
5629 		        vFAIL2("Sequence (?%c... not terminated",
5630 		            paren=='>' ? '<' : paren);
5631 		    if (SIZE_ONLY) {
5632 			HE *he_str;
5633 			SV *sv_dat = NULL;
5634                         if (!svname) /* shouldnt happen */
5635                             Perl_croak(aTHX_
5636                                 "panic: reg_scan_name returned NULL");
5637                         if (!RExC_paren_names) {
5638                             RExC_paren_names= newHV();
5639                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
5640 #ifdef DEBUGGING
5641                             RExC_paren_name_list= newAV();
5642                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
5643 #endif
5644                         }
5645                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
5646                         if ( he_str )
5647                             sv_dat = HeVAL(he_str);
5648                         if ( ! sv_dat ) {
5649                             /* croak baby croak */
5650                             Perl_croak(aTHX_
5651                                 "panic: paren_name hash element allocation failed");
5652                         } else if ( SvPOK(sv_dat) ) {
5653                             /* (?|...) can mean we have dupes so scan to check
5654                                its already been stored. Maybe a flag indicating
5655                                we are inside such a construct would be useful,
5656                                but the arrays are likely to be quite small, so
5657                                for now we punt -- dmq */
5658                             IV count = SvIV(sv_dat);
5659                             I32 *pv = (I32*)SvPVX(sv_dat);
5660                             IV i;
5661                             for ( i = 0 ; i < count ; i++ ) {
5662                                 if ( pv[i] == RExC_npar ) {
5663                                     count = 0;
5664                                     break;
5665                                 }
5666                             }
5667                             if ( count ) {
5668                                 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
5669                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
5670                                 pv[count] = RExC_npar;
5671                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
5672                             }
5673                         } else {
5674                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
5675                             sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5676                             SvIOK_on(sv_dat);
5677                             SvIV_set(sv_dat, 1);
5678                         }
5679 #ifdef DEBUGGING
5680                         if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5681                             SvREFCNT_dec(svname);
5682 #endif
5683 
5684                         /*sv_dump(sv_dat);*/
5685                     }
5686                     nextchar(pRExC_state);
5687 		    paren = 1;
5688 		    goto capturing_parens;
5689 		}
5690                 RExC_seen |= REG_SEEN_LOOKBEHIND;
5691 		RExC_parse++;
5692 	    case '=':           /* (?=...) */
5693 		RExC_seen_zerolen++;
5694 			break;
5695 	    case '!':           /* (?!...) */
5696 		RExC_seen_zerolen++;
5697 	        if (*RExC_parse == ')') {
5698 	            ret=reg_node(pRExC_state, OPFAIL);
5699 	            nextchar(pRExC_state);
5700 	            return ret;
5701 	        }
5702 	        break;
5703 	    case '|':           /* (?|...) */
5704 	        /* branch reset, behave like a (?:...) except that
5705 	           buffers in alternations share the same numbers */
5706 	        paren = ':';
5707 	        after_freeze = freeze_paren = RExC_npar;
5708 	        break;
5709 	    case ':':           /* (?:...) */
5710 	    case '>':           /* (?>...) */
5711 		break;
5712 	    case '$':           /* (?$...) */
5713 	    case '@':           /* (?@...) */
5714 		vFAIL2("Sequence (?%c...) not implemented", (int)paren);
5715 		break;
5716 	    case '#':           /* (?#...) */
5717 		while (*RExC_parse && *RExC_parse != ')')
5718 		    RExC_parse++;
5719 		if (*RExC_parse != ')')
5720 		    FAIL("Sequence (?#... not terminated");
5721 		nextchar(pRExC_state);
5722 		*flagp = TRYAGAIN;
5723 		return NULL;
5724 	    case '0' :           /* (?0) */
5725 	    case 'R' :           /* (?R) */
5726 		if (*RExC_parse != ')')
5727 		    FAIL("Sequence (?R) not terminated");
5728 		ret = reg_node(pRExC_state, GOSTART);
5729 		*flagp |= POSTPONED;
5730 		nextchar(pRExC_state);
5731 		return ret;
5732 		/*notreached*/
5733             { /* named and numeric backreferences */
5734                 I32 num;
5735             case '&':            /* (?&NAME) */
5736                 parse_start = RExC_parse - 1;
5737               named_recursion:
5738                 {
5739     		    SV *sv_dat = reg_scan_name(pRExC_state,
5740     		        SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5741     		     num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5742                 }
5743                 goto gen_recurse_regop;
5744                 /* NOT REACHED */
5745             case '+':
5746                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5747                     RExC_parse++;
5748                     vFAIL("Illegal pattern");
5749                 }
5750                 goto parse_recursion;
5751                 /* NOT REACHED*/
5752             case '-': /* (?-1) */
5753                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5754                     RExC_parse--; /* rewind to let it be handled later */
5755                     goto parse_flags;
5756                 }
5757                 /*FALLTHROUGH */
5758             case '1': case '2': case '3': case '4': /* (?1) */
5759 	    case '5': case '6': case '7': case '8': case '9':
5760 	        RExC_parse--;
5761               parse_recursion:
5762 		num = atoi(RExC_parse);
5763   	        parse_start = RExC_parse - 1; /* MJD */
5764 	        if (*RExC_parse == '-')
5765 	            RExC_parse++;
5766 		while (isDIGIT(*RExC_parse))
5767 			RExC_parse++;
5768 	        if (*RExC_parse!=')')
5769 	            vFAIL("Expecting close bracket");
5770 
5771               gen_recurse_regop:
5772                 if ( paren == '-' ) {
5773                     /*
5774                     Diagram of capture buffer numbering.
5775                     Top line is the normal capture buffer numbers
5776                     Botton line is the negative indexing as from
5777                     the X (the (?-2))
5778 
5779                     +   1 2    3 4 5 X          6 7
5780                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5781                     -   5 4    3 2 1 X          x x
5782 
5783                     */
5784                     num = RExC_npar + num;
5785                     if (num < 1)  {
5786                         RExC_parse++;
5787                         vFAIL("Reference to nonexistent group");
5788                     }
5789                 } else if ( paren == '+' ) {
5790                     num = RExC_npar + num - 1;
5791                 }
5792 
5793                 ret = reganode(pRExC_state, GOSUB, num);
5794                 if (!SIZE_ONLY) {
5795 		    if (num > (I32)RExC_rx->nparens) {
5796 			RExC_parse++;
5797 			vFAIL("Reference to nonexistent group");
5798 	            }
5799 	            ARG2L_SET( ret, RExC_recurse_count++);
5800                     RExC_emit++;
5801 		    DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5802 			"Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
5803 		} else {
5804 		    RExC_size++;
5805     		}
5806     		RExC_seen |= REG_SEEN_RECURSE;
5807                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
5808 		Set_Node_Offset(ret, parse_start); /* MJD */
5809 
5810                 *flagp |= POSTPONED;
5811                 nextchar(pRExC_state);
5812                 return ret;
5813             } /* named and numeric backreferences */
5814             /* NOT REACHED */
5815 
5816 	    case '?':           /* (??...) */
5817 		is_logical = 1;
5818 		if (*RExC_parse != '{') {
5819 		    RExC_parse++;
5820 		    vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5821 		    /*NOTREACHED*/
5822 		}
5823 		*flagp |= POSTPONED;
5824 		paren = *RExC_parse++;
5825 		/* FALL THROUGH */
5826 	    case '{':           /* (?{...}) */
5827 	    {
5828 		I32 count = 1;
5829 		U32 n = 0;
5830 		char c;
5831 		char *s = RExC_parse;
5832 
5833 		RExC_seen_zerolen++;
5834 		RExC_seen |= REG_SEEN_EVAL;
5835 		while (count && (c = *RExC_parse)) {
5836 		    if (c == '\\') {
5837 			if (RExC_parse[1])
5838 			    RExC_parse++;
5839 		    }
5840 		    else if (c == '{')
5841 			count++;
5842 		    else if (c == '}')
5843 			count--;
5844 		    RExC_parse++;
5845 		}
5846 		if (*RExC_parse != ')') {
5847 		    RExC_parse = s;
5848 		    vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5849 		}
5850 		if (!SIZE_ONLY) {
5851 		    PAD *pad;
5852 		    OP_4tree *sop, *rop;
5853 		    SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
5854 
5855 		    ENTER;
5856 		    Perl_save_re_context(aTHX);
5857 		    rop = sv_compile_2op(sv, &sop, "re", &pad);
5858 		    sop->op_private |= OPpREFCOUNTED;
5859 		    /* re_dup will OpREFCNT_inc */
5860 		    OpREFCNT_set(sop, 1);
5861 		    LEAVE;
5862 
5863 		    n = add_data(pRExC_state, 3, "nop");
5864 		    RExC_rxi->data->data[n] = (void*)rop;
5865 		    RExC_rxi->data->data[n+1] = (void*)sop;
5866 		    RExC_rxi->data->data[n+2] = (void*)pad;
5867 		    SvREFCNT_dec(sv);
5868 		}
5869 		else {						/* First pass */
5870 		    if (PL_reginterp_cnt < ++RExC_seen_evals
5871 			&& IN_PERL_RUNTIME)
5872 			/* No compiled RE interpolated, has runtime
5873 			   components ===> unsafe.  */
5874 			FAIL("Eval-group not allowed at runtime, use re 'eval'");
5875 		    if (PL_tainting && PL_tainted)
5876 			FAIL("Eval-group in insecure regular expression");
5877 #if PERL_VERSION > 8
5878 		    if (IN_PERL_COMPILETIME)
5879 			PL_cv_has_eval = 1;
5880 #endif
5881 		}
5882 
5883 		nextchar(pRExC_state);
5884 		if (is_logical) {
5885 		    ret = reg_node(pRExC_state, LOGICAL);
5886 		    if (!SIZE_ONLY)
5887 			ret->flags = 2;
5888                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
5889                     /* deal with the length of this later - MJD */
5890 		    return ret;
5891 		}
5892 		ret = reganode(pRExC_state, EVAL, n);
5893 		Set_Node_Length(ret, RExC_parse - parse_start + 1);
5894 		Set_Node_Offset(ret, parse_start);
5895 		return ret;
5896 	    }
5897 	    case '(':           /* (?(?{...})...) and (?(?=...)...) */
5898 	    {
5899 	        int is_define= 0;
5900 		if (RExC_parse[0] == '?') {        /* (?(?...)) */
5901 		    if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5902 			|| RExC_parse[1] == '<'
5903 			|| RExC_parse[1] == '{') { /* Lookahead or eval. */
5904 			I32 flag;
5905 
5906 			ret = reg_node(pRExC_state, LOGICAL);
5907 			if (!SIZE_ONLY)
5908 			    ret->flags = 1;
5909                         REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
5910 			goto insert_if;
5911 		    }
5912 		}
5913 		else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
5914 		         || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5915 	        {
5916 	            char ch = RExC_parse[0] == '<' ? '>' : '\'';
5917 	            char *name_start= RExC_parse++;
5918 	            U32 num = 0;
5919 	            SV *sv_dat=reg_scan_name(pRExC_state,
5920 	                SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5921 	            if (RExC_parse == name_start || *RExC_parse != ch)
5922                         vFAIL2("Sequence (?(%c... not terminated",
5923                             (ch == '>' ? '<' : ch));
5924                     RExC_parse++;
5925 	            if (!SIZE_ONLY) {
5926                         num = add_data( pRExC_state, 1, "S" );
5927                         RExC_rxi->data->data[num]=(void*)sv_dat;
5928                         SvREFCNT_inc_simple_void(sv_dat);
5929                     }
5930                     ret = reganode(pRExC_state,NGROUPP,num);
5931                     goto insert_if_check_paren;
5932 		}
5933 		else if (RExC_parse[0] == 'D' &&
5934 		         RExC_parse[1] == 'E' &&
5935 		         RExC_parse[2] == 'F' &&
5936 		         RExC_parse[3] == 'I' &&
5937 		         RExC_parse[4] == 'N' &&
5938 		         RExC_parse[5] == 'E')
5939 		{
5940 		    ret = reganode(pRExC_state,DEFINEP,0);
5941 		    RExC_parse +=6 ;
5942 		    is_define = 1;
5943 		    goto insert_if_check_paren;
5944 		}
5945 		else if (RExC_parse[0] == 'R') {
5946 		    RExC_parse++;
5947 		    parno = 0;
5948 		    if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5949 		        parno = atoi(RExC_parse++);
5950 		        while (isDIGIT(*RExC_parse))
5951 			    RExC_parse++;
5952 		    } else if (RExC_parse[0] == '&') {
5953 		        SV *sv_dat;
5954 		        RExC_parse++;
5955 		        sv_dat = reg_scan_name(pRExC_state,
5956     		            SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5957     		        parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5958 		    }
5959 		    ret = reganode(pRExC_state,INSUBP,parno);
5960 		    goto insert_if_check_paren;
5961 		}
5962 		else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5963                     /* (?(1)...) */
5964 		    char c;
5965 		    parno = atoi(RExC_parse++);
5966 
5967 		    while (isDIGIT(*RExC_parse))
5968 			RExC_parse++;
5969                     ret = reganode(pRExC_state, GROUPP, parno);
5970 
5971                  insert_if_check_paren:
5972 		    if ((c = *nextchar(pRExC_state)) != ')')
5973 			vFAIL("Switch condition not recognized");
5974 		  insert_if:
5975                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
5976                     br = regbranch(pRExC_state, &flags, 1,depth+1);
5977 		    if (br == NULL)
5978 			br = reganode(pRExC_state, LONGJMP, 0);
5979 		    else
5980                         REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
5981 		    c = *nextchar(pRExC_state);
5982 		    if (flags&HASWIDTH)
5983 			*flagp |= HASWIDTH;
5984 		    if (c == '|') {
5985 		        if (is_define)
5986 		            vFAIL("(?(DEFINE)....) does not allow branches");
5987 			lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
5988                         regbranch(pRExC_state, &flags, 1,depth+1);
5989                         REGTAIL(pRExC_state, ret, lastbr);
5990 		 	if (flags&HASWIDTH)
5991 			    *flagp |= HASWIDTH;
5992 			c = *nextchar(pRExC_state);
5993 		    }
5994 		    else
5995 			lastbr = NULL;
5996 		    if (c != ')')
5997 			vFAIL("Switch (?(condition)... contains too many branches");
5998 		    ender = reg_node(pRExC_state, TAIL);
5999                     REGTAIL(pRExC_state, br, ender);
6000 		    if (lastbr) {
6001                         REGTAIL(pRExC_state, lastbr, ender);
6002                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
6003 		    }
6004 		    else
6005                         REGTAIL(pRExC_state, ret, ender);
6006                     RExC_size++; /* XXX WHY do we need this?!!
6007                                     For large programs it seems to be required
6008                                     but I can't figure out why. -- dmq*/
6009 		    return ret;
6010 		}
6011 		else {
6012 		    vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
6013 		}
6014 	    }
6015             case 0:
6016 		RExC_parse--; /* for vFAIL to print correctly */
6017                 vFAIL("Sequence (? incomplete");
6018                 break;
6019 	    default:
6020 	        --RExC_parse;
6021 	        parse_flags:      /* (?i) */
6022 	    {
6023                 U32 posflags = 0, negflags = 0;
6024 	        U32 *flagsp = &posflags;
6025 
6026 		while (*RExC_parse) {
6027 		    /* && strchr("iogcmsx", *RExC_parse) */
6028 		    /* (?g), (?gc) and (?o) are useless here
6029 		       and must be globally applied -- japhy */
6030                     switch (*RExC_parse) {
6031 	            CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
6032                     case ONCE_PAT_MOD: /* 'o' */
6033                     case GLOBAL_PAT_MOD: /* 'g' */
6034 			if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6035 			    const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
6036 			    if (! (wastedflags & wflagbit) ) {
6037 				wastedflags |= wflagbit;
6038 				vWARN5(
6039 				    RExC_parse + 1,
6040 				    "Useless (%s%c) - %suse /%c modifier",
6041 				    flagsp == &negflags ? "?-" : "?",
6042 				    *RExC_parse,
6043 				    flagsp == &negflags ? "don't " : "",
6044 				    *RExC_parse
6045 				);
6046 			    }
6047 			}
6048 			break;
6049 
6050 		    case CONTINUE_PAT_MOD: /* 'c' */
6051 			if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6052 			    if (! (wastedflags & WASTED_C) ) {
6053 				wastedflags |= WASTED_GC;
6054 				vWARN3(
6055 				    RExC_parse + 1,
6056 				    "Useless (%sc) - %suse /gc modifier",
6057 				    flagsp == &negflags ? "?-" : "?",
6058 				    flagsp == &negflags ? "don't " : ""
6059 				);
6060 			    }
6061 			}
6062 			break;
6063 	            case KEEPCOPY_PAT_MOD: /* 'p' */
6064                         if (flagsp == &negflags) {
6065                             if (SIZE_ONLY && ckWARN(WARN_REGEXP))
6066                                 vWARN(RExC_parse + 1,"Useless use of (?-p)");
6067                         } else {
6068                             *flagsp |= RXf_PMf_KEEPCOPY;
6069                         }
6070 	                break;
6071                     case '-':
6072                         if (flagsp == &negflags) {
6073                             RExC_parse++;
6074 		            vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6075 		            /*NOTREACHED*/
6076 		        }
6077 			flagsp = &negflags;
6078 		        wastedflags = 0;  /* reset so (?g-c) warns twice */
6079 		        break;
6080                     case ':':
6081 		        paren = ':';
6082 		        /*FALLTHROUGH*/
6083                     case ')':
6084                         RExC_flags |= posflags;
6085                         RExC_flags &= ~negflags;
6086                         if (paren != ':') {
6087                             oregflags |= posflags;
6088                             oregflags &= ~negflags;
6089                         }
6090                         nextchar(pRExC_state);
6091 		        if (paren != ':') {
6092 		            *flagp = TRYAGAIN;
6093 		            return NULL;
6094 		        } else {
6095                             ret = NULL;
6096 		            goto parse_rest;
6097 		        }
6098 		        /*NOTREACHED*/
6099                     default:
6100 		        RExC_parse++;
6101 		        vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6102 		        /*NOTREACHED*/
6103                     }
6104 		    ++RExC_parse;
6105 		}
6106 	    }} /* one for the default block, one for the switch */
6107 	}
6108 	else {                  /* (...) */
6109 	  capturing_parens:
6110 	    parno = RExC_npar;
6111 	    RExC_npar++;
6112 
6113 	    ret = reganode(pRExC_state, OPEN, parno);
6114 	    if (!SIZE_ONLY ){
6115 	        if (!RExC_nestroot)
6116 	            RExC_nestroot = parno;
6117 	        if (RExC_seen & REG_SEEN_RECURSE
6118 	            && !RExC_open_parens[parno-1])
6119 	        {
6120 		    DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6121 			"Setting open paren #%"IVdf" to %d\n",
6122 			(IV)parno, REG_NODE_NUM(ret)));
6123 	            RExC_open_parens[parno-1]= ret;
6124 	        }
6125 	    }
6126             Set_Node_Length(ret, 1); /* MJD */
6127             Set_Node_Offset(ret, RExC_parse); /* MJD */
6128 	    is_open = 1;
6129 	}
6130     }
6131     else                        /* ! paren */
6132 	ret = NULL;
6133 
6134    parse_rest:
6135     /* Pick up the branches, linking them together. */
6136     parse_start = RExC_parse;   /* MJD */
6137     br = regbranch(pRExC_state, &flags, 1,depth+1);
6138 
6139     if (freeze_paren) {
6140         if (RExC_npar > after_freeze)
6141             after_freeze = RExC_npar;
6142         RExC_npar = freeze_paren;
6143     }
6144 
6145     /*     branch_len = (paren != 0); */
6146 
6147     if (br == NULL)
6148 	return(NULL);
6149     if (*RExC_parse == '|') {
6150 	if (!SIZE_ONLY && RExC_extralen) {
6151 	    reginsert(pRExC_state, BRANCHJ, br, depth+1);
6152 	}
6153 	else {                  /* MJD */
6154 	    reginsert(pRExC_state, BRANCH, br, depth+1);
6155             Set_Node_Length(br, paren != 0);
6156             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
6157         }
6158 	have_branch = 1;
6159 	if (SIZE_ONLY)
6160 	    RExC_extralen += 1;		/* For BRANCHJ-BRANCH. */
6161     }
6162     else if (paren == ':') {
6163 	*flagp |= flags&SIMPLE;
6164     }
6165     if (is_open) {				/* Starts with OPEN. */
6166         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
6167     }
6168     else if (paren != '?')		/* Not Conditional */
6169 	ret = br;
6170     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6171     lastbr = br;
6172     while (*RExC_parse == '|') {
6173 	if (!SIZE_ONLY && RExC_extralen) {
6174 	    ender = reganode(pRExC_state, LONGJMP,0);
6175             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
6176 	}
6177 	if (SIZE_ONLY)
6178 	    RExC_extralen += 2;		/* Account for LONGJMP. */
6179 	nextchar(pRExC_state);
6180 	if (freeze_paren) {
6181 	    if (RExC_npar > after_freeze)
6182 	        after_freeze = RExC_npar;
6183             RExC_npar = freeze_paren;
6184         }
6185         br = regbranch(pRExC_state, &flags, 0, depth+1);
6186 
6187 	if (br == NULL)
6188 	    return(NULL);
6189         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
6190 	lastbr = br;
6191 	*flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6192     }
6193 
6194     if (have_branch || paren != ':') {
6195 	/* Make a closing node, and hook it on the end. */
6196 	switch (paren) {
6197 	case ':':
6198 	    ender = reg_node(pRExC_state, TAIL);
6199 	    break;
6200 	case 1:
6201 	    ender = reganode(pRExC_state, CLOSE, parno);
6202 	    if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
6203 		DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6204 			"Setting close paren #%"IVdf" to %d\n",
6205 			(IV)parno, REG_NODE_NUM(ender)));
6206 	        RExC_close_parens[parno-1]= ender;
6207 	        if (RExC_nestroot == parno)
6208 	            RExC_nestroot = 0;
6209 	    }
6210             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
6211             Set_Node_Length(ender,1); /* MJD */
6212 	    break;
6213 	case '<':
6214 	case ',':
6215 	case '=':
6216 	case '!':
6217 	    *flagp &= ~HASWIDTH;
6218 	    /* FALL THROUGH */
6219 	case '>':
6220 	    ender = reg_node(pRExC_state, SUCCEED);
6221 	    break;
6222 	case 0:
6223 	    ender = reg_node(pRExC_state, END);
6224 	    if (!SIZE_ONLY) {
6225                 assert(!RExC_opend); /* there can only be one! */
6226                 RExC_opend = ender;
6227             }
6228 	    break;
6229 	}
6230         REGTAIL(pRExC_state, lastbr, ender);
6231 
6232 	if (have_branch && !SIZE_ONLY) {
6233 	    if (depth==1)
6234 	        RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6235 
6236 	    /* Hook the tails of the branches to the closing node. */
6237 	    for (br = ret; br; br = regnext(br)) {
6238 		const U8 op = PL_regkind[OP(br)];
6239 		if (op == BRANCH) {
6240                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
6241 		}
6242 		else if (op == BRANCHJ) {
6243                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
6244 		}
6245 	    }
6246 	}
6247     }
6248 
6249     {
6250         const char *p;
6251         static const char parens[] = "=!<,>";
6252 
6253 	if (paren && (p = strchr(parens, paren))) {
6254 	    U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
6255 	    int flag = (p - parens) > 1;
6256 
6257 	    if (paren == '>')
6258 		node = SUSPEND, flag = 0;
6259 	    reginsert(pRExC_state, node,ret, depth+1);
6260 	    Set_Node_Cur_Length(ret);
6261 	    Set_Node_Offset(ret, parse_start + 1);
6262 	    ret->flags = flag;
6263             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
6264 	}
6265     }
6266 
6267     /* Check for proper termination. */
6268     if (paren) {
6269 	RExC_flags = oregflags;
6270 	if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
6271 	    RExC_parse = oregcomp_parse;
6272 	    vFAIL("Unmatched (");
6273 	}
6274     }
6275     else if (!paren && RExC_parse < RExC_end) {
6276 	if (*RExC_parse == ')') {
6277 	    RExC_parse++;
6278 	    vFAIL("Unmatched )");
6279 	}
6280 	else
6281 	    FAIL("Junk on end of regexp");	/* "Can't happen". */
6282 	/* NOTREACHED */
6283     }
6284     if (after_freeze)
6285         RExC_npar = after_freeze;
6286     return(ret);
6287 }
6288 
6289 /*
6290  - regbranch - one alternative of an | operator
6291  *
6292  * Implements the concatenation operator.
6293  */
6294 STATIC regnode *
6295 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
6296 {
6297     dVAR;
6298     register regnode *ret;
6299     register regnode *chain = NULL;
6300     register regnode *latest;
6301     I32 flags = 0, c = 0;
6302     GET_RE_DEBUG_FLAGS_DECL;
6303 
6304     PERL_ARGS_ASSERT_REGBRANCH;
6305 
6306     DEBUG_PARSE("brnc");
6307 
6308     if (first)
6309 	ret = NULL;
6310     else {
6311 	if (!SIZE_ONLY && RExC_extralen)
6312 	    ret = reganode(pRExC_state, BRANCHJ,0);
6313 	else {
6314 	    ret = reg_node(pRExC_state, BRANCH);
6315             Set_Node_Length(ret, 1);
6316         }
6317     }
6318 
6319     if (!first && SIZE_ONLY)
6320 	RExC_extralen += 1;			/* BRANCHJ */
6321 
6322     *flagp = WORST;			/* Tentatively. */
6323 
6324     RExC_parse--;
6325     nextchar(pRExC_state);
6326     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
6327 	flags &= ~TRYAGAIN;
6328         latest = regpiece(pRExC_state, &flags,depth+1);
6329 	if (latest == NULL) {
6330 	    if (flags & TRYAGAIN)
6331 		continue;
6332 	    return(NULL);
6333 	}
6334 	else if (ret == NULL)
6335 	    ret = latest;
6336 	*flagp |= flags&(HASWIDTH|POSTPONED);
6337 	if (chain == NULL) 	/* First piece. */
6338 	    *flagp |= flags&SPSTART;
6339 	else {
6340 	    RExC_naughty++;
6341             REGTAIL(pRExC_state, chain, latest);
6342 	}
6343 	chain = latest;
6344 	c++;
6345     }
6346     if (chain == NULL) {	/* Loop ran zero times. */
6347 	chain = reg_node(pRExC_state, NOTHING);
6348 	if (ret == NULL)
6349 	    ret = chain;
6350     }
6351     if (c == 1) {
6352 	*flagp |= flags&SIMPLE;
6353     }
6354 
6355     return ret;
6356 }
6357 
6358 /*
6359  - regpiece - something followed by possible [*+?]
6360  *
6361  * Note that the branching code sequences used for ? and the general cases
6362  * of * and + are somewhat optimized:  they use the same NOTHING node as
6363  * both the endmarker for their branch list and the body of the last branch.
6364  * It might seem that this node could be dispensed with entirely, but the
6365  * endmarker role is not redundant.
6366  */
6367 STATIC regnode *
6368 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6369 {
6370     dVAR;
6371     register regnode *ret;
6372     register char op;
6373     register char *next;
6374     I32 flags;
6375     const char * const origparse = RExC_parse;
6376     I32 min;
6377     I32 max = REG_INFTY;
6378     char *parse_start;
6379     const char *maxpos = NULL;
6380     GET_RE_DEBUG_FLAGS_DECL;
6381 
6382     PERL_ARGS_ASSERT_REGPIECE;
6383 
6384     DEBUG_PARSE("piec");
6385 
6386     ret = regatom(pRExC_state, &flags,depth+1);
6387     if (ret == NULL) {
6388 	if (flags & TRYAGAIN)
6389 	    *flagp |= TRYAGAIN;
6390 	return(NULL);
6391     }
6392 
6393     op = *RExC_parse;
6394 
6395     if (op == '{' && regcurly(RExC_parse)) {
6396 	maxpos = NULL;
6397         parse_start = RExC_parse; /* MJD */
6398 	next = RExC_parse + 1;
6399 	while (isDIGIT(*next) || *next == ',') {
6400 	    if (*next == ',') {
6401 		if (maxpos)
6402 		    break;
6403 		else
6404 		    maxpos = next;
6405 	    }
6406 	    next++;
6407 	}
6408 	if (*next == '}') {		/* got one */
6409 	    if (!maxpos)
6410 		maxpos = next;
6411 	    RExC_parse++;
6412 	    min = atoi(RExC_parse);
6413 	    if (*maxpos == ',')
6414 		maxpos++;
6415 	    else
6416 		maxpos = RExC_parse;
6417 	    max = atoi(maxpos);
6418 	    if (!max && *maxpos != '0')
6419 		max = REG_INFTY;		/* meaning "infinity" */
6420 	    else if (max >= REG_INFTY)
6421 		vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
6422 	    RExC_parse = next;
6423 	    nextchar(pRExC_state);
6424 
6425 	do_curly:
6426 	    if ((flags&SIMPLE)) {
6427 		RExC_naughty += 2 + RExC_naughty / 2;
6428 		reginsert(pRExC_state, CURLY, ret, depth+1);
6429                 Set_Node_Offset(ret, parse_start+1); /* MJD */
6430                 Set_Node_Cur_Length(ret);
6431 	    }
6432 	    else {
6433 		regnode * const w = reg_node(pRExC_state, WHILEM);
6434 
6435 		w->flags = 0;
6436                 REGTAIL(pRExC_state, ret, w);
6437 		if (!SIZE_ONLY && RExC_extralen) {
6438 		    reginsert(pRExC_state, LONGJMP,ret, depth+1);
6439 		    reginsert(pRExC_state, NOTHING,ret, depth+1);
6440 		    NEXT_OFF(ret) = 3;	/* Go over LONGJMP. */
6441 		}
6442 		reginsert(pRExC_state, CURLYX,ret, depth+1);
6443                                 /* MJD hk */
6444                 Set_Node_Offset(ret, parse_start+1);
6445                 Set_Node_Length(ret,
6446                                 op == '{' ? (RExC_parse - parse_start) : 1);
6447 
6448 		if (!SIZE_ONLY && RExC_extralen)
6449 		    NEXT_OFF(ret) = 3;	/* Go over NOTHING to LONGJMP. */
6450                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
6451 		if (SIZE_ONLY)
6452 		    RExC_whilem_seen++, RExC_extralen += 3;
6453 		RExC_naughty += 4 + RExC_naughty;	/* compound interest */
6454 	    }
6455 	    ret->flags = 0;
6456 
6457 	    if (min > 0)
6458 		*flagp = WORST;
6459 	    if (max > 0)
6460 		*flagp |= HASWIDTH;
6461 	    if (max < min)
6462 		vFAIL("Can't do {n,m} with n > m");
6463 	    if (!SIZE_ONLY) {
6464 		ARG1_SET(ret, (U16)min);
6465 		ARG2_SET(ret, (U16)max);
6466 	    }
6467 
6468 	    goto nest_check;
6469 	}
6470     }
6471 
6472     if (!ISMULT1(op)) {
6473 	*flagp = flags;
6474 	return(ret);
6475     }
6476 
6477 #if 0				/* Now runtime fix should be reliable. */
6478 
6479     /* if this is reinstated, don't forget to put this back into perldiag:
6480 
6481 	    =item Regexp *+ operand could be empty at {#} in regex m/%s/
6482 
6483 	   (F) The part of the regexp subject to either the * or + quantifier
6484            could match an empty string. The {#} shows in the regular
6485            expression about where the problem was discovered.
6486 
6487     */
6488 
6489     if (!(flags&HASWIDTH) && op != '?')
6490       vFAIL("Regexp *+ operand could be empty");
6491 #endif
6492 
6493     parse_start = RExC_parse;
6494     nextchar(pRExC_state);
6495 
6496     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
6497 
6498     if (op == '*' && (flags&SIMPLE)) {
6499 	reginsert(pRExC_state, STAR, ret, depth+1);
6500 	ret->flags = 0;
6501 	RExC_naughty += 4;
6502     }
6503     else if (op == '*') {
6504 	min = 0;
6505 	goto do_curly;
6506     }
6507     else if (op == '+' && (flags&SIMPLE)) {
6508 	reginsert(pRExC_state, PLUS, ret, depth+1);
6509 	ret->flags = 0;
6510 	RExC_naughty += 3;
6511     }
6512     else if (op == '+') {
6513 	min = 1;
6514 	goto do_curly;
6515     }
6516     else if (op == '?') {
6517 	min = 0; max = 1;
6518 	goto do_curly;
6519     }
6520   nest_check:
6521     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
6522 	vWARN3(RExC_parse,
6523 	       "%.*s matches null string many times",
6524 	       (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
6525 	       origparse);
6526     }
6527 
6528     if (RExC_parse < RExC_end && *RExC_parse == '?') {
6529 	nextchar(pRExC_state);
6530 	reginsert(pRExC_state, MINMOD, ret, depth+1);
6531         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
6532     }
6533 #ifndef REG_ALLOW_MINMOD_SUSPEND
6534     else
6535 #endif
6536     if (RExC_parse < RExC_end && *RExC_parse == '+') {
6537         regnode *ender;
6538         nextchar(pRExC_state);
6539         ender = reg_node(pRExC_state, SUCCEED);
6540         REGTAIL(pRExC_state, ret, ender);
6541         reginsert(pRExC_state, SUSPEND, ret, depth+1);
6542         ret->flags = 0;
6543         ender = reg_node(pRExC_state, TAIL);
6544         REGTAIL(pRExC_state, ret, ender);
6545         /*ret= ender;*/
6546     }
6547 
6548     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
6549 	RExC_parse++;
6550 	vFAIL("Nested quantifiers");
6551     }
6552 
6553     return(ret);
6554 }
6555 
6556 
6557 /* reg_namedseq(pRExC_state,UVp)
6558 
6559    This is expected to be called by a parser routine that has
6560    recognized'\N' and needs to handle the rest. RExC_parse is
6561    expected to point at the first char following the N at the time
6562    of the call.
6563 
6564    If valuep is non-null then it is assumed that we are parsing inside
6565    of a charclass definition and the first codepoint in the resolved
6566    string is returned via *valuep and the routine will return NULL.
6567    In this mode if a multichar string is returned from the charnames
6568    handler a warning will be issued, and only the first char in the
6569    sequence will be examined. If the string returned is zero length
6570    then the value of *valuep is undefined and NON-NULL will
6571    be returned to indicate failure. (This will NOT be a valid pointer
6572    to a regnode.)
6573 
6574    If value is null then it is assumed that we are parsing normal text
6575    and inserts a new EXACT node into the program containing the resolved
6576    string and returns a pointer to the new node. If the string is
6577    zerolength a NOTHING node is emitted.
6578 
6579    On success RExC_parse is set to the char following the endbrace.
6580    Parsing failures will generate a fatal errorvia vFAIL(...)
6581 
6582    NOTE: We cache all results from the charnames handler locally in
6583    the RExC_charnames hash (created on first use) to prevent a charnames
6584    handler from playing silly-buggers and returning a short string and
6585    then a long string for a given pattern. Since the regexp program
6586    size is calculated during an initial parse this would result
6587    in a buffer overrun so we cache to prevent the charname result from
6588    changing during the course of the parse.
6589 
6590  */
6591 STATIC regnode *
6592 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
6593 {
6594     char * name;        /* start of the content of the name */
6595     char * endbrace;    /* endbrace following the name */
6596     SV *sv_str = NULL;
6597     SV *sv_name = NULL;
6598     STRLEN len; /* this has various purposes throughout the code */
6599     bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
6600     regnode *ret = NULL;
6601 
6602     PERL_ARGS_ASSERT_REG_NAMEDSEQ;
6603 
6604     if (*RExC_parse != '{') {
6605         vFAIL("Missing braces on \\N{}");
6606     }
6607     name = RExC_parse+1;
6608     endbrace = strchr(RExC_parse, '}');
6609     if ( ! endbrace ) {
6610         RExC_parse++;
6611         vFAIL("Missing right brace on \\N{}");
6612     }
6613     RExC_parse = endbrace + 1;
6614 
6615 
6616     /* RExC_parse points at the beginning brace,
6617        endbrace points at the last */
6618     if ( name[0]=='U' && name[1]=='+' ) {
6619         /* its a "Unicode hex" notation {U+89AB} */
6620         I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
6621             | PERL_SCAN_DISALLOW_PREFIX
6622             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6623         UV cp;
6624         len = (STRLEN)(endbrace - name - 2);
6625         cp = grok_hex(name + 2, &len, &fl, NULL);
6626         if ( len != (STRLEN)(endbrace - name - 2) ) {
6627             cp = 0xFFFD;
6628         }
6629         if ( valuep ) {
6630 	    if (cp > 0xff) RExC_utf8 = 1;
6631             *valuep = cp;
6632             return NULL;
6633         }
6634 
6635 	/* Need to convert to utf8 if either: won't fit into a byte, or the re
6636 	 * is going to be in utf8 and the representation changes under utf8. */
6637 	if (cp > 0xff || (RExC_utf8 && ! UNI_IS_INVARIANT(cp))) {
6638 	    U8 string[UTF8_MAXBYTES+1];
6639 	    U8 *tmps;
6640 	    RExC_utf8 = 1;
6641 	    tmps = uvuni_to_utf8(string, cp);
6642 	    sv_str = newSVpvn_utf8((char*)string, tmps - string, TRUE);
6643 	} else {    /* Otherwise, no need for utf8, can skip that step */
6644 	    char string;
6645 	    string = (char)cp;
6646 	    sv_str= newSVpvn(&string, 1);
6647 	}
6648     } else {
6649         /* fetch the charnames handler for this scope */
6650         HV * const table = GvHV(PL_hintgv);
6651         SV **cvp= table ?
6652             hv_fetchs(table, "charnames", FALSE) :
6653             NULL;
6654         SV *cv= cvp ? *cvp : NULL;
6655         HE *he_str;
6656         int count;
6657         /* create an SV with the name as argument */
6658         sv_name = newSVpvn(name, endbrace - name);
6659 
6660         if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
6661             vFAIL2("Constant(\\N{%s}) unknown: "
6662                   "(possibly a missing \"use charnames ...\")",
6663                   SvPVX(sv_name));
6664         }
6665         if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
6666             vFAIL2("Constant(\\N{%s}): "
6667                   "$^H{charnames} is not defined",SvPVX(sv_name));
6668         }
6669 
6670 
6671 
6672         if (!RExC_charnames) {
6673             /* make sure our cache is allocated */
6674             RExC_charnames = newHV();
6675             sv_2mortal(MUTABLE_SV(RExC_charnames));
6676         }
6677             /* see if we have looked this one up before */
6678         he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
6679         if ( he_str ) {
6680             sv_str = HeVAL(he_str);
6681             cached = 1;
6682         } else {
6683             dSP ;
6684 
6685             ENTER ;
6686             SAVETMPS ;
6687             PUSHMARK(SP) ;
6688 
6689             XPUSHs(sv_name);
6690 
6691             PUTBACK ;
6692 
6693             count= call_sv(cv, G_SCALAR);
6694 
6695             if (count == 1) { /* XXXX is this right? dmq */
6696                 sv_str = POPs;
6697                 SvREFCNT_inc_simple_void(sv_str);
6698             }
6699 
6700             SPAGAIN ;
6701             PUTBACK ;
6702             FREETMPS ;
6703             LEAVE ;
6704 
6705             if ( !sv_str || !SvOK(sv_str) ) {
6706                 vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} "
6707                       "did not return a defined value",SvPVX(sv_name));
6708             }
6709             if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
6710                 cached = 1;
6711         }
6712     }
6713     if (valuep) {
6714         char *p = SvPV(sv_str, len);
6715         if (len) {
6716             STRLEN numlen = 1;
6717             if ( SvUTF8(sv_str) ) {
6718                 *valuep = utf8_to_uvchr((U8*)p, &numlen);
6719                 if (*valuep > 0x7F)
6720                     RExC_utf8 = 1;
6721                 /* XXXX
6722                   We have to turn on utf8 for high bit chars otherwise
6723                   we get failures with
6724 
6725                    "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
6726                    "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
6727 
6728                   This is different from what \x{} would do with the same
6729                   codepoint, where the condition is > 0xFF.
6730                   - dmq
6731                 */
6732 
6733 
6734             } else {
6735                 *valuep = (UV)*p;
6736                 /* warn if we havent used the whole string? */
6737             }
6738             if (numlen<len && SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6739                 vWARN2(RExC_parse,
6740                     "Ignoring excess chars from \\N{%s} in character class",
6741                     SvPVX(sv_name)
6742                 );
6743             }
6744         } else if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6745             vWARN2(RExC_parse,
6746                     "Ignoring zero length \\N{%s} in character class",
6747                     SvPVX(sv_name)
6748                 );
6749         }
6750         if (sv_name)
6751             SvREFCNT_dec(sv_name);
6752         if (!cached)
6753             SvREFCNT_dec(sv_str);
6754         return len ? NULL : (regnode *)&len;
6755     } else if(SvCUR(sv_str)) {
6756 
6757         char *s;
6758         char *p, *pend;
6759         STRLEN charlen = 1;
6760 #ifdef DEBUGGING
6761         char * parse_start = name-3; /* needed for the offsets */
6762 #endif
6763         GET_RE_DEBUG_FLAGS_DECL;     /* needed for the offsets */
6764 
6765         ret = reg_node(pRExC_state,
6766             (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6767         s= STRING(ret);
6768 
6769         if ( RExC_utf8 && !SvUTF8(sv_str) ) {
6770             sv_utf8_upgrade(sv_str);
6771         } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
6772             RExC_utf8= 1;
6773         }
6774 
6775         p = SvPV(sv_str, len);
6776         pend = p + len;
6777         /* len is the length written, charlen is the size the char read */
6778         for ( len = 0; p < pend; p += charlen ) {
6779             if (UTF) {
6780                 UV uvc = utf8_to_uvchr((U8*)p, &charlen);
6781                 if (FOLD) {
6782                     STRLEN foldlen,numlen;
6783                     U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
6784                     uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
6785                     /* Emit all the Unicode characters. */
6786 
6787                     for (foldbuf = tmpbuf;
6788                         foldlen;
6789                         foldlen -= numlen)
6790                     {
6791                         uvc = utf8_to_uvchr(foldbuf, &numlen);
6792                         if (numlen > 0) {
6793                             const STRLEN unilen = reguni(pRExC_state, uvc, s);
6794                             s       += unilen;
6795                             len     += unilen;
6796                             /* In EBCDIC the numlen
6797                             * and unilen can differ. */
6798                             foldbuf += numlen;
6799                             if (numlen >= foldlen)
6800                                 break;
6801                         }
6802                         else
6803                             break; /* "Can't happen." */
6804                     }
6805                 } else {
6806                     const STRLEN unilen = reguni(pRExC_state, uvc, s);
6807         	    if (unilen > 0) {
6808         	       s   += unilen;
6809         	       len += unilen;
6810         	    }
6811         	}
6812 	    } else {
6813                 len++;
6814                 REGC(*p, s++);
6815             }
6816         }
6817         if (SIZE_ONLY) {
6818             RExC_size += STR_SZ(len);
6819         } else {
6820             STR_LEN(ret) = len;
6821             RExC_emit += STR_SZ(len);
6822         }
6823         Set_Node_Cur_Length(ret); /* MJD */
6824         RExC_parse--;
6825         nextchar(pRExC_state);
6826     } else {	/* zero length */
6827         ret = reg_node(pRExC_state,NOTHING);
6828     }
6829     if (!cached) {
6830         SvREFCNT_dec(sv_str);
6831     }
6832     if (sv_name) {
6833         SvREFCNT_dec(sv_name);
6834     }
6835     return ret;
6836 
6837 }
6838 
6839 
6840 /*
6841  * reg_recode
6842  *
6843  * It returns the code point in utf8 for the value in *encp.
6844  *    value: a code value in the source encoding
6845  *    encp:  a pointer to an Encode object
6846  *
6847  * If the result from Encode is not a single character,
6848  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
6849  */
6850 STATIC UV
6851 S_reg_recode(pTHX_ const char value, SV **encp)
6852 {
6853     STRLEN numlen = 1;
6854     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
6855     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
6856     const STRLEN newlen = SvCUR(sv);
6857     UV uv = UNICODE_REPLACEMENT;
6858 
6859     PERL_ARGS_ASSERT_REG_RECODE;
6860 
6861     if (newlen)
6862 	uv = SvUTF8(sv)
6863 	     ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
6864 	     : *(U8*)s;
6865 
6866     if (!newlen || numlen != newlen) {
6867 	uv = UNICODE_REPLACEMENT;
6868 	*encp = NULL;
6869     }
6870     return uv;
6871 }
6872 
6873 
6874 /*
6875  - regatom - the lowest level
6876 
6877    Try to identify anything special at the start of the pattern. If there
6878    is, then handle it as required. This may involve generating a single regop,
6879    such as for an assertion; or it may involve recursing, such as to
6880    handle a () structure.
6881 
6882    If the string doesn't start with something special then we gobble up
6883    as much literal text as we can.
6884 
6885    Once we have been able to handle whatever type of thing started the
6886    sequence, we return.
6887 
6888    Note: we have to be careful with escapes, as they can be both literal
6889    and special, and in the case of \10 and friends can either, depending
6890    on context. Specifically there are two seperate switches for handling
6891    escape sequences, with the one for handling literal escapes requiring
6892    a dummy entry for all of the special escapes that are actually handled
6893    by the other.
6894 */
6895 
6896 STATIC regnode *
6897 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6898 {
6899     dVAR;
6900     register regnode *ret = NULL;
6901     I32 flags;
6902     char *parse_start = RExC_parse;
6903     GET_RE_DEBUG_FLAGS_DECL;
6904     DEBUG_PARSE("atom");
6905     *flagp = WORST;		/* Tentatively. */
6906 
6907     PERL_ARGS_ASSERT_REGATOM;
6908 
6909 tryagain:
6910     switch ((U8)*RExC_parse) {
6911     case '^':
6912 	RExC_seen_zerolen++;
6913 	nextchar(pRExC_state);
6914 	if (RExC_flags & RXf_PMf_MULTILINE)
6915 	    ret = reg_node(pRExC_state, MBOL);
6916 	else if (RExC_flags & RXf_PMf_SINGLELINE)
6917 	    ret = reg_node(pRExC_state, SBOL);
6918 	else
6919 	    ret = reg_node(pRExC_state, BOL);
6920         Set_Node_Length(ret, 1); /* MJD */
6921 	break;
6922     case '$':
6923 	nextchar(pRExC_state);
6924 	if (*RExC_parse)
6925 	    RExC_seen_zerolen++;
6926 	if (RExC_flags & RXf_PMf_MULTILINE)
6927 	    ret = reg_node(pRExC_state, MEOL);
6928 	else if (RExC_flags & RXf_PMf_SINGLELINE)
6929 	    ret = reg_node(pRExC_state, SEOL);
6930 	else
6931 	    ret = reg_node(pRExC_state, EOL);
6932         Set_Node_Length(ret, 1); /* MJD */
6933 	break;
6934     case '.':
6935 	nextchar(pRExC_state);
6936 	if (RExC_flags & RXf_PMf_SINGLELINE)
6937 	    ret = reg_node(pRExC_state, SANY);
6938 	else
6939 	    ret = reg_node(pRExC_state, REG_ANY);
6940 	*flagp |= HASWIDTH|SIMPLE;
6941 	RExC_naughty++;
6942         Set_Node_Length(ret, 1); /* MJD */
6943 	break;
6944     case '[':
6945     {
6946 	char * const oregcomp_parse = ++RExC_parse;
6947         ret = regclass(pRExC_state,depth+1);
6948 	if (*RExC_parse != ']') {
6949 	    RExC_parse = oregcomp_parse;
6950 	    vFAIL("Unmatched [");
6951 	}
6952 	nextchar(pRExC_state);
6953 	*flagp |= HASWIDTH|SIMPLE;
6954         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
6955 	break;
6956     }
6957     case '(':
6958 	nextchar(pRExC_state);
6959         ret = reg(pRExC_state, 1, &flags,depth+1);
6960 	if (ret == NULL) {
6961 		if (flags & TRYAGAIN) {
6962 		    if (RExC_parse == RExC_end) {
6963 			 /* Make parent create an empty node if needed. */
6964 			*flagp |= TRYAGAIN;
6965 			return(NULL);
6966 		    }
6967 		    goto tryagain;
6968 		}
6969 		return(NULL);
6970 	}
6971 	*flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
6972 	break;
6973     case '|':
6974     case ')':
6975 	if (flags & TRYAGAIN) {
6976 	    *flagp |= TRYAGAIN;
6977 	    return NULL;
6978 	}
6979 	vFAIL("Internal urp");
6980 				/* Supposed to be caught earlier. */
6981 	break;
6982     case '{':
6983 	if (!regcurly(RExC_parse)) {
6984 	    RExC_parse++;
6985 	    goto defchar;
6986 	}
6987 	/* FALL THROUGH */
6988     case '?':
6989     case '+':
6990     case '*':
6991 	RExC_parse++;
6992 	vFAIL("Quantifier follows nothing");
6993 	break;
6994     case 0xDF:
6995     case 0xC3:
6996     case 0xCE:
6997         do_foldchar:
6998         if (!LOC && FOLD) {
6999             U32 len,cp;
7000 	    len=0; /* silence a spurious compiler warning */
7001             if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
7002                 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
7003                 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
7004                 ret = reganode(pRExC_state, FOLDCHAR, cp);
7005                 Set_Node_Length(ret, 1); /* MJD */
7006                 nextchar(pRExC_state); /* kill whitespace under /x */
7007                 return ret;
7008             }
7009         }
7010         goto outer_default;
7011     case '\\':
7012 	/* Special Escapes
7013 
7014 	   This switch handles escape sequences that resolve to some kind
7015 	   of special regop and not to literal text. Escape sequnces that
7016 	   resolve to literal text are handled below in the switch marked
7017 	   "Literal Escapes".
7018 
7019 	   Every entry in this switch *must* have a corresponding entry
7020 	   in the literal escape switch. However, the opposite is not
7021 	   required, as the default for this switch is to jump to the
7022 	   literal text handling code.
7023 	*/
7024 	switch ((U8)*++RExC_parse) {
7025 	case 0xDF:
7026 	case 0xC3:
7027 	case 0xCE:
7028 	           goto do_foldchar;
7029 	/* Special Escapes */
7030 	case 'A':
7031 	    RExC_seen_zerolen++;
7032 	    ret = reg_node(pRExC_state, SBOL);
7033 	    *flagp |= SIMPLE;
7034 	    goto finish_meta_pat;
7035 	case 'G':
7036 	    ret = reg_node(pRExC_state, GPOS);
7037 	    RExC_seen |= REG_SEEN_GPOS;
7038 	    *flagp |= SIMPLE;
7039 	    goto finish_meta_pat;
7040 	case 'K':
7041 	    RExC_seen_zerolen++;
7042 	    ret = reg_node(pRExC_state, KEEPS);
7043 	    *flagp |= SIMPLE;
7044 	    /* XXX:dmq : disabling in-place substitution seems to
7045 	     * be necessary here to avoid cases of memory corruption, as
7046 	     * with: C<$_="x" x 80; s/x\K/y/> -- rgs
7047 	     */
7048 	    RExC_seen |= REG_SEEN_LOOKBEHIND;
7049 	    goto finish_meta_pat;
7050 	case 'Z':
7051 	    ret = reg_node(pRExC_state, SEOL);
7052 	    *flagp |= SIMPLE;
7053 	    RExC_seen_zerolen++;		/* Do not optimize RE away */
7054 	    goto finish_meta_pat;
7055 	case 'z':
7056 	    ret = reg_node(pRExC_state, EOS);
7057 	    *flagp |= SIMPLE;
7058 	    RExC_seen_zerolen++;		/* Do not optimize RE away */
7059 	    goto finish_meta_pat;
7060 	case 'C':
7061 	    ret = reg_node(pRExC_state, CANY);
7062 	    RExC_seen |= REG_SEEN_CANY;
7063 	    *flagp |= HASWIDTH|SIMPLE;
7064 	    goto finish_meta_pat;
7065 	case 'X':
7066 	    ret = reg_node(pRExC_state, CLUMP);
7067 	    *flagp |= HASWIDTH;
7068 	    goto finish_meta_pat;
7069 	case 'w':
7070 	    ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML     : ALNUM));
7071 	    *flagp |= HASWIDTH|SIMPLE;
7072 	    goto finish_meta_pat;
7073 	case 'W':
7074 	    ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML    : NALNUM));
7075 	    *flagp |= HASWIDTH|SIMPLE;
7076 	    goto finish_meta_pat;
7077 	case 'b':
7078 	    RExC_seen_zerolen++;
7079 	    RExC_seen |= REG_SEEN_LOOKBEHIND;
7080 	    ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL     : BOUND));
7081 	    *flagp |= SIMPLE;
7082 	    goto finish_meta_pat;
7083 	case 'B':
7084 	    RExC_seen_zerolen++;
7085 	    RExC_seen |= REG_SEEN_LOOKBEHIND;
7086 	    ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL    : NBOUND));
7087 	    *flagp |= SIMPLE;
7088 	    goto finish_meta_pat;
7089 	case 's':
7090 	    ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL     : SPACE));
7091 	    *flagp |= HASWIDTH|SIMPLE;
7092 	    goto finish_meta_pat;
7093 	case 'S':
7094 	    ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL    : NSPACE));
7095 	    *flagp |= HASWIDTH|SIMPLE;
7096 	    goto finish_meta_pat;
7097 	case 'd':
7098 	    ret = reg_node(pRExC_state, DIGIT);
7099 	    *flagp |= HASWIDTH|SIMPLE;
7100 	    goto finish_meta_pat;
7101 	case 'D':
7102 	    ret = reg_node(pRExC_state, NDIGIT);
7103 	    *flagp |= HASWIDTH|SIMPLE;
7104 	    goto finish_meta_pat;
7105 	case 'R':
7106 	    ret = reg_node(pRExC_state, LNBREAK);
7107 	    *flagp |= HASWIDTH|SIMPLE;
7108 	    goto finish_meta_pat;
7109 	case 'h':
7110 	    ret = reg_node(pRExC_state, HORIZWS);
7111 	    *flagp |= HASWIDTH|SIMPLE;
7112 	    goto finish_meta_pat;
7113 	case 'H':
7114 	    ret = reg_node(pRExC_state, NHORIZWS);
7115 	    *flagp |= HASWIDTH|SIMPLE;
7116 	    goto finish_meta_pat;
7117 	case 'v':
7118 	    ret = reg_node(pRExC_state, VERTWS);
7119 	    *flagp |= HASWIDTH|SIMPLE;
7120 	    goto finish_meta_pat;
7121 	case 'V':
7122 	    ret = reg_node(pRExC_state, NVERTWS);
7123 	    *flagp |= HASWIDTH|SIMPLE;
7124          finish_meta_pat:
7125 	    nextchar(pRExC_state);
7126             Set_Node_Length(ret, 2); /* MJD */
7127 	    break;
7128 	case 'p':
7129 	case 'P':
7130 	    {
7131 		char* const oldregxend = RExC_end;
7132 #ifdef DEBUGGING
7133 		char* parse_start = RExC_parse - 2;
7134 #endif
7135 
7136 		if (RExC_parse[1] == '{') {
7137 		  /* a lovely hack--pretend we saw [\pX] instead */
7138 		    RExC_end = strchr(RExC_parse, '}');
7139 		    if (!RExC_end) {
7140 		        const U8 c = (U8)*RExC_parse;
7141 			RExC_parse += 2;
7142 			RExC_end = oldregxend;
7143 			vFAIL2("Missing right brace on \\%c{}", c);
7144 		    }
7145 		    RExC_end++;
7146 		}
7147 		else {
7148 		    RExC_end = RExC_parse + 2;
7149 		    if (RExC_end > oldregxend)
7150 			RExC_end = oldregxend;
7151 		}
7152 		RExC_parse--;
7153 
7154                 ret = regclass(pRExC_state,depth+1);
7155 
7156 		RExC_end = oldregxend;
7157 		RExC_parse--;
7158 
7159 		Set_Node_Offset(ret, parse_start + 2);
7160 		Set_Node_Cur_Length(ret);
7161 		nextchar(pRExC_state);
7162 		*flagp |= HASWIDTH|SIMPLE;
7163 	    }
7164 	    break;
7165         case 'N':
7166             /* Handle \N{NAME} here and not below because it can be
7167             multicharacter. join_exact() will join them up later on.
7168             Also this makes sure that things like /\N{BLAH}+/ and
7169             \N{BLAH} being multi char Just Happen. dmq*/
7170             ++RExC_parse;
7171             ret= reg_namedseq(pRExC_state, NULL);
7172             break;
7173 	case 'k':    /* Handle \k<NAME> and \k'NAME' */
7174 	parse_named_seq:
7175         {
7176             char ch= RExC_parse[1];
7177 	    if (ch != '<' && ch != '\'' && ch != '{') {
7178 	        RExC_parse++;
7179 	        vFAIL2("Sequence %.2s... not terminated",parse_start);
7180 	    } else {
7181 	        /* this pretty much dupes the code for (?P=...) in reg(), if
7182                    you change this make sure you change that */
7183 		char* name_start = (RExC_parse += 2);
7184 		U32 num = 0;
7185                 SV *sv_dat = reg_scan_name(pRExC_state,
7186                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7187                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
7188                 if (RExC_parse == name_start || *RExC_parse != ch)
7189                     vFAIL2("Sequence %.3s... not terminated",parse_start);
7190 
7191                 if (!SIZE_ONLY) {
7192                     num = add_data( pRExC_state, 1, "S" );
7193                     RExC_rxi->data->data[num]=(void*)sv_dat;
7194                     SvREFCNT_inc_simple_void(sv_dat);
7195                 }
7196 
7197                 RExC_sawback = 1;
7198                 ret = reganode(pRExC_state,
7199                 	   (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
7200                 	   num);
7201                 *flagp |= HASWIDTH;
7202 
7203                 /* override incorrect value set in reganode MJD */
7204                 Set_Node_Offset(ret, parse_start+1);
7205                 Set_Node_Cur_Length(ret); /* MJD */
7206                 nextchar(pRExC_state);
7207 
7208             }
7209             break;
7210 	}
7211 	case 'g':
7212 	case '1': case '2': case '3': case '4':
7213 	case '5': case '6': case '7': case '8': case '9':
7214 	    {
7215 		I32 num;
7216 		bool isg = *RExC_parse == 'g';
7217 		bool isrel = 0;
7218 		bool hasbrace = 0;
7219 		if (isg) {
7220 		    RExC_parse++;
7221 		    if (*RExC_parse == '{') {
7222 		        RExC_parse++;
7223 		        hasbrace = 1;
7224 		    }
7225 		    if (*RExC_parse == '-') {
7226 		        RExC_parse++;
7227 		        isrel = 1;
7228 		    }
7229 		    if (hasbrace && !isDIGIT(*RExC_parse)) {
7230 		        if (isrel) RExC_parse--;
7231                         RExC_parse -= 2;
7232 		        goto parse_named_seq;
7233 		}   }
7234 		num = atoi(RExC_parse);
7235 		if (isg && num == 0)
7236 		    vFAIL("Reference to invalid group 0");
7237                 if (isrel) {
7238                     num = RExC_npar - num;
7239                     if (num < 1)
7240                         vFAIL("Reference to nonexistent or unclosed group");
7241                 }
7242 		if (!isg && num > 9 && num >= RExC_npar)
7243 		    goto defchar;
7244 		else {
7245 		    char * const parse_start = RExC_parse - 1; /* MJD */
7246 		    while (isDIGIT(*RExC_parse))
7247 			RExC_parse++;
7248 	            if (parse_start == RExC_parse - 1)
7249 	                vFAIL("Unterminated \\g... pattern");
7250                     if (hasbrace) {
7251                         if (*RExC_parse != '}')
7252                             vFAIL("Unterminated \\g{...} pattern");
7253                         RExC_parse++;
7254                     }
7255 		    if (!SIZE_ONLY) {
7256 		        if (num > (I32)RExC_rx->nparens)
7257 			    vFAIL("Reference to nonexistent group");
7258 		    }
7259 		    RExC_sawback = 1;
7260 		    ret = reganode(pRExC_state,
7261 				   (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
7262 				   num);
7263 		    *flagp |= HASWIDTH;
7264 
7265                     /* override incorrect value set in reganode MJD */
7266                     Set_Node_Offset(ret, parse_start+1);
7267                     Set_Node_Cur_Length(ret); /* MJD */
7268 		    RExC_parse--;
7269 		    nextchar(pRExC_state);
7270 		}
7271 	    }
7272 	    break;
7273 	case '\0':
7274 	    if (RExC_parse >= RExC_end)
7275 		FAIL("Trailing \\");
7276 	    /* FALL THROUGH */
7277 	default:
7278 	    /* Do not generate "unrecognized" warnings here, we fall
7279 	       back into the quick-grab loop below */
7280 	    parse_start--;
7281 	    goto defchar;
7282 	}
7283 	break;
7284 
7285     case '#':
7286 	if (RExC_flags & RXf_PMf_EXTENDED) {
7287 	    if ( reg_skipcomment( pRExC_state ) )
7288 		goto tryagain;
7289 	}
7290 	/* FALL THROUGH */
7291 
7292     default:
7293         outer_default:{
7294 	    register STRLEN len;
7295 	    register UV ender;
7296 	    register char *p;
7297 	    char *s;
7298 	    STRLEN foldlen;
7299 	    U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7300 
7301             parse_start = RExC_parse - 1;
7302 
7303 	    RExC_parse++;
7304 
7305 	defchar:
7306 	    ender = 0;
7307 	    ret = reg_node(pRExC_state,
7308 			   (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
7309 	    s = STRING(ret);
7310 	    for (len = 0, p = RExC_parse - 1;
7311 	      len < 127 && p < RExC_end;
7312 	      len++)
7313 	    {
7314 		char * const oldp = p;
7315 
7316 		if (RExC_flags & RXf_PMf_EXTENDED)
7317 		    p = regwhite( pRExC_state, p );
7318 		switch ((U8)*p) {
7319 		case 0xDF:
7320 		case 0xC3:
7321 		case 0xCE:
7322 		           if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7323 		                goto normal_default;
7324 		case '^':
7325 		case '$':
7326 		case '.':
7327 		case '[':
7328 		case '(':
7329 		case ')':
7330 		case '|':
7331 		    goto loopdone;
7332 		case '\\':
7333 		    /* Literal Escapes Switch
7334 
7335 		       This switch is meant to handle escape sequences that
7336 		       resolve to a literal character.
7337 
7338 		       Every escape sequence that represents something
7339 		       else, like an assertion or a char class, is handled
7340 		       in the switch marked 'Special Escapes' above in this
7341 		       routine, but also has an entry here as anything that
7342 		       isn't explicitly mentioned here will be treated as
7343 		       an unescaped equivalent literal.
7344 		    */
7345 
7346 		    switch ((U8)*++p) {
7347 		    /* These are all the special escapes. */
7348     		    case 0xDF:
7349     		    case 0xC3:
7350     		    case 0xCE:
7351     		           if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7352     		                goto normal_default;
7353 		    case 'A':             /* Start assertion */
7354 		    case 'b': case 'B':   /* Word-boundary assertion*/
7355 		    case 'C':             /* Single char !DANGEROUS! */
7356 		    case 'd': case 'D':   /* digit class */
7357 		    case 'g': case 'G':   /* generic-backref, pos assertion */
7358 		    case 'h': case 'H':   /* HORIZWS */
7359 		    case 'k': case 'K':   /* named backref, keep marker */
7360 		    case 'N':             /* named char sequence */
7361 		    case 'p': case 'P':   /* Unicode property */
7362 		              case 'R':   /* LNBREAK */
7363 		    case 's': case 'S':   /* space class */
7364 		    case 'v': case 'V':   /* VERTWS */
7365 		    case 'w': case 'W':   /* word class */
7366 		    case 'X':             /* eXtended Unicode "combining character sequence" */
7367 		    case 'z': case 'Z':   /* End of line/string assertion */
7368 			--p;
7369 			goto loopdone;
7370 
7371 	            /* Anything after here is an escape that resolves to a
7372 	               literal. (Except digits, which may or may not)
7373 	             */
7374 		    case 'n':
7375 			ender = '\n';
7376 			p++;
7377 			break;
7378 		    case 'r':
7379 			ender = '\r';
7380 			p++;
7381 			break;
7382 		    case 't':
7383 			ender = '\t';
7384 			p++;
7385 			break;
7386 		    case 'f':
7387 			ender = '\f';
7388 			p++;
7389 			break;
7390 		    case 'e':
7391 			  ender = ASCII_TO_NATIVE('\033');
7392 			p++;
7393 			break;
7394 		    case 'a':
7395 			  ender = ASCII_TO_NATIVE('\007');
7396 			p++;
7397 			break;
7398 		    case 'x':
7399 			if (*++p == '{') {
7400 			    char* const e = strchr(p, '}');
7401 
7402 			    if (!e) {
7403 				RExC_parse = p + 1;
7404 				vFAIL("Missing right brace on \\x{}");
7405 			    }
7406 			    else {
7407                                 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7408                                     | PERL_SCAN_DISALLOW_PREFIX;
7409                                 STRLEN numlen = e - p - 1;
7410 				ender = grok_hex(p + 1, &numlen, &flags, NULL);
7411 				if (ender > 0xff)
7412 				    RExC_utf8 = 1;
7413 				p = e + 1;
7414 			    }
7415 			}
7416 			else {
7417                             I32 flags = PERL_SCAN_DISALLOW_PREFIX;
7418 			    STRLEN numlen = 2;
7419 			    ender = grok_hex(p, &numlen, &flags, NULL);
7420 			    p += numlen;
7421 			}
7422 			if (PL_encoding && ender < 0x100)
7423 			    goto recode_encoding;
7424 			break;
7425 		    case 'c':
7426 			p++;
7427 			ender = UCHARAT(p++);
7428 			ender = toCTRL(ender);
7429 			break;
7430 		    case '0': case '1': case '2': case '3':case '4':
7431 		    case '5': case '6': case '7': case '8':case '9':
7432 			if (*p == '0' ||
7433 			  (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
7434                             I32 flags = 0;
7435 			    STRLEN numlen = 3;
7436 			    ender = grok_oct(p, &numlen, &flags, NULL);
7437 			    p += numlen;
7438 			}
7439 			else {
7440 			    --p;
7441 			    goto loopdone;
7442 			}
7443 			if (PL_encoding && ender < 0x100)
7444 			    goto recode_encoding;
7445 			break;
7446 		    recode_encoding:
7447 			{
7448 			    SV* enc = PL_encoding;
7449 			    ender = reg_recode((const char)(U8)ender, &enc);
7450 			    if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
7451 				vWARN(p, "Invalid escape in the specified encoding");
7452 			    RExC_utf8 = 1;
7453 			}
7454 			break;
7455 		    case '\0':
7456 			if (p >= RExC_end)
7457 			    FAIL("Trailing \\");
7458 			/* FALL THROUGH */
7459 		    default:
7460 			if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
7461 			    vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
7462 			goto normal_default;
7463 		    }
7464 		    break;
7465 		default:
7466 		  normal_default:
7467 		    if (UTF8_IS_START(*p) && UTF) {
7468 			STRLEN numlen;
7469 			ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
7470 					       &numlen, UTF8_ALLOW_DEFAULT);
7471 			p += numlen;
7472 		    }
7473 		    else
7474 			ender = *p++;
7475 		    break;
7476 		}
7477 		if ( RExC_flags & RXf_PMf_EXTENDED)
7478 		    p = regwhite( pRExC_state, p );
7479 		if (UTF && FOLD) {
7480 		    /* Prime the casefolded buffer. */
7481 		    ender = toFOLD_uni(ender, tmpbuf, &foldlen);
7482 		}
7483 		if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
7484 		    if (len)
7485 			p = oldp;
7486 		    else if (UTF) {
7487 			 if (FOLD) {
7488 			      /* Emit all the Unicode characters. */
7489 			      STRLEN numlen;
7490 			      for (foldbuf = tmpbuf;
7491 				   foldlen;
7492 				   foldlen -= numlen) {
7493 				   ender = utf8_to_uvchr(foldbuf, &numlen);
7494 				   if (numlen > 0) {
7495 					const STRLEN unilen = reguni(pRExC_state, ender, s);
7496 					s       += unilen;
7497 					len     += unilen;
7498 					/* In EBCDIC the numlen
7499 					 * and unilen can differ. */
7500 					foldbuf += numlen;
7501 					if (numlen >= foldlen)
7502 					     break;
7503 				   }
7504 				   else
7505 					break; /* "Can't happen." */
7506 			      }
7507 			 }
7508 			 else {
7509 			      const STRLEN unilen = reguni(pRExC_state, ender, s);
7510 			      if (unilen > 0) {
7511 				   s   += unilen;
7512 				   len += unilen;
7513 			      }
7514 			 }
7515 		    }
7516 		    else {
7517 			len++;
7518 			REGC((char)ender, s++);
7519 		    }
7520 		    break;
7521 		}
7522 		if (UTF) {
7523 		     if (FOLD) {
7524 		          /* Emit all the Unicode characters. */
7525 			  STRLEN numlen;
7526 			  for (foldbuf = tmpbuf;
7527 			       foldlen;
7528 			       foldlen -= numlen) {
7529 			       ender = utf8_to_uvchr(foldbuf, &numlen);
7530 			       if (numlen > 0) {
7531 				    const STRLEN unilen = reguni(pRExC_state, ender, s);
7532 				    len     += unilen;
7533 				    s       += unilen;
7534 				    /* In EBCDIC the numlen
7535 				     * and unilen can differ. */
7536 				    foldbuf += numlen;
7537 				    if (numlen >= foldlen)
7538 					 break;
7539 			       }
7540 			       else
7541 				    break;
7542 			  }
7543 		     }
7544 		     else {
7545 			  const STRLEN unilen = reguni(pRExC_state, ender, s);
7546 			  if (unilen > 0) {
7547 			       s   += unilen;
7548 			       len += unilen;
7549 			  }
7550 		     }
7551 		     len--;
7552 		}
7553 		else
7554 		    REGC((char)ender, s++);
7555 	    }
7556 	loopdone:
7557 	    RExC_parse = p - 1;
7558             Set_Node_Cur_Length(ret); /* MJD */
7559 	    nextchar(pRExC_state);
7560 	    {
7561 		/* len is STRLEN which is unsigned, need to copy to signed */
7562 		IV iv = len;
7563 		if (iv < 0)
7564 		    vFAIL("Internal disaster");
7565 	    }
7566 	    if (len > 0)
7567 		*flagp |= HASWIDTH;
7568 	    if (len == 1 && UNI_IS_INVARIANT(ender))
7569 		*flagp |= SIMPLE;
7570 
7571 	    if (SIZE_ONLY)
7572 		RExC_size += STR_SZ(len);
7573 	    else {
7574 		STR_LEN(ret) = len;
7575 		RExC_emit += STR_SZ(len);
7576             }
7577 	}
7578 	break;
7579     }
7580 
7581     return(ret);
7582 }
7583 
7584 STATIC char *
7585 S_regwhite( RExC_state_t *pRExC_state, char *p )
7586 {
7587     const char *e = RExC_end;
7588 
7589     PERL_ARGS_ASSERT_REGWHITE;
7590 
7591     while (p < e) {
7592 	if (isSPACE(*p))
7593 	    ++p;
7594 	else if (*p == '#') {
7595             bool ended = 0;
7596 	    do {
7597 		if (*p++ == '\n') {
7598 		    ended = 1;
7599 		    break;
7600 		}
7601 	    } while (p < e);
7602 	    if (!ended)
7603 	        RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
7604 	}
7605 	else
7606 	    break;
7607     }
7608     return p;
7609 }
7610 
7611 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
7612    Character classes ([:foo:]) can also be negated ([:^foo:]).
7613    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
7614    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
7615    but trigger failures because they are currently unimplemented. */
7616 
7617 #define POSIXCC_DONE(c)   ((c) == ':')
7618 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
7619 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
7620 
7621 STATIC I32
7622 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
7623 {
7624     dVAR;
7625     I32 namedclass = OOB_NAMEDCLASS;
7626 
7627     PERL_ARGS_ASSERT_REGPPOSIXCC;
7628 
7629     if (value == '[' && RExC_parse + 1 < RExC_end &&
7630 	/* I smell either [: or [= or [. -- POSIX has been here, right? */
7631 	POSIXCC(UCHARAT(RExC_parse))) {
7632 	const char c = UCHARAT(RExC_parse);
7633 	char* const s = RExC_parse++;
7634 
7635 	while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
7636 	    RExC_parse++;
7637 	if (RExC_parse == RExC_end)
7638 	    /* Grandfather lone [:, [=, [. */
7639 	    RExC_parse = s;
7640 	else {
7641 	    const char* const t = RExC_parse++; /* skip over the c */
7642 	    assert(*t == c);
7643 
7644   	    if (UCHARAT(RExC_parse) == ']') {
7645 		const char *posixcc = s + 1;
7646   		RExC_parse++; /* skip over the ending ] */
7647 
7648 		if (*s == ':') {
7649 		    const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
7650 		    const I32 skip = t - posixcc;
7651 
7652 		    /* Initially switch on the length of the name.  */
7653 		    switch (skip) {
7654 		    case 4:
7655 			if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
7656 			    namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
7657 			break;
7658 		    case 5:
7659 			/* Names all of length 5.  */
7660 			/* alnum alpha ascii blank cntrl digit graph lower
7661 			   print punct space upper  */
7662 			/* Offset 4 gives the best switch position.  */
7663 			switch (posixcc[4]) {
7664 			case 'a':
7665 			    if (memEQ(posixcc, "alph", 4)) /* alpha */
7666 				namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
7667 			    break;
7668 			case 'e':
7669 			    if (memEQ(posixcc, "spac", 4)) /* space */
7670 				namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
7671 			    break;
7672 			case 'h':
7673 			    if (memEQ(posixcc, "grap", 4)) /* graph */
7674 				namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
7675 			    break;
7676 			case 'i':
7677 			    if (memEQ(posixcc, "asci", 4)) /* ascii */
7678 				namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
7679 			    break;
7680 			case 'k':
7681 			    if (memEQ(posixcc, "blan", 4)) /* blank */
7682 				namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
7683 			    break;
7684 			case 'l':
7685 			    if (memEQ(posixcc, "cntr", 4)) /* cntrl */
7686 				namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
7687 			    break;
7688 			case 'm':
7689 			    if (memEQ(posixcc, "alnu", 4)) /* alnum */
7690 				namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
7691 			    break;
7692 			case 'r':
7693 			    if (memEQ(posixcc, "lowe", 4)) /* lower */
7694 				namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
7695 			    else if (memEQ(posixcc, "uppe", 4)) /* upper */
7696 				namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
7697 			    break;
7698 			case 't':
7699 			    if (memEQ(posixcc, "digi", 4)) /* digit */
7700 				namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
7701 			    else if (memEQ(posixcc, "prin", 4)) /* print */
7702 				namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
7703 			    else if (memEQ(posixcc, "punc", 4)) /* punct */
7704 				namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
7705 			    break;
7706 			}
7707 			break;
7708 		    case 6:
7709 			if (memEQ(posixcc, "xdigit", 6))
7710 			    namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
7711 			break;
7712 		    }
7713 
7714 		    if (namedclass == OOB_NAMEDCLASS)
7715 			Simple_vFAIL3("POSIX class [:%.*s:] unknown",
7716 				      t - s - 1, s + 1);
7717 		    assert (posixcc[skip] == ':');
7718 		    assert (posixcc[skip+1] == ']');
7719 		} else if (!SIZE_ONLY) {
7720 		    /* [[=foo=]] and [[.foo.]] are still future. */
7721 
7722 		    /* adjust RExC_parse so the warning shows after
7723 		       the class closes */
7724 		    while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
7725 			RExC_parse++;
7726 		    Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7727 		}
7728 	    } else {
7729 		/* Maternal grandfather:
7730 		 * "[:" ending in ":" but not in ":]" */
7731 		RExC_parse = s;
7732 	    }
7733 	}
7734     }
7735 
7736     return namedclass;
7737 }
7738 
7739 STATIC void
7740 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
7741 {
7742     dVAR;
7743 
7744     PERL_ARGS_ASSERT_CHECKPOSIXCC;
7745 
7746     if (POSIXCC(UCHARAT(RExC_parse))) {
7747 	const char *s = RExC_parse;
7748 	const char  c = *s++;
7749 
7750 	while (isALNUM(*s))
7751 	    s++;
7752 	if (*s && c == *s && s[1] == ']') {
7753 	    if (ckWARN(WARN_REGEXP))
7754 		vWARN3(s+2,
7755 			"POSIX syntax [%c %c] belongs inside character classes",
7756 			c, c);
7757 
7758 	    /* [[=foo=]] and [[.foo.]] are still future. */
7759 	    if (POSIXCC_NOTYET(c)) {
7760 		/* adjust RExC_parse so the error shows after
7761 		   the class closes */
7762 		while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
7763 		    NOOP;
7764 		Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7765 	    }
7766 	}
7767     }
7768 }
7769 
7770 
7771 #define _C_C_T_(NAME,TEST,WORD)                         \
7772 ANYOF_##NAME:                                           \
7773     if (LOC)                                            \
7774 	ANYOF_CLASS_SET(ret, ANYOF_##NAME);             \
7775     else {                                              \
7776 	for (value = 0; value < 256; value++)           \
7777 	    if (TEST)                                   \
7778 		ANYOF_BITMAP_SET(ret, value);           \
7779     }                                                   \
7780     yesno = '+';                                        \
7781     what = WORD;                                        \
7782     break;                                              \
7783 case ANYOF_N##NAME:                                     \
7784     if (LOC)                                            \
7785 	ANYOF_CLASS_SET(ret, ANYOF_N##NAME);            \
7786     else {                                              \
7787 	for (value = 0; value < 256; value++)           \
7788 	    if (!TEST)                                  \
7789 		ANYOF_BITMAP_SET(ret, value);           \
7790     }                                                   \
7791     yesno = '!';                                        \
7792     what = WORD;                                        \
7793     break
7794 
7795 #define _C_C_T_NOLOC_(NAME,TEST,WORD)                   \
7796 ANYOF_##NAME:                                           \
7797 	for (value = 0; value < 256; value++)           \
7798 	    if (TEST)                                   \
7799 		ANYOF_BITMAP_SET(ret, value);           \
7800     yesno = '+';                                        \
7801     what = WORD;                                        \
7802     break;                                              \
7803 case ANYOF_N##NAME:                                     \
7804 	for (value = 0; value < 256; value++)           \
7805 	    if (!TEST)                                  \
7806 		ANYOF_BITMAP_SET(ret, value);           \
7807     yesno = '!';                                        \
7808     what = WORD;                                        \
7809     break
7810 
7811 /*
7812    We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
7813    so that it is possible to override the option here without having to
7814    rebuild the entire core. as we are required to do if we change regcomp.h
7815    which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
7816 */
7817 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
7818 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
7819 #endif
7820 
7821 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
7822 #define POSIX_CC_UNI_NAME(CCNAME) CCNAME
7823 #else
7824 #define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
7825 #endif
7826 
7827 /*
7828    parse a class specification and produce either an ANYOF node that
7829    matches the pattern or if the pattern matches a single char only and
7830    that char is < 256 and we are case insensitive then we produce an
7831    EXACT node instead.
7832 */
7833 
7834 STATIC regnode *
7835 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
7836 {
7837     dVAR;
7838     register UV nextvalue;
7839     register IV prevvalue = OOB_UNICODE;
7840     register IV range = 0;
7841     UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
7842     register regnode *ret;
7843     STRLEN numlen;
7844     IV namedclass;
7845     char *rangebegin = NULL;
7846     bool need_class = 0;
7847     SV *listsv = NULL;
7848     UV n;
7849     bool optimize_invert   = TRUE;
7850     AV* unicode_alternate  = NULL;
7851 #ifdef EBCDIC
7852     UV literal_endpoint = 0;
7853 #endif
7854     UV stored = 0;  /* number of chars stored in the class */
7855 
7856     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7857         case we need to change the emitted regop to an EXACT. */
7858     const char * orig_parse = RExC_parse;
7859     GET_RE_DEBUG_FLAGS_DECL;
7860 
7861     PERL_ARGS_ASSERT_REGCLASS;
7862 #ifndef DEBUGGING
7863     PERL_UNUSED_ARG(depth);
7864 #endif
7865 
7866     DEBUG_PARSE("clas");
7867 
7868     /* Assume we are going to generate an ANYOF node. */
7869     ret = reganode(pRExC_state, ANYOF, 0);
7870 
7871     if (!SIZE_ONLY)
7872 	ANYOF_FLAGS(ret) = 0;
7873 
7874     if (UCHARAT(RExC_parse) == '^') {	/* Complement of range. */
7875 	RExC_naughty++;
7876 	RExC_parse++;
7877 	if (!SIZE_ONLY)
7878 	    ANYOF_FLAGS(ret) |= ANYOF_INVERT;
7879     }
7880 
7881     if (SIZE_ONLY) {
7882 	RExC_size += ANYOF_SKIP;
7883 	listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
7884     }
7885     else {
7886  	RExC_emit += ANYOF_SKIP;
7887 	if (FOLD)
7888 	    ANYOF_FLAGS(ret) |= ANYOF_FOLD;
7889 	if (LOC)
7890 	    ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
7891 	ANYOF_BITMAP_ZERO(ret);
7892 	listsv = newSVpvs("# comment\n");
7893     }
7894 
7895     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7896 
7897     if (!SIZE_ONLY && POSIXCC(nextvalue))
7898 	checkposixcc(pRExC_state);
7899 
7900     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
7901     if (UCHARAT(RExC_parse) == ']')
7902 	goto charclassloop;
7903 
7904 parseit:
7905     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
7906 
7907     charclassloop:
7908 
7909 	namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
7910 
7911 	if (!range)
7912 	    rangebegin = RExC_parse;
7913 	if (UTF) {
7914 	    value = utf8n_to_uvchr((U8*)RExC_parse,
7915 				   RExC_end - RExC_parse,
7916 				   &numlen, UTF8_ALLOW_DEFAULT);
7917 	    RExC_parse += numlen;
7918 	}
7919 	else
7920 	    value = UCHARAT(RExC_parse++);
7921 
7922 	nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7923 	if (value == '[' && POSIXCC(nextvalue))
7924 	    namedclass = regpposixcc(pRExC_state, value);
7925 	else if (value == '\\') {
7926 	    if (UTF) {
7927 		value = utf8n_to_uvchr((U8*)RExC_parse,
7928 				   RExC_end - RExC_parse,
7929 				   &numlen, UTF8_ALLOW_DEFAULT);
7930 		RExC_parse += numlen;
7931 	    }
7932 	    else
7933 		value = UCHARAT(RExC_parse++);
7934 	    /* Some compilers cannot handle switching on 64-bit integer
7935 	     * values, therefore value cannot be an UV.  Yes, this will
7936 	     * be a problem later if we want switch on Unicode.
7937 	     * A similar issue a little bit later when switching on
7938 	     * namedclass. --jhi */
7939 	    switch ((I32)value) {
7940 	    case 'w':	namedclass = ANYOF_ALNUM;	break;
7941 	    case 'W':	namedclass = ANYOF_NALNUM;	break;
7942 	    case 's':	namedclass = ANYOF_SPACE;	break;
7943 	    case 'S':	namedclass = ANYOF_NSPACE;	break;
7944 	    case 'd':	namedclass = ANYOF_DIGIT;	break;
7945 	    case 'D':	namedclass = ANYOF_NDIGIT;	break;
7946 	    case 'v':	namedclass = ANYOF_VERTWS;	break;
7947 	    case 'V':	namedclass = ANYOF_NVERTWS;	break;
7948 	    case 'h':	namedclass = ANYOF_HORIZWS;	break;
7949 	    case 'H':	namedclass = ANYOF_NHORIZWS;	break;
7950             case 'N':  /* Handle \N{NAME} in class */
7951                 {
7952                     /* We only pay attention to the first char of
7953                     multichar strings being returned. I kinda wonder
7954                     if this makes sense as it does change the behaviour
7955                     from earlier versions, OTOH that behaviour was broken
7956                     as well. */
7957                     UV v; /* value is register so we cant & it /grrr */
7958                     if (reg_namedseq(pRExC_state, &v)) {
7959                         goto parseit;
7960                     }
7961                     value= v;
7962                 }
7963                 break;
7964 	    case 'p':
7965 	    case 'P':
7966 		{
7967 		char *e;
7968 		if (RExC_parse >= RExC_end)
7969 		    vFAIL2("Empty \\%c{}", (U8)value);
7970 		if (*RExC_parse == '{') {
7971 		    const U8 c = (U8)value;
7972 		    e = strchr(RExC_parse++, '}');
7973                     if (!e)
7974                         vFAIL2("Missing right brace on \\%c{}", c);
7975 		    while (isSPACE(UCHARAT(RExC_parse)))
7976 		        RExC_parse++;
7977                     if (e == RExC_parse)
7978                         vFAIL2("Empty \\%c{}", c);
7979 		    n = e - RExC_parse;
7980 		    while (isSPACE(UCHARAT(RExC_parse + n - 1)))
7981 		        n--;
7982 		}
7983 		else {
7984 		    e = RExC_parse;
7985 		    n = 1;
7986 		}
7987 		if (!SIZE_ONLY) {
7988 		    if (UCHARAT(RExC_parse) == '^') {
7989 			 RExC_parse++;
7990 			 n--;
7991 			 value = value == 'p' ? 'P' : 'p'; /* toggle */
7992 			 while (isSPACE(UCHARAT(RExC_parse))) {
7993 			      RExC_parse++;
7994 			      n--;
7995 			 }
7996 		    }
7997 		    Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
7998 			(value=='p' ? '+' : '!'), (int)n, RExC_parse);
7999 		}
8000 		RExC_parse = e + 1;
8001 		ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8002 		namedclass = ANYOF_MAX;  /* no official name, but it's named */
8003 		}
8004 		break;
8005 	    case 'n':	value = '\n';			break;
8006 	    case 'r':	value = '\r';			break;
8007 	    case 't':	value = '\t';			break;
8008 	    case 'f':	value = '\f';			break;
8009 	    case 'b':	value = '\b';			break;
8010 	    case 'e':	value = ASCII_TO_NATIVE('\033');break;
8011 	    case 'a':	value = ASCII_TO_NATIVE('\007');break;
8012 	    case 'x':
8013 		if (*RExC_parse == '{') {
8014                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8015                         | PERL_SCAN_DISALLOW_PREFIX;
8016 		    char * const e = strchr(RExC_parse++, '}');
8017                     if (!e)
8018                         vFAIL("Missing right brace on \\x{}");
8019 
8020 		    numlen = e - RExC_parse;
8021 		    value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8022 		    RExC_parse = e + 1;
8023 		}
8024 		else {
8025                     I32 flags = PERL_SCAN_DISALLOW_PREFIX;
8026 		    numlen = 2;
8027 		    value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8028 		    RExC_parse += numlen;
8029 		}
8030 		if (PL_encoding && value < 0x100)
8031 		    goto recode_encoding;
8032 		break;
8033 	    case 'c':
8034 		value = UCHARAT(RExC_parse++);
8035 		value = toCTRL(value);
8036 		break;
8037 	    case '0': case '1': case '2': case '3': case '4':
8038 	    case '5': case '6': case '7': case '8': case '9':
8039 		{
8040 		    I32 flags = 0;
8041 		    numlen = 3;
8042 		    value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
8043 		    RExC_parse += numlen;
8044 		    if (PL_encoding && value < 0x100)
8045 			goto recode_encoding;
8046 		    break;
8047 		}
8048 	    recode_encoding:
8049 		{
8050 		    SV* enc = PL_encoding;
8051 		    value = reg_recode((const char)(U8)value, &enc);
8052 		    if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
8053 			vWARN(RExC_parse,
8054 			      "Invalid escape in the specified encoding");
8055 		    break;
8056 		}
8057 	    default:
8058 		if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
8059 		    vWARN2(RExC_parse,
8060 			   "Unrecognized escape \\%c in character class passed through",
8061 			   (int)value);
8062 		break;
8063 	    }
8064 	} /* end of \blah */
8065 #ifdef EBCDIC
8066 	else
8067 	    literal_endpoint++;
8068 #endif
8069 
8070 	if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
8071 
8072 	    if (!SIZE_ONLY && !need_class)
8073 		ANYOF_CLASS_ZERO(ret);
8074 
8075 	    need_class = 1;
8076 
8077 	    /* a bad range like a-\d, a-[:digit:] ? */
8078 	    if (range) {
8079 		if (!SIZE_ONLY) {
8080 		    if (ckWARN(WARN_REGEXP)) {
8081 			const int w =
8082 			    RExC_parse >= rangebegin ?
8083 			    RExC_parse - rangebegin : 0;
8084 			vWARN4(RExC_parse,
8085 			       "False [] range \"%*.*s\"",
8086 			       w, w, rangebegin);
8087 		    }
8088 		    if (prevvalue < 256) {
8089 			ANYOF_BITMAP_SET(ret, prevvalue);
8090 			ANYOF_BITMAP_SET(ret, '-');
8091 		    }
8092 		    else {
8093 			ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8094 			Perl_sv_catpvf(aTHX_ listsv,
8095 				       "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
8096 		    }
8097 		}
8098 
8099 		range = 0; /* this was not a true range */
8100 	    }
8101 
8102 
8103 
8104 	    if (!SIZE_ONLY) {
8105 		const char *what = NULL;
8106 		char yesno = 0;
8107 
8108 	        if (namedclass > OOB_NAMEDCLASS)
8109 		    optimize_invert = FALSE;
8110 		/* Possible truncation here but in some 64-bit environments
8111 		 * the compiler gets heartburn about switch on 64-bit values.
8112 		 * A similar issue a little earlier when switching on value.
8113 		 * --jhi */
8114 		switch ((I32)namedclass) {
8115 
8116 		case _C_C_T_(ALNUMC, isALNUMC(value), POSIX_CC_UNI_NAME("Alnum"));
8117 		case _C_C_T_(ALPHA, isALPHA(value), POSIX_CC_UNI_NAME("Alpha"));
8118 		case _C_C_T_(BLANK, isBLANK(value), POSIX_CC_UNI_NAME("Blank"));
8119 		case _C_C_T_(CNTRL, isCNTRL(value), POSIX_CC_UNI_NAME("Cntrl"));
8120 		case _C_C_T_(GRAPH, isGRAPH(value), POSIX_CC_UNI_NAME("Graph"));
8121 		case _C_C_T_(LOWER, isLOWER(value), POSIX_CC_UNI_NAME("Lower"));
8122 		case _C_C_T_(PRINT, isPRINT(value), POSIX_CC_UNI_NAME("Print"));
8123 		case _C_C_T_(PSXSPC, isPSXSPC(value), POSIX_CC_UNI_NAME("Space"));
8124 		case _C_C_T_(PUNCT, isPUNCT(value), POSIX_CC_UNI_NAME("Punct"));
8125 		case _C_C_T_(UPPER, isUPPER(value), POSIX_CC_UNI_NAME("Upper"));
8126 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
8127 		case _C_C_T_(ALNUM, isALNUM(value), "Word");
8128 		case _C_C_T_(SPACE, isSPACE(value), "SpacePerl");
8129 #else
8130 		case _C_C_T_(SPACE, isSPACE(value), "PerlSpace");
8131 		case _C_C_T_(ALNUM, isALNUM(value), "PerlWord");
8132 #endif
8133 		case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
8134 		case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
8135 		case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
8136 		case ANYOF_ASCII:
8137 		    if (LOC)
8138 			ANYOF_CLASS_SET(ret, ANYOF_ASCII);
8139 		    else {
8140 #ifndef EBCDIC
8141 			for (value = 0; value < 128; value++)
8142 			    ANYOF_BITMAP_SET(ret, value);
8143 #else  /* EBCDIC */
8144 			for (value = 0; value < 256; value++) {
8145 			    if (isASCII(value))
8146 			        ANYOF_BITMAP_SET(ret, value);
8147 			}
8148 #endif /* EBCDIC */
8149 		    }
8150 		    yesno = '+';
8151 		    what = "ASCII";
8152 		    break;
8153 		case ANYOF_NASCII:
8154 		    if (LOC)
8155 			ANYOF_CLASS_SET(ret, ANYOF_NASCII);
8156 		    else {
8157 #ifndef EBCDIC
8158 			for (value = 128; value < 256; value++)
8159 			    ANYOF_BITMAP_SET(ret, value);
8160 #else  /* EBCDIC */
8161 			for (value = 0; value < 256; value++) {
8162 			    if (!isASCII(value))
8163 			        ANYOF_BITMAP_SET(ret, value);
8164 			}
8165 #endif /* EBCDIC */
8166 		    }
8167 		    yesno = '!';
8168 		    what = "ASCII";
8169 		    break;
8170 		case ANYOF_DIGIT:
8171 		    if (LOC)
8172 			ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
8173 		    else {
8174 			/* consecutive digits assumed */
8175 			for (value = '0'; value <= '9'; value++)
8176 			    ANYOF_BITMAP_SET(ret, value);
8177 		    }
8178 		    yesno = '+';
8179 		    what = POSIX_CC_UNI_NAME("Digit");
8180 		    break;
8181 		case ANYOF_NDIGIT:
8182 		    if (LOC)
8183 			ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
8184 		    else {
8185 			/* consecutive digits assumed */
8186 			for (value = 0; value < '0'; value++)
8187 			    ANYOF_BITMAP_SET(ret, value);
8188 			for (value = '9' + 1; value < 256; value++)
8189 			    ANYOF_BITMAP_SET(ret, value);
8190 		    }
8191 		    yesno = '!';
8192 		    what = POSIX_CC_UNI_NAME("Digit");
8193 		    break;
8194 		case ANYOF_MAX:
8195 		    /* this is to handle \p and \P */
8196 		    break;
8197 		default:
8198 		    vFAIL("Invalid [::] class");
8199 		    break;
8200 		}
8201 		if (what) {
8202 		    /* Strings such as "+utf8::isWord\n" */
8203 		    Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
8204 		}
8205 		if (LOC)
8206 		    ANYOF_FLAGS(ret) |= ANYOF_CLASS;
8207 		continue;
8208 	    }
8209 	} /* end of namedclass \blah */
8210 
8211 	if (range) {
8212 	    if (prevvalue > (IV)value) /* b-a */ {
8213 		const int w = RExC_parse - rangebegin;
8214 		Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
8215 		range = 0; /* not a valid range */
8216 	    }
8217 	}
8218 	else {
8219 	    prevvalue = value; /* save the beginning of the range */
8220 	    if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
8221 		RExC_parse[1] != ']') {
8222 		RExC_parse++;
8223 
8224 		/* a bad range like \w-, [:word:]- ? */
8225 		if (namedclass > OOB_NAMEDCLASS) {
8226 		    if (ckWARN(WARN_REGEXP)) {
8227 			const int w =
8228 			    RExC_parse >= rangebegin ?
8229 			    RExC_parse - rangebegin : 0;
8230 			vWARN4(RExC_parse,
8231 			       "False [] range \"%*.*s\"",
8232 			       w, w, rangebegin);
8233 		    }
8234 		    if (!SIZE_ONLY)
8235 			ANYOF_BITMAP_SET(ret, '-');
8236 		} else
8237 		    range = 1;	/* yeah, it's a range! */
8238 		continue;	/* but do it the next time */
8239 	    }
8240 	}
8241 
8242 	/* now is the next time */
8243         /*stored += (value - prevvalue + 1);*/
8244 	if (!SIZE_ONLY) {
8245 	    if (prevvalue < 256) {
8246 	        const IV ceilvalue = value < 256 ? value : 255;
8247 		IV i;
8248 #ifdef EBCDIC
8249 		/* In EBCDIC [\x89-\x91] should include
8250 		 * the \x8e but [i-j] should not. */
8251 		if (literal_endpoint == 2 &&
8252 		    ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
8253 		     (isUPPER(prevvalue) && isUPPER(ceilvalue))))
8254 		{
8255 		    if (isLOWER(prevvalue)) {
8256 			for (i = prevvalue; i <= ceilvalue; i++)
8257 			    if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8258 				stored++;
8259 				ANYOF_BITMAP_SET(ret, i);
8260 			    }
8261 		    } else {
8262 			for (i = prevvalue; i <= ceilvalue; i++)
8263 			    if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8264 				stored++;
8265 				ANYOF_BITMAP_SET(ret, i);
8266 			    }
8267 		    }
8268 		}
8269 		else
8270 #endif
8271 		      for (i = prevvalue; i <= ceilvalue; i++) {
8272 		        if (!ANYOF_BITMAP_TEST(ret,i)) {
8273 		            stored++;
8274 			    ANYOF_BITMAP_SET(ret, i);
8275 		        }
8276 	              }
8277 	  }
8278 	  if (value > 255 || UTF) {
8279 	        const UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
8280 		const UV natvalue      = NATIVE_TO_UNI(value);
8281                 stored+=2; /* can't optimize this class */
8282 		ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8283 		if (prevnatvalue < natvalue) { /* what about > ? */
8284 		    Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
8285 				   prevnatvalue, natvalue);
8286 		}
8287 		else if (prevnatvalue == natvalue) {
8288 		    Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
8289 		    if (FOLD) {
8290 			 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
8291 			 STRLEN foldlen;
8292 			 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
8293 
8294 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */
8295 			 if (RExC_precomp[0] == ':' &&
8296 			     RExC_precomp[1] == '[' &&
8297 			     (f == 0xDF || f == 0x92)) {
8298 			     f = NATIVE_TO_UNI(f);
8299                         }
8300 #endif
8301 			 /* If folding and foldable and a single
8302 			  * character, insert also the folded version
8303 			  * to the charclass. */
8304 			 if (f != value) {
8305 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
8306 			     if ((RExC_precomp[0] == ':' &&
8307 				  RExC_precomp[1] == '[' &&
8308 				  (f == 0xA2 &&
8309 				   (value == 0xFB05 || value == 0xFB06))) ?
8310 				 foldlen == ((STRLEN)UNISKIP(f) - 1) :
8311 				 foldlen == (STRLEN)UNISKIP(f) )
8312 #else
8313 			      if (foldlen == (STRLEN)UNISKIP(f))
8314 #endif
8315 				  Perl_sv_catpvf(aTHX_ listsv,
8316 						 "%04"UVxf"\n", f);
8317 			      else {
8318 				  /* Any multicharacter foldings
8319 				   * require the following transform:
8320 				   * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
8321 				   * where E folds into "pq" and F folds
8322 				   * into "rst", all other characters
8323 				   * fold to single characters.  We save
8324 				   * away these multicharacter foldings,
8325 				   * to be later saved as part of the
8326 				   * additional "s" data. */
8327 				  SV *sv;
8328 
8329 				  if (!unicode_alternate)
8330 				      unicode_alternate = newAV();
8331 				  sv = newSVpvn_utf8((char*)foldbuf, foldlen,
8332 						     TRUE);
8333 				  av_push(unicode_alternate, sv);
8334 			      }
8335 			 }
8336 
8337 			 /* If folding and the value is one of the Greek
8338 			  * sigmas insert a few more sigmas to make the
8339 			  * folding rules of the sigmas to work right.
8340 			  * Note that not all the possible combinations
8341 			  * are handled here: some of them are handled
8342 			  * by the standard folding rules, and some of
8343 			  * them (literal or EXACTF cases) are handled
8344 			  * during runtime in regexec.c:S_find_byclass(). */
8345 			 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
8346 			      Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8347 					     (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
8348 			      Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8349 					     (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8350 			 }
8351 			 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
8352 			      Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8353 					     (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8354 		    }
8355 		}
8356 	    }
8357 #ifdef EBCDIC
8358 	    literal_endpoint = 0;
8359 #endif
8360         }
8361 
8362 	range = 0; /* this range (if it was one) is done now */
8363     }
8364 
8365     if (need_class) {
8366 	ANYOF_FLAGS(ret) |= ANYOF_LARGE;
8367 	if (SIZE_ONLY)
8368 	    RExC_size += ANYOF_CLASS_ADD_SKIP;
8369 	else
8370 	    RExC_emit += ANYOF_CLASS_ADD_SKIP;
8371     }
8372 
8373 
8374     if (SIZE_ONLY)
8375         return ret;
8376     /****** !SIZE_ONLY AFTER HERE *********/
8377 
8378     if( stored == 1 && (value < 128 || (value < 256 && !UTF))
8379         && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
8380     ) {
8381         /* optimize single char class to an EXACT node
8382            but *only* when its not a UTF/high char  */
8383         const char * cur_parse= RExC_parse;
8384         RExC_emit = (regnode *)orig_emit;
8385         RExC_parse = (char *)orig_parse;
8386         ret = reg_node(pRExC_state,
8387                        (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
8388         RExC_parse = (char *)cur_parse;
8389         *STRING(ret)= (char)value;
8390         STR_LEN(ret)= 1;
8391         RExC_emit += STR_SZ(1);
8392 	if (listsv) {
8393 	    SvREFCNT_dec(listsv);
8394 	}
8395         return ret;
8396     }
8397     /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
8398     if ( /* If the only flag is folding (plus possibly inversion). */
8399 	((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
8400        ) {
8401 	for (value = 0; value < 256; ++value) {
8402 	    if (ANYOF_BITMAP_TEST(ret, value)) {
8403 		UV fold = PL_fold[value];
8404 
8405 		if (fold != value)
8406 		    ANYOF_BITMAP_SET(ret, fold);
8407 	    }
8408 	}
8409 	ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
8410     }
8411 
8412     /* optimize inverted simple patterns (e.g. [^a-z]) */
8413     if (optimize_invert &&
8414 	/* If the only flag is inversion. */
8415 	(ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) ==	ANYOF_INVERT) {
8416 	for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
8417 	    ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
8418 	ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
8419     }
8420     {
8421 	AV * const av = newAV();
8422 	SV *rv;
8423 	/* The 0th element stores the character class description
8424 	 * in its textual form: used later (regexec.c:Perl_regclass_swash())
8425 	 * to initialize the appropriate swash (which gets stored in
8426 	 * the 1st element), and also useful for dumping the regnode.
8427 	 * The 2nd element stores the multicharacter foldings,
8428 	 * used later (regexec.c:S_reginclass()). */
8429 	av_store(av, 0, listsv);
8430 	av_store(av, 1, NULL);
8431 	av_store(av, 2, MUTABLE_SV(unicode_alternate));
8432 	rv = newRV_noinc(MUTABLE_SV(av));
8433 	n = add_data(pRExC_state, 1, "s");
8434 	RExC_rxi->data->data[n] = (void*)rv;
8435 	ARG_SET(ret, n);
8436     }
8437     return ret;
8438 }
8439 #undef _C_C_T_
8440 
8441 
8442 /* reg_skipcomment()
8443 
8444    Absorbs an /x style # comments from the input stream.
8445    Returns true if there is more text remaining in the stream.
8446    Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
8447    terminates the pattern without including a newline.
8448 
8449    Note its the callers responsibility to ensure that we are
8450    actually in /x mode
8451 
8452 */
8453 
8454 STATIC bool
8455 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
8456 {
8457     bool ended = 0;
8458 
8459     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
8460 
8461     while (RExC_parse < RExC_end)
8462         if (*RExC_parse++ == '\n') {
8463             ended = 1;
8464             break;
8465         }
8466     if (!ended) {
8467         /* we ran off the end of the pattern without ending
8468            the comment, so we have to add an \n when wrapping */
8469         RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
8470         return 0;
8471     } else
8472         return 1;
8473 }
8474 
8475 /* nextchar()
8476 
8477    Advance that parse position, and optionall absorbs
8478    "whitespace" from the inputstream.
8479 
8480    Without /x "whitespace" means (?#...) style comments only,
8481    with /x this means (?#...) and # comments and whitespace proper.
8482 
8483    Returns the RExC_parse point from BEFORE the scan occurs.
8484 
8485    This is the /x friendly way of saying RExC_parse++.
8486 */
8487 
8488 STATIC char*
8489 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
8490 {
8491     char* const retval = RExC_parse++;
8492 
8493     PERL_ARGS_ASSERT_NEXTCHAR;
8494 
8495     for (;;) {
8496 	if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
8497 		RExC_parse[2] == '#') {
8498 	    while (*RExC_parse != ')') {
8499 		if (RExC_parse == RExC_end)
8500 		    FAIL("Sequence (?#... not terminated");
8501 		RExC_parse++;
8502 	    }
8503 	    RExC_parse++;
8504 	    continue;
8505 	}
8506 	if (RExC_flags & RXf_PMf_EXTENDED) {
8507 	    if (isSPACE(*RExC_parse)) {
8508 		RExC_parse++;
8509 		continue;
8510 	    }
8511 	    else if (*RExC_parse == '#') {
8512 	        if ( reg_skipcomment( pRExC_state ) )
8513 	            continue;
8514 	    }
8515 	}
8516 	return retval;
8517     }
8518 }
8519 
8520 /*
8521 - reg_node - emit a node
8522 */
8523 STATIC regnode *			/* Location. */
8524 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
8525 {
8526     dVAR;
8527     register regnode *ptr;
8528     regnode * const ret = RExC_emit;
8529     GET_RE_DEBUG_FLAGS_DECL;
8530 
8531     PERL_ARGS_ASSERT_REG_NODE;
8532 
8533     if (SIZE_ONLY) {
8534 	SIZE_ALIGN(RExC_size);
8535 	RExC_size += 1;
8536 	return(ret);
8537     }
8538     if (RExC_emit >= RExC_emit_bound)
8539         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8540 
8541     NODE_ALIGN_FILL(ret);
8542     ptr = ret;
8543     FILL_ADVANCE_NODE(ptr, op);
8544 #ifdef RE_TRACK_PATTERN_OFFSETS
8545     if (RExC_offsets) {         /* MJD */
8546 	MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
8547               "reg_node", __LINE__,
8548               PL_reg_name[op],
8549               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
8550 		? "Overwriting end of array!\n" : "OK",
8551               (UV)(RExC_emit - RExC_emit_start),
8552               (UV)(RExC_parse - RExC_start),
8553               (UV)RExC_offsets[0]));
8554 	Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
8555     }
8556 #endif
8557     RExC_emit = ptr;
8558     return(ret);
8559 }
8560 
8561 /*
8562 - reganode - emit a node with an argument
8563 */
8564 STATIC regnode *			/* Location. */
8565 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
8566 {
8567     dVAR;
8568     register regnode *ptr;
8569     regnode * const ret = RExC_emit;
8570     GET_RE_DEBUG_FLAGS_DECL;
8571 
8572     PERL_ARGS_ASSERT_REGANODE;
8573 
8574     if (SIZE_ONLY) {
8575 	SIZE_ALIGN(RExC_size);
8576 	RExC_size += 2;
8577 	/*
8578 	   We can't do this:
8579 
8580 	   assert(2==regarglen[op]+1);
8581 
8582 	   Anything larger than this has to allocate the extra amount.
8583 	   If we changed this to be:
8584 
8585 	   RExC_size += (1 + regarglen[op]);
8586 
8587 	   then it wouldn't matter. Its not clear what side effect
8588 	   might come from that so its not done so far.
8589 	   -- dmq
8590 	*/
8591 	return(ret);
8592     }
8593     if (RExC_emit >= RExC_emit_bound)
8594         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8595 
8596     NODE_ALIGN_FILL(ret);
8597     ptr = ret;
8598     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
8599 #ifdef RE_TRACK_PATTERN_OFFSETS
8600     if (RExC_offsets) {         /* MJD */
8601 	MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
8602               "reganode",
8603 	      __LINE__,
8604 	      PL_reg_name[op],
8605               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
8606               "Overwriting end of array!\n" : "OK",
8607               (UV)(RExC_emit - RExC_emit_start),
8608               (UV)(RExC_parse - RExC_start),
8609               (UV)RExC_offsets[0]));
8610 	Set_Cur_Node_Offset;
8611     }
8612 #endif
8613     RExC_emit = ptr;
8614     return(ret);
8615 }
8616 
8617 /*
8618 - reguni - emit (if appropriate) a Unicode character
8619 */
8620 STATIC STRLEN
8621 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
8622 {
8623     dVAR;
8624 
8625     PERL_ARGS_ASSERT_REGUNI;
8626 
8627     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
8628 }
8629 
8630 /*
8631 - reginsert - insert an operator in front of already-emitted operand
8632 *
8633 * Means relocating the operand.
8634 */
8635 STATIC void
8636 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
8637 {
8638     dVAR;
8639     register regnode *src;
8640     register regnode *dst;
8641     register regnode *place;
8642     const int offset = regarglen[(U8)op];
8643     const int size = NODE_STEP_REGNODE + offset;
8644     GET_RE_DEBUG_FLAGS_DECL;
8645 
8646     PERL_ARGS_ASSERT_REGINSERT;
8647     PERL_UNUSED_ARG(depth);
8648 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
8649     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
8650     if (SIZE_ONLY) {
8651 	RExC_size += size;
8652 	return;
8653     }
8654 
8655     src = RExC_emit;
8656     RExC_emit += size;
8657     dst = RExC_emit;
8658     if (RExC_open_parens) {
8659         int paren;
8660         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
8661         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
8662             if ( RExC_open_parens[paren] >= opnd ) {
8663                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
8664                 RExC_open_parens[paren] += size;
8665             } else {
8666                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
8667             }
8668             if ( RExC_close_parens[paren] >= opnd ) {
8669                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
8670                 RExC_close_parens[paren] += size;
8671             } else {
8672                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
8673             }
8674         }
8675     }
8676 
8677     while (src > opnd) {
8678 	StructCopy(--src, --dst, regnode);
8679 #ifdef RE_TRACK_PATTERN_OFFSETS
8680         if (RExC_offsets) {     /* MJD 20010112 */
8681 	    MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
8682                   "reg_insert",
8683 		  __LINE__,
8684 		  PL_reg_name[op],
8685                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
8686 		    ? "Overwriting end of array!\n" : "OK",
8687                   (UV)(src - RExC_emit_start),
8688                   (UV)(dst - RExC_emit_start),
8689                   (UV)RExC_offsets[0]));
8690 	    Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
8691 	    Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
8692         }
8693 #endif
8694     }
8695 
8696 
8697     place = opnd;		/* Op node, where operand used to be. */
8698 #ifdef RE_TRACK_PATTERN_OFFSETS
8699     if (RExC_offsets) {         /* MJD */
8700 	MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
8701               "reginsert",
8702 	      __LINE__,
8703 	      PL_reg_name[op],
8704               (UV)(place - RExC_emit_start) > RExC_offsets[0]
8705               ? "Overwriting end of array!\n" : "OK",
8706               (UV)(place - RExC_emit_start),
8707               (UV)(RExC_parse - RExC_start),
8708               (UV)RExC_offsets[0]));
8709 	Set_Node_Offset(place, RExC_parse);
8710 	Set_Node_Length(place, 1);
8711     }
8712 #endif
8713     src = NEXTOPER(place);
8714     FILL_ADVANCE_NODE(place, op);
8715     Zero(src, offset, regnode);
8716 }
8717 
8718 /*
8719 - regtail - set the next-pointer at the end of a node chain of p to val.
8720 - SEE ALSO: regtail_study
8721 */
8722 /* TODO: All three parms should be const */
8723 STATIC void
8724 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8725 {
8726     dVAR;
8727     register regnode *scan;
8728     GET_RE_DEBUG_FLAGS_DECL;
8729 
8730     PERL_ARGS_ASSERT_REGTAIL;
8731 #ifndef DEBUGGING
8732     PERL_UNUSED_ARG(depth);
8733 #endif
8734 
8735     if (SIZE_ONLY)
8736 	return;
8737 
8738     /* Find last node. */
8739     scan = p;
8740     for (;;) {
8741 	regnode * const temp = regnext(scan);
8742         DEBUG_PARSE_r({
8743             SV * const mysv=sv_newmortal();
8744             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
8745             regprop(RExC_rx, mysv, scan);
8746             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
8747                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
8748                     (temp == NULL ? "->" : ""),
8749                     (temp == NULL ? PL_reg_name[OP(val)] : "")
8750             );
8751         });
8752         if (temp == NULL)
8753             break;
8754         scan = temp;
8755     }
8756 
8757     if (reg_off_by_arg[OP(scan)]) {
8758         ARG_SET(scan, val - scan);
8759     }
8760     else {
8761         NEXT_OFF(scan) = val - scan;
8762     }
8763 }
8764 
8765 #ifdef DEBUGGING
8766 /*
8767 - regtail_study - set the next-pointer at the end of a node chain of p to val.
8768 - Look for optimizable sequences at the same time.
8769 - currently only looks for EXACT chains.
8770 
8771 This is expermental code. The idea is to use this routine to perform
8772 in place optimizations on branches and groups as they are constructed,
8773 with the long term intention of removing optimization from study_chunk so
8774 that it is purely analytical.
8775 
8776 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
8777 to control which is which.
8778 
8779 */
8780 /* TODO: All four parms should be const */
8781 
8782 STATIC U8
8783 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8784 {
8785     dVAR;
8786     register regnode *scan;
8787     U8 exact = PSEUDO;
8788 #ifdef EXPERIMENTAL_INPLACESCAN
8789     I32 min = 0;
8790 #endif
8791     GET_RE_DEBUG_FLAGS_DECL;
8792 
8793     PERL_ARGS_ASSERT_REGTAIL_STUDY;
8794 
8795 
8796     if (SIZE_ONLY)
8797         return exact;
8798 
8799     /* Find last node. */
8800 
8801     scan = p;
8802     for (;;) {
8803         regnode * const temp = regnext(scan);
8804 #ifdef EXPERIMENTAL_INPLACESCAN
8805         if (PL_regkind[OP(scan)] == EXACT)
8806             if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
8807                 return EXACT;
8808 #endif
8809         if ( exact ) {
8810             switch (OP(scan)) {
8811                 case EXACT:
8812                 case EXACTF:
8813                 case EXACTFL:
8814                         if( exact == PSEUDO )
8815                             exact= OP(scan);
8816                         else if ( exact != OP(scan) )
8817                             exact= 0;
8818                 case NOTHING:
8819                     break;
8820                 default:
8821                     exact= 0;
8822             }
8823         }
8824         DEBUG_PARSE_r({
8825             SV * const mysv=sv_newmortal();
8826             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
8827             regprop(RExC_rx, mysv, scan);
8828             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
8829                 SvPV_nolen_const(mysv),
8830                 REG_NODE_NUM(scan),
8831                 PL_reg_name[exact]);
8832         });
8833 	if (temp == NULL)
8834 	    break;
8835 	scan = temp;
8836     }
8837     DEBUG_PARSE_r({
8838         SV * const mysv_val=sv_newmortal();
8839         DEBUG_PARSE_MSG("");
8840         regprop(RExC_rx, mysv_val, val);
8841         PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
8842 		      SvPV_nolen_const(mysv_val),
8843 		      (IV)REG_NODE_NUM(val),
8844 		      (IV)(val - scan)
8845         );
8846     });
8847     if (reg_off_by_arg[OP(scan)]) {
8848 	ARG_SET(scan, val - scan);
8849     }
8850     else {
8851 	NEXT_OFF(scan) = val - scan;
8852     }
8853 
8854     return exact;
8855 }
8856 #endif
8857 
8858 /*
8859  - regcurly - a little FSA that accepts {\d+,?\d*}
8860  */
8861 STATIC I32
8862 S_regcurly(register const char *s)
8863 {
8864     PERL_ARGS_ASSERT_REGCURLY;
8865 
8866     if (*s++ != '{')
8867 	return FALSE;
8868     if (!isDIGIT(*s))
8869 	return FALSE;
8870     while (isDIGIT(*s))
8871 	s++;
8872     if (*s == ',')
8873 	s++;
8874     while (isDIGIT(*s))
8875 	s++;
8876     if (*s != '}')
8877 	return FALSE;
8878     return TRUE;
8879 }
8880 
8881 
8882 /*
8883  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
8884  */
8885 #ifdef DEBUGGING
8886 void
8887 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
8888 {
8889     int bit;
8890     int set=0;
8891 
8892     for (bit=0; bit<32; bit++) {
8893         if (flags & (1<<bit)) {
8894             if (!set++ && lead)
8895                 PerlIO_printf(Perl_debug_log, "%s",lead);
8896             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
8897         }
8898     }
8899     if (lead)  {
8900         if (set)
8901             PerlIO_printf(Perl_debug_log, "\n");
8902         else
8903             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
8904     }
8905 }
8906 #endif
8907 
8908 void
8909 Perl_regdump(pTHX_ const regexp *r)
8910 {
8911 #ifdef DEBUGGING
8912     dVAR;
8913     SV * const sv = sv_newmortal();
8914     SV *dsv= sv_newmortal();
8915     RXi_GET_DECL(r,ri);
8916     GET_RE_DEBUG_FLAGS_DECL;
8917 
8918     PERL_ARGS_ASSERT_REGDUMP;
8919 
8920     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
8921 
8922     /* Header fields of interest. */
8923     if (r->anchored_substr) {
8924 	RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
8925 	    RE_SV_DUMPLEN(r->anchored_substr), 30);
8926 	PerlIO_printf(Perl_debug_log,
8927 		      "anchored %s%s at %"IVdf" ",
8928 		      s, RE_SV_TAIL(r->anchored_substr),
8929 		      (IV)r->anchored_offset);
8930     } else if (r->anchored_utf8) {
8931 	RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
8932 	    RE_SV_DUMPLEN(r->anchored_utf8), 30);
8933 	PerlIO_printf(Perl_debug_log,
8934 		      "anchored utf8 %s%s at %"IVdf" ",
8935 		      s, RE_SV_TAIL(r->anchored_utf8),
8936 		      (IV)r->anchored_offset);
8937     }
8938     if (r->float_substr) {
8939 	RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
8940 	    RE_SV_DUMPLEN(r->float_substr), 30);
8941 	PerlIO_printf(Perl_debug_log,
8942 		      "floating %s%s at %"IVdf"..%"UVuf" ",
8943 		      s, RE_SV_TAIL(r->float_substr),
8944 		      (IV)r->float_min_offset, (UV)r->float_max_offset);
8945     } else if (r->float_utf8) {
8946 	RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
8947 	    RE_SV_DUMPLEN(r->float_utf8), 30);
8948 	PerlIO_printf(Perl_debug_log,
8949 		      "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
8950 		      s, RE_SV_TAIL(r->float_utf8),
8951 		      (IV)r->float_min_offset, (UV)r->float_max_offset);
8952     }
8953     if (r->check_substr || r->check_utf8)
8954 	PerlIO_printf(Perl_debug_log,
8955 		      (const char *)
8956 		      (r->check_substr == r->float_substr
8957 		       && r->check_utf8 == r->float_utf8
8958 		       ? "(checking floating" : "(checking anchored"));
8959     if (r->extflags & RXf_NOSCAN)
8960 	PerlIO_printf(Perl_debug_log, " noscan");
8961     if (r->extflags & RXf_CHECK_ALL)
8962 	PerlIO_printf(Perl_debug_log, " isall");
8963     if (r->check_substr || r->check_utf8)
8964 	PerlIO_printf(Perl_debug_log, ") ");
8965 
8966     if (ri->regstclass) {
8967 	regprop(r, sv, ri->regstclass);
8968 	PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
8969     }
8970     if (r->extflags & RXf_ANCH) {
8971 	PerlIO_printf(Perl_debug_log, "anchored");
8972 	if (r->extflags & RXf_ANCH_BOL)
8973 	    PerlIO_printf(Perl_debug_log, "(BOL)");
8974 	if (r->extflags & RXf_ANCH_MBOL)
8975 	    PerlIO_printf(Perl_debug_log, "(MBOL)");
8976 	if (r->extflags & RXf_ANCH_SBOL)
8977 	    PerlIO_printf(Perl_debug_log, "(SBOL)");
8978 	if (r->extflags & RXf_ANCH_GPOS)
8979 	    PerlIO_printf(Perl_debug_log, "(GPOS)");
8980 	PerlIO_putc(Perl_debug_log, ' ');
8981     }
8982     if (r->extflags & RXf_GPOS_SEEN)
8983 	PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
8984     if (r->intflags & PREGf_SKIP)
8985 	PerlIO_printf(Perl_debug_log, "plus ");
8986     if (r->intflags & PREGf_IMPLICIT)
8987 	PerlIO_printf(Perl_debug_log, "implicit ");
8988     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
8989     if (r->extflags & RXf_EVAL_SEEN)
8990 	PerlIO_printf(Perl_debug_log, "with eval ");
8991     PerlIO_printf(Perl_debug_log, "\n");
8992     DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
8993 #else
8994     PERL_ARGS_ASSERT_REGDUMP;
8995     PERL_UNUSED_CONTEXT;
8996     PERL_UNUSED_ARG(r);
8997 #endif	/* DEBUGGING */
8998 }
8999 
9000 /*
9001 - regprop - printable representation of opcode
9002 */
9003 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
9004 STMT_START { \
9005         if (do_sep) {                           \
9006             Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
9007             if (flags & ANYOF_INVERT)           \
9008                 /*make sure the invert info is in each */ \
9009                 sv_catpvs(sv, "^");             \
9010             do_sep = 0;                         \
9011         }                                       \
9012 } STMT_END
9013 
9014 void
9015 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
9016 {
9017 #ifdef DEBUGGING
9018     dVAR;
9019     register int k;
9020     RXi_GET_DECL(prog,progi);
9021     GET_RE_DEBUG_FLAGS_DECL;
9022 
9023     PERL_ARGS_ASSERT_REGPROP;
9024 
9025     sv_setpvs(sv, "");
9026 
9027     if (OP(o) > REGNODE_MAX)		/* regnode.type is unsigned */
9028 	/* It would be nice to FAIL() here, but this may be called from
9029 	   regexec.c, and it would be hard to supply pRExC_state. */
9030 	Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
9031     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
9032 
9033     k = PL_regkind[OP(o)];
9034 
9035     if (k == EXACT) {
9036 	sv_catpvs(sv, " ");
9037 	/* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
9038 	 * is a crude hack but it may be the best for now since
9039 	 * we have no flag "this EXACTish node was UTF-8"
9040 	 * --jhi */
9041 	pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
9042 		  PERL_PV_ESCAPE_UNI_DETECT |
9043 		  PERL_PV_PRETTY_ELLIPSES   |
9044 		  PERL_PV_PRETTY_LTGT       |
9045 		  PERL_PV_PRETTY_NOCLEAR
9046 		  );
9047     } else if (k == TRIE) {
9048 	/* print the details of the trie in dumpuntil instead, as
9049 	 * progi->data isn't available here */
9050         const char op = OP(o);
9051         const U32 n = ARG(o);
9052         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
9053                (reg_ac_data *)progi->data->data[n] :
9054                NULL;
9055         const reg_trie_data * const trie
9056 	    = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
9057 
9058         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
9059         DEBUG_TRIE_COMPILE_r(
9060             Perl_sv_catpvf(aTHX_ sv,
9061                 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
9062                 (UV)trie->startstate,
9063                 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
9064                 (UV)trie->wordcount,
9065                 (UV)trie->minlen,
9066                 (UV)trie->maxlen,
9067                 (UV)TRIE_CHARCOUNT(trie),
9068                 (UV)trie->uniquecharcount
9069             )
9070         );
9071         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
9072             int i;
9073             int rangestart = -1;
9074             U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
9075             sv_catpvs(sv, "[");
9076             for (i = 0; i <= 256; i++) {
9077                 if (i < 256 && BITMAP_TEST(bitmap,i)) {
9078                     if (rangestart == -1)
9079                         rangestart = i;
9080                 } else if (rangestart != -1) {
9081                     if (i <= rangestart + 3)
9082                         for (; rangestart < i; rangestart++)
9083                             put_byte(sv, rangestart);
9084                     else {
9085                         put_byte(sv, rangestart);
9086                         sv_catpvs(sv, "-");
9087                         put_byte(sv, i - 1);
9088                     }
9089                     rangestart = -1;
9090                 }
9091             }
9092             sv_catpvs(sv, "]");
9093         }
9094 
9095     } else if (k == CURLY) {
9096 	if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
9097 	    Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
9098 	Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
9099     }
9100     else if (k == WHILEM && o->flags)			/* Ordinal/of */
9101 	Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9102     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
9103 	Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));	/* Parenth number */
9104 	if ( RXp_PAREN_NAMES(prog) ) {
9105             if ( k != REF || OP(o) < NREF) {
9106 	        AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
9107 	        SV **name= av_fetch(list, ARG(o), 0 );
9108 	        if (name)
9109 	            Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9110             }
9111             else {
9112                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
9113                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
9114                 I32 *nums=(I32*)SvPVX(sv_dat);
9115                 SV **name= av_fetch(list, nums[0], 0 );
9116                 I32 n;
9117                 if (name) {
9118                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
9119                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
9120 			   	    (n ? "," : ""), (IV)nums[n]);
9121                     }
9122                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9123                 }
9124             }
9125         }
9126     } else if (k == GOSUB)
9127 	Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o));	/* Paren and offset */
9128     else if (k == VERB) {
9129         if (!o->flags)
9130             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
9131 			   SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
9132     } else if (k == LOGICAL)
9133 	Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);	/* 2: embedded, otherwise 1 */
9134     else if (k == FOLDCHAR)
9135 	Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
9136     else if (k == ANYOF) {
9137 	int i, rangestart = -1;
9138 	const U8 flags = ANYOF_FLAGS(o);
9139 	int do_sep = 0;
9140 
9141 	/* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
9142 	static const char * const anyofs[] = {
9143 	    "\\w",
9144 	    "\\W",
9145 	    "\\s",
9146 	    "\\S",
9147 	    "\\d",
9148 	    "\\D",
9149 	    "[:alnum:]",
9150 	    "[:^alnum:]",
9151 	    "[:alpha:]",
9152 	    "[:^alpha:]",
9153 	    "[:ascii:]",
9154 	    "[:^ascii:]",
9155 	    "[:cntrl:]",
9156 	    "[:^cntrl:]",
9157 	    "[:graph:]",
9158 	    "[:^graph:]",
9159 	    "[:lower:]",
9160 	    "[:^lower:]",
9161 	    "[:print:]",
9162 	    "[:^print:]",
9163 	    "[:punct:]",
9164 	    "[:^punct:]",
9165 	    "[:upper:]",
9166 	    "[:^upper:]",
9167 	    "[:xdigit:]",
9168 	    "[:^xdigit:]",
9169 	    "[:space:]",
9170 	    "[:^space:]",
9171 	    "[:blank:]",
9172 	    "[:^blank:]"
9173 	};
9174 
9175 	if (flags & ANYOF_LOCALE)
9176 	    sv_catpvs(sv, "{loc}");
9177 	if (flags & ANYOF_FOLD)
9178 	    sv_catpvs(sv, "{i}");
9179 	Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
9180 	if (flags & ANYOF_INVERT)
9181 	    sv_catpvs(sv, "^");
9182 
9183 	/* output what the standard cp 0-255 bitmap matches */
9184 	for (i = 0; i <= 256; i++) {
9185 	    if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
9186 		if (rangestart == -1)
9187 		    rangestart = i;
9188 	    } else if (rangestart != -1) {
9189 		if (i <= rangestart + 3)
9190 		    for (; rangestart < i; rangestart++)
9191 			put_byte(sv, rangestart);
9192 		else {
9193 		    put_byte(sv, rangestart);
9194 		    sv_catpvs(sv, "-");
9195 		    put_byte(sv, i - 1);
9196 		}
9197 		do_sep = 1;
9198 		rangestart = -1;
9199 	    }
9200 	}
9201 
9202         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9203         /* output any special charclass tests (used mostly under use locale) */
9204 	if (o->flags & ANYOF_CLASS)
9205 	    for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
9206 		if (ANYOF_CLASS_TEST(o,i)) {
9207 		    sv_catpv(sv, anyofs[i]);
9208 		    do_sep = 1;
9209 		}
9210 
9211         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9212 
9213         /* output information about the unicode matching */
9214 	if (flags & ANYOF_UNICODE)
9215 	    sv_catpvs(sv, "{unicode}");
9216 	else if (flags & ANYOF_UNICODE_ALL)
9217 	    sv_catpvs(sv, "{unicode_all}");
9218 
9219 	{
9220 	    SV *lv;
9221 	    SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
9222 
9223 	    if (lv) {
9224 		if (sw) {
9225 		    U8 s[UTF8_MAXBYTES_CASE+1];
9226 
9227 		    for (i = 0; i <= 256; i++) { /* just the first 256 */
9228 			uvchr_to_utf8(s, i);
9229 
9230 			if (i < 256 && swash_fetch(sw, s, TRUE)) {
9231 			    if (rangestart == -1)
9232 				rangestart = i;
9233 			} else if (rangestart != -1) {
9234 			    if (i <= rangestart + 3)
9235 				for (; rangestart < i; rangestart++) {
9236 				    const U8 * const e = uvchr_to_utf8(s,rangestart);
9237 				    U8 *p;
9238 				    for(p = s; p < e; p++)
9239 					put_byte(sv, *p);
9240 				}
9241 			    else {
9242 				const U8 *e = uvchr_to_utf8(s,rangestart);
9243 				U8 *p;
9244 				for (p = s; p < e; p++)
9245 				    put_byte(sv, *p);
9246 				sv_catpvs(sv, "-");
9247 				e = uvchr_to_utf8(s, i-1);
9248 				for (p = s; p < e; p++)
9249 				    put_byte(sv, *p);
9250 				}
9251 				rangestart = -1;
9252 			    }
9253 			}
9254 
9255 		    sv_catpvs(sv, "..."); /* et cetera */
9256 		}
9257 
9258 		{
9259 		    char *s = savesvpv(lv);
9260 		    char * const origs = s;
9261 
9262 		    while (*s && *s != '\n')
9263 			s++;
9264 
9265 		    if (*s == '\n') {
9266 			const char * const t = ++s;
9267 
9268 			while (*s) {
9269 			    if (*s == '\n')
9270 				*s = ' ';
9271 			    s++;
9272 			}
9273 			if (s[-1] == ' ')
9274 			    s[-1] = 0;
9275 
9276 			sv_catpv(sv, t);
9277 		    }
9278 
9279 		    Safefree(origs);
9280 		}
9281 	    }
9282 	}
9283 
9284 	Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
9285     }
9286     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
9287 	Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
9288 #else
9289     PERL_UNUSED_CONTEXT;
9290     PERL_UNUSED_ARG(sv);
9291     PERL_UNUSED_ARG(o);
9292     PERL_UNUSED_ARG(prog);
9293 #endif	/* DEBUGGING */
9294 }
9295 
9296 SV *
9297 Perl_re_intuit_string(pTHX_ REGEXP * const prog)
9298 {				/* Assume that RE_INTUIT is set */
9299     dVAR;
9300     GET_RE_DEBUG_FLAGS_DECL;
9301 
9302     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
9303     PERL_UNUSED_CONTEXT;
9304 
9305     DEBUG_COMPILE_r(
9306 	{
9307 	    const char * const s = SvPV_nolen_const(prog->check_substr
9308 		      ? prog->check_substr : prog->check_utf8);
9309 
9310 	    if (!PL_colorset) reginitcolors();
9311 	    PerlIO_printf(Perl_debug_log,
9312 		      "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
9313 		      PL_colors[4],
9314 		      prog->check_substr ? "" : "utf8 ",
9315 		      PL_colors[5],PL_colors[0],
9316 		      s,
9317 		      PL_colors[1],
9318 		      (strlen(s) > 60 ? "..." : ""));
9319 	} );
9320 
9321     return prog->check_substr ? prog->check_substr : prog->check_utf8;
9322 }
9323 
9324 /*
9325    pregfree()
9326 
9327    handles refcounting and freeing the perl core regexp structure. When
9328    it is necessary to actually free the structure the first thing it
9329    does is call the 'free' method of the regexp_engine associated to to
9330    the regexp, allowing the handling of the void *pprivate; member
9331    first. (This routine is not overridable by extensions, which is why
9332    the extensions free is called first.)
9333 
9334    See regdupe and regdupe_internal if you change anything here.
9335 */
9336 #ifndef PERL_IN_XSUB_RE
9337 void
9338 Perl_pregfree(pTHX_ REGEXP *r)
9339 {
9340     dVAR;
9341     GET_RE_DEBUG_FLAGS_DECL;
9342 
9343     if (!r || (--r->refcnt > 0))
9344 	return;
9345     if (r->mother_re) {
9346         ReREFCNT_dec(r->mother_re);
9347     } else {
9348         CALLREGFREE_PVT(r); /* free the private data */
9349         if (RXp_PAREN_NAMES(r))
9350             SvREFCNT_dec(RXp_PAREN_NAMES(r));
9351         Safefree(RX_WRAPPED(r));
9352     }
9353     if (r->substrs) {
9354         if (r->anchored_substr)
9355             SvREFCNT_dec(r->anchored_substr);
9356         if (r->anchored_utf8)
9357             SvREFCNT_dec(r->anchored_utf8);
9358         if (r->float_substr)
9359             SvREFCNT_dec(r->float_substr);
9360         if (r->float_utf8)
9361             SvREFCNT_dec(r->float_utf8);
9362 	Safefree(r->substrs);
9363     }
9364     RX_MATCH_COPY_FREE(r);
9365 #ifdef PERL_OLD_COPY_ON_WRITE
9366     if (r->saved_copy)
9367         SvREFCNT_dec(r->saved_copy);
9368 #endif
9369     Safefree(r->swap);
9370     Safefree(r->offs);
9371     Safefree(r);
9372 }
9373 
9374 /*  reg_temp_copy()
9375 
9376     This is a hacky workaround to the structural issue of match results
9377     being stored in the regexp structure which is in turn stored in
9378     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
9379     could be PL_curpm in multiple contexts, and could require multiple
9380     result sets being associated with the pattern simultaneously, such
9381     as when doing a recursive match with (??{$qr})
9382 
9383     The solution is to make a lightweight copy of the regexp structure
9384     when a qr// is returned from the code executed by (??{$qr}) this
9385     lightweight copy doesnt actually own any of its data except for
9386     the starp/end and the actual regexp structure itself.
9387 
9388 */
9389 
9390 
9391 REGEXP *
9392 Perl_reg_temp_copy (pTHX_ REGEXP *r) {
9393     regexp *ret;
9394     register const I32 npar = r->nparens+1;
9395 
9396     PERL_ARGS_ASSERT_REG_TEMP_COPY;
9397 
9398     (void)ReREFCNT_inc(r);
9399     Newx(ret, 1, regexp);
9400     StructCopy(r, ret, regexp);
9401     Newx(ret->offs, npar, regexp_paren_pair);
9402     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9403     ret->refcnt = 1;
9404     if (r->substrs) {
9405         Newx(ret->substrs, 1, struct reg_substr_data);
9406 	StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9407 
9408 	SvREFCNT_inc_void(ret->anchored_substr);
9409 	SvREFCNT_inc_void(ret->anchored_utf8);
9410 	SvREFCNT_inc_void(ret->float_substr);
9411 	SvREFCNT_inc_void(ret->float_utf8);
9412 
9413 	/* check_substr and check_utf8, if non-NULL, point to either their
9414 	   anchored or float namesakes, and don't hold a second reference.  */
9415     }
9416     RX_MATCH_COPIED_off(ret);
9417 #ifdef PERL_OLD_COPY_ON_WRITE
9418     ret->saved_copy = NULL;
9419 #endif
9420     ret->mother_re = r;
9421     ret->swap = NULL;
9422 
9423     return ret;
9424 }
9425 #endif
9426 
9427 /* regfree_internal()
9428 
9429    Free the private data in a regexp. This is overloadable by
9430    extensions. Perl takes care of the regexp structure in pregfree(),
9431    this covers the *pprivate pointer which technically perldoesnt
9432    know about, however of course we have to handle the
9433    regexp_internal structure when no extension is in use.
9434 
9435    Note this is called before freeing anything in the regexp
9436    structure.
9437  */
9438 
9439 void
9440 Perl_regfree_internal(pTHX_ REGEXP * const r)
9441 {
9442     dVAR;
9443     RXi_GET_DECL(r,ri);
9444     GET_RE_DEBUG_FLAGS_DECL;
9445 
9446     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
9447 
9448     DEBUG_COMPILE_r({
9449 	if (!PL_colorset)
9450 	    reginitcolors();
9451 	{
9452 	    SV *dsv= sv_newmortal();
9453             RE_PV_QUOTED_DECL(s, RX_UTF8(r),
9454                 dsv, RX_PRECOMP(r), RX_PRELEN(r), 60);
9455             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
9456                 PL_colors[4],PL_colors[5],s);
9457         }
9458     });
9459 #ifdef RE_TRACK_PATTERN_OFFSETS
9460     if (ri->u.offsets)
9461         Safefree(ri->u.offsets);             /* 20010421 MJD */
9462 #endif
9463     if (ri->data) {
9464 	int n = ri->data->count;
9465 	PAD* new_comppad = NULL;
9466 	PAD* old_comppad;
9467 	PADOFFSET refcnt;
9468 
9469 	while (--n >= 0) {
9470           /* If you add a ->what type here, update the comment in regcomp.h */
9471 	    switch (ri->data->what[n]) {
9472 	    case 's':
9473 	    case 'S':
9474 	    case 'u':
9475 		SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
9476 		break;
9477 	    case 'f':
9478 		Safefree(ri->data->data[n]);
9479 		break;
9480 	    case 'p':
9481 		new_comppad = MUTABLE_AV(ri->data->data[n]);
9482 		break;
9483 	    case 'o':
9484 		if (new_comppad == NULL)
9485 		    Perl_croak(aTHX_ "panic: pregfree comppad");
9486 		PAD_SAVE_LOCAL(old_comppad,
9487 		    /* Watch out for global destruction's random ordering. */
9488 		    (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
9489 		);
9490 		OP_REFCNT_LOCK;
9491 		refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
9492 		OP_REFCNT_UNLOCK;
9493 		if (!refcnt)
9494                     op_free((OP_4tree*)ri->data->data[n]);
9495 
9496 		PAD_RESTORE_LOCAL(old_comppad);
9497 		SvREFCNT_dec(MUTABLE_SV(new_comppad));
9498 		new_comppad = NULL;
9499 		break;
9500 	    case 'n':
9501 	        break;
9502             case 'T':
9503                 { /* Aho Corasick add-on structure for a trie node.
9504                      Used in stclass optimization only */
9505                     U32 refcount;
9506                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
9507                     OP_REFCNT_LOCK;
9508                     refcount = --aho->refcount;
9509                     OP_REFCNT_UNLOCK;
9510                     if ( !refcount ) {
9511                         PerlMemShared_free(aho->states);
9512                         PerlMemShared_free(aho->fail);
9513 			 /* do this last!!!! */
9514                         PerlMemShared_free(ri->data->data[n]);
9515                         PerlMemShared_free(ri->regstclass);
9516                     }
9517                 }
9518                 break;
9519 	    case 't':
9520 	        {
9521 	            /* trie structure. */
9522 	            U32 refcount;
9523 	            reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
9524                     OP_REFCNT_LOCK;
9525                     refcount = --trie->refcount;
9526                     OP_REFCNT_UNLOCK;
9527                     if ( !refcount ) {
9528                         PerlMemShared_free(trie->charmap);
9529                         PerlMemShared_free(trie->states);
9530                         PerlMemShared_free(trie->trans);
9531                         if (trie->bitmap)
9532                             PerlMemShared_free(trie->bitmap);
9533                         if (trie->wordlen)
9534                             PerlMemShared_free(trie->wordlen);
9535                         if (trie->jump)
9536                             PerlMemShared_free(trie->jump);
9537                         if (trie->nextword)
9538                             PerlMemShared_free(trie->nextword);
9539                         /* do this last!!!! */
9540                         PerlMemShared_free(ri->data->data[n]);
9541 		    }
9542 		}
9543 		break;
9544 	    default:
9545 		Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
9546 	    }
9547 	}
9548 	Safefree(ri->data->what);
9549 	Safefree(ri->data);
9550     }
9551 
9552     Safefree(ri);
9553 }
9554 
9555 #define sv_dup_inc(s,t)	SvREFCNT_inc(sv_dup(s,t))
9556 #define av_dup_inc(s,t)	MUTABLE_AV(SvREFCNT_inc(sv_dup((const SV *)s,t)))
9557 #define hv_dup_inc(s,t)	MUTABLE_HV(SvREFCNT_inc(sv_dup((const SV *)s,t)))
9558 #define SAVEPVN(p,n)	((p) ? savepvn(p,n) : NULL)
9559 
9560 /*
9561    re_dup - duplicate a regexp.
9562 
9563    This routine is expected to clone a given regexp structure. It is only
9564    compiled under USE_ITHREADS.
9565 
9566    After all of the core data stored in struct regexp is duplicated
9567    the regexp_engine.dupe method is used to copy any private data
9568    stored in the *pprivate pointer. This allows extensions to handle
9569    any duplication it needs to do.
9570 
9571    See pregfree() and regfree_internal() if you change anything here.
9572 */
9573 #if defined(USE_ITHREADS)
9574 #ifndef PERL_IN_XSUB_RE
9575 regexp *
9576 Perl_re_dup(pTHX_ const regexp *r, CLONE_PARAMS *param)
9577 {
9578     dVAR;
9579     regexp *ret;
9580     I32 npar;
9581     U32 precomp_offset;
9582 
9583     PERL_ARGS_ASSERT_RE_DUP;
9584 
9585     if (!r)
9586 	return (REGEXP *)NULL;
9587 
9588     if ((ret = (REGEXP *)ptr_table_fetch(PL_ptr_table, r)))
9589 	return ret;
9590 
9591     npar = r->nparens+1;
9592     Newx(ret, 1, regexp);
9593     StructCopy(r, ret, regexp);
9594     Newx(ret->offs, npar, regexp_paren_pair);
9595     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9596     if(ret->swap) {
9597         /* no need to copy these */
9598         Newx(ret->swap, npar, regexp_paren_pair);
9599     }
9600 
9601     if (ret->substrs) {
9602 	/* Do it this way to avoid reading from *r after the StructCopy().
9603 	   That way, if any of the sv_dup_inc()s dislodge *r from the L1
9604 	   cache, it doesn't matter.  */
9605 	const bool anchored = r->check_substr
9606 	    ? r->check_substr == r->anchored_substr
9607 	    : r->check_utf8 == r->anchored_utf8;
9608         Newx(ret->substrs, 1, struct reg_substr_data);
9609 	StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9610 
9611 	ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
9612 	ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
9613 	ret->float_substr = sv_dup_inc(ret->float_substr, param);
9614 	ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
9615 
9616 	/* check_substr and check_utf8, if non-NULL, point to either their
9617 	   anchored or float namesakes, and don't hold a second reference.  */
9618 
9619 	if (ret->check_substr) {
9620 	    if (anchored) {
9621 		assert(r->check_utf8 == r->anchored_utf8);
9622 		ret->check_substr = ret->anchored_substr;
9623 		ret->check_utf8 = ret->anchored_utf8;
9624 	    } else {
9625 		assert(r->check_substr == r->float_substr);
9626 		assert(r->check_utf8 == r->float_utf8);
9627 		ret->check_substr = ret->float_substr;
9628 		ret->check_utf8 = ret->float_utf8;
9629 	    }
9630 	} else if (ret->check_utf8) {
9631 	    if (anchored) {
9632 		ret->check_utf8 = ret->anchored_utf8;
9633 	    } else {
9634 		ret->check_utf8 = ret->float_utf8;
9635 	    }
9636 	}
9637     }
9638 
9639     precomp_offset = RX_PRECOMP(ret) - ret->wrapped;
9640 
9641     RX_WRAPPED(ret)     = SAVEPVN(RX_WRAPPED(ret), RX_WRAPLEN(ret)+1);
9642     RX_PRECOMP(ret)     = ret->wrapped + precomp_offset;
9643     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
9644 
9645     if (ret->pprivate)
9646 	RXi_SET(ret,CALLREGDUPE_PVT(ret,param));
9647 
9648     if (RX_MATCH_COPIED(ret))
9649 	ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
9650     else
9651 	ret->subbeg = NULL;
9652 #ifdef PERL_OLD_COPY_ON_WRITE
9653     ret->saved_copy = NULL;
9654 #endif
9655 
9656     ret->mother_re      = NULL;
9657     ret->gofs = 0;
9658 
9659     ptr_table_store(PL_ptr_table, r, ret);
9660     return ret;
9661 }
9662 #endif /* PERL_IN_XSUB_RE */
9663 
9664 /*
9665    regdupe_internal()
9666 
9667    This is the internal complement to regdupe() which is used to copy
9668    the structure pointed to by the *pprivate pointer in the regexp.
9669    This is the core version of the extension overridable cloning hook.
9670    The regexp structure being duplicated will be copied by perl prior
9671    to this and will be provided as the regexp *r argument, however
9672    with the /old/ structures pprivate pointer value. Thus this routine
9673    may override any copying normally done by perl.
9674 
9675    It returns a pointer to the new regexp_internal structure.
9676 */
9677 
9678 void *
9679 Perl_regdupe_internal(pTHX_ REGEXP * const r, CLONE_PARAMS *param)
9680 {
9681     dVAR;
9682     regexp_internal *reti;
9683     int len, npar;
9684     RXi_GET_DECL(r,ri);
9685 
9686     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
9687 
9688     npar = r->nparens+1;
9689     len = ProgLen(ri);
9690 
9691     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
9692     Copy(ri->program, reti->program, len+1, regnode);
9693 
9694 
9695     reti->regstclass = NULL;
9696 
9697     if (ri->data) {
9698 	struct reg_data *d;
9699         const int count = ri->data->count;
9700 	int i;
9701 
9702 	Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
9703 		char, struct reg_data);
9704 	Newx(d->what, count, U8);
9705 
9706 	d->count = count;
9707 	for (i = 0; i < count; i++) {
9708 	    d->what[i] = ri->data->what[i];
9709 	    switch (d->what[i]) {
9710 	        /* legal options are one of: sSfpontTu
9711 	           see also regcomp.h and pregfree() */
9712 	    case 's':
9713 	    case 'S':
9714 	    case 'p': /* actually an AV, but the dup function is identical.  */
9715 	    case 'u': /* actually an HV, but the dup function is identical.  */
9716 		d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
9717 		break;
9718 	    case 'f':
9719 		/* This is cheating. */
9720 		Newx(d->data[i], 1, struct regnode_charclass_class);
9721 		StructCopy(ri->data->data[i], d->data[i],
9722 			    struct regnode_charclass_class);
9723 		reti->regstclass = (regnode*)d->data[i];
9724 		break;
9725 	    case 'o':
9726 		/* Compiled op trees are readonly and in shared memory,
9727 		   and can thus be shared without duplication. */
9728 		OP_REFCNT_LOCK;
9729 		d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
9730 		OP_REFCNT_UNLOCK;
9731 		break;
9732 	    case 'T':
9733 		/* Trie stclasses are readonly and can thus be shared
9734 		 * without duplication. We free the stclass in pregfree
9735 		 * when the corresponding reg_ac_data struct is freed.
9736 		 */
9737 		reti->regstclass= ri->regstclass;
9738 		/* Fall through */
9739 	    case 't':
9740 		OP_REFCNT_LOCK;
9741 		((reg_trie_data*)ri->data->data[i])->refcount++;
9742 		OP_REFCNT_UNLOCK;
9743 		/* Fall through */
9744 	    case 'n':
9745 		d->data[i] = ri->data->data[i];
9746 		break;
9747             default:
9748 		Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
9749 	    }
9750 	}
9751 
9752 	reti->data = d;
9753     }
9754     else
9755 	reti->data = NULL;
9756 
9757     reti->name_list_idx = ri->name_list_idx;
9758 
9759 #ifdef RE_TRACK_PATTERN_OFFSETS
9760     if (ri->u.offsets) {
9761         Newx(reti->u.offsets, 2*len+1, U32);
9762         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
9763     }
9764 #else
9765     SetProgLen(reti,len);
9766 #endif
9767 
9768     return (void*)reti;
9769 }
9770 
9771 #endif    /* USE_ITHREADS */
9772 
9773 /*
9774    reg_stringify()
9775 
9776    converts a regexp embedded in a MAGIC struct to its stringified form,
9777    caching the converted form in the struct and returns the cached
9778    string.
9779 
9780    If lp is nonnull then it is used to return the length of the
9781    resulting string
9782 
9783    If flags is nonnull and the returned string contains UTF8 then
9784    (*flags & 1) will be true.
9785 
9786    If haseval is nonnull then it is used to return whether the pattern
9787    contains evals.
9788 
9789    Normally called via macro:
9790 
9791         CALLREG_STRINGIFY(mg,&len,&utf8);
9792 
9793    And internally with
9794 
9795         CALLREG_AS_STR(mg,&lp,&flags,&haseval)
9796 
9797    See sv_2pv_flags() in sv.c for an example of internal usage.
9798 
9799  */
9800 #ifndef PERL_IN_XSUB_RE
9801 
9802 char *
9803 Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) {
9804     dVAR;
9805     const REGEXP * const re = (REGEXP *)mg->mg_obj;
9806     if (haseval)
9807         *haseval = RX_SEEN_EVALS(re);
9808     if (flags)
9809 	*flags = RX_UTF8(re) ? 1 : 0;
9810     if (lp)
9811 	*lp = RX_WRAPLEN(re);
9812     return RX_WRAPPED(re);
9813 }
9814 
9815 /*
9816  - regnext - dig the "next" pointer out of a node
9817  */
9818 regnode *
9819 Perl_regnext(pTHX_ register regnode *p)
9820 {
9821     dVAR;
9822     register I32 offset;
9823 
9824     if (!p)
9825 	return(NULL);
9826 
9827     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
9828     if (offset == 0)
9829 	return(NULL);
9830 
9831     return(p+offset);
9832 }
9833 #endif
9834 
9835 STATIC void
9836 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
9837 {
9838     va_list args;
9839     STRLEN l1 = strlen(pat1);
9840     STRLEN l2 = strlen(pat2);
9841     char buf[512];
9842     SV *msv;
9843     const char *message;
9844 
9845     PERL_ARGS_ASSERT_RE_CROAK2;
9846 
9847     if (l1 > 510)
9848 	l1 = 510;
9849     if (l1 + l2 > 510)
9850 	l2 = 510 - l1;
9851     Copy(pat1, buf, l1 , char);
9852     Copy(pat2, buf + l1, l2 , char);
9853     buf[l1 + l2] = '\n';
9854     buf[l1 + l2 + 1] = '\0';
9855 #ifdef I_STDARG
9856     /* ANSI variant takes additional second argument */
9857     va_start(args, pat2);
9858 #else
9859     va_start(args);
9860 #endif
9861     msv = vmess(buf, &args);
9862     va_end(args);
9863     message = SvPV_const(msv,l1);
9864     if (l1 > 512)
9865 	l1 = 512;
9866     Copy(message, buf, l1 , char);
9867     buf[l1-1] = '\0';			/* Overwrite \n */
9868     Perl_croak(aTHX_ "%s", buf);
9869 }
9870 
9871 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
9872 
9873 #ifndef PERL_IN_XSUB_RE
9874 void
9875 Perl_save_re_context(pTHX)
9876 {
9877     dVAR;
9878 
9879     struct re_save_state *state;
9880 
9881     SAVEVPTR(PL_curcop);
9882     SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
9883 
9884     state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
9885     PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
9886     SSPUSHINT(SAVEt_RE_STATE);
9887 
9888     Copy(&PL_reg_state, state, 1, struct re_save_state);
9889 
9890     PL_reg_start_tmp = 0;
9891     PL_reg_start_tmpl = 0;
9892     PL_reg_oldsaved = NULL;
9893     PL_reg_oldsavedlen = 0;
9894     PL_reg_maxiter = 0;
9895     PL_reg_leftiter = 0;
9896     PL_reg_poscache = NULL;
9897     PL_reg_poscache_size = 0;
9898 #ifdef PERL_OLD_COPY_ON_WRITE
9899     PL_nrs = NULL;
9900 #endif
9901 
9902     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
9903     if (PL_curpm) {
9904 	const REGEXP * const rx = PM_GETRE(PL_curpm);
9905 	if (rx) {
9906 	    U32 i;
9907 	    for (i = 1; i <= RX_NPARENS(rx); i++) {
9908 		char digits[TYPE_CHARS(long)];
9909 		const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
9910 		GV *const *const gvp
9911 		    = (GV**)hv_fetch(PL_defstash, digits, len, 0);
9912 
9913 		if (gvp) {
9914 		    GV * const gv = *gvp;
9915 		    if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
9916 			save_scalar(gv);
9917 		}
9918 	    }
9919 	}
9920     }
9921 }
9922 #endif
9923 
9924 static void
9925 clear_re(pTHX_ void *r)
9926 {
9927     dVAR;
9928     ReREFCNT_dec((REGEXP *)r);
9929 }
9930 
9931 #ifdef DEBUGGING
9932 
9933 STATIC void
9934 S_put_byte(pTHX_ SV *sv, int c)
9935 {
9936     PERL_ARGS_ASSERT_PUT_BYTE;
9937 
9938     /* Our definition of isPRINT() ignores locales, so only bytes that are
9939        not part of UTF-8 are considered printable. I assume that the same
9940        holds for UTF-EBCDIC.
9941        Also, code point 255 is not printable in either (it's E0 in EBCDIC,
9942        which Wikipedia says:
9943 
9944        EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
9945        ones (binary 1111 1111, hexadecimal FF). It is similar, but not
9946        identical, to the ASCII delete (DEL) or rubout control character.
9947        ) So the old condition can be simplified to !isPRINT(c)  */
9948     if (!isPRINT(c))
9949 	Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
9950     else {
9951 	const char string = c;
9952 	if (c == '-' || c == ']' || c == '\\' || c == '^')
9953 	    sv_catpvs(sv, "\\");
9954 	sv_catpvn(sv, &string, 1);
9955     }
9956 }
9957 
9958 
9959 #define CLEAR_OPTSTART \
9960     if (optstart) STMT_START { \
9961 	    DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
9962 	    optstart=NULL; \
9963     } STMT_END
9964 
9965 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
9966 
9967 STATIC const regnode *
9968 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
9969 	    const regnode *last, const regnode *plast,
9970 	    SV* sv, I32 indent, U32 depth)
9971 {
9972     dVAR;
9973     register U8 op = PSEUDO;	/* Arbitrary non-END op. */
9974     register const regnode *next;
9975     const regnode *optstart= NULL;
9976 
9977     RXi_GET_DECL(r,ri);
9978     GET_RE_DEBUG_FLAGS_DECL;
9979 
9980     PERL_ARGS_ASSERT_DUMPUNTIL;
9981 
9982 #ifdef DEBUG_DUMPUNTIL
9983     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
9984         last ? last-start : 0,plast ? plast-start : 0);
9985 #endif
9986 
9987     if (plast && plast < last)
9988         last= plast;
9989 
9990     while (PL_regkind[op] != END && (!last || node < last)) {
9991 	/* While that wasn't END last time... */
9992 	NODE_ALIGN(node);
9993 	op = OP(node);
9994 	if (op == CLOSE || op == WHILEM)
9995 	    indent--;
9996 	next = regnext((regnode *)node);
9997 
9998 	/* Where, what. */
9999 	if (OP(node) == OPTIMIZED) {
10000 	    if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
10001 	        optstart = node;
10002 	    else
10003 		goto after_print;
10004 	} else
10005 	    CLEAR_OPTSTART;
10006 
10007 	regprop(r, sv, node);
10008 	PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
10009 		      (int)(2*indent + 1), "", SvPVX_const(sv));
10010 
10011         if (OP(node) != OPTIMIZED) {
10012             if (next == NULL)		/* Next ptr. */
10013                 PerlIO_printf(Perl_debug_log, " (0)");
10014             else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
10015                 PerlIO_printf(Perl_debug_log, " (FAIL)");
10016             else
10017                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
10018             (void)PerlIO_putc(Perl_debug_log, '\n');
10019         }
10020 
10021       after_print:
10022 	if (PL_regkind[(U8)op] == BRANCHJ) {
10023 	    assert(next);
10024 	    {
10025                 register const regnode *nnode = (OP(next) == LONGJMP
10026 					     ? regnext((regnode *)next)
10027 					     : next);
10028                 if (last && nnode > last)
10029                     nnode = last;
10030                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
10031 	    }
10032 	}
10033 	else if (PL_regkind[(U8)op] == BRANCH) {
10034 	    assert(next);
10035 	    DUMPUNTIL(NEXTOPER(node), next);
10036 	}
10037 	else if ( PL_regkind[(U8)op]  == TRIE ) {
10038 	    const regnode *this_trie = node;
10039 	    const char op = OP(node);
10040             const U32 n = ARG(node);
10041 	    const reg_ac_data * const ac = op>=AHOCORASICK ?
10042                (reg_ac_data *)ri->data->data[n] :
10043                NULL;
10044 	    const reg_trie_data * const trie =
10045 	        (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
10046 #ifdef DEBUGGING
10047 	    AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
10048 #endif
10049 	    const regnode *nextbranch= NULL;
10050 	    I32 word_idx;
10051             sv_setpvs(sv, "");
10052 	    for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
10053 		SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
10054 
10055                 PerlIO_printf(Perl_debug_log, "%*s%s ",
10056                    (int)(2*(indent+3)), "",
10057                     elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
10058 	                    PL_colors[0], PL_colors[1],
10059 	                    (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
10060 	                    PERL_PV_PRETTY_ELLIPSES    |
10061 	                    PERL_PV_PRETTY_LTGT
10062                             )
10063                             : "???"
10064                 );
10065                 if (trie->jump) {
10066                     U16 dist= trie->jump[word_idx+1];
10067 		    PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
10068 				  (UV)((dist ? this_trie + dist : next) - start));
10069                     if (dist) {
10070                         if (!nextbranch)
10071                             nextbranch= this_trie + trie->jump[0];
10072 			DUMPUNTIL(this_trie + dist, nextbranch);
10073                     }
10074                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
10075                         nextbranch= regnext((regnode *)nextbranch);
10076                 } else {
10077                     PerlIO_printf(Perl_debug_log, "\n");
10078 		}
10079 	    }
10080 	    if (last && next > last)
10081 	        node= last;
10082 	    else
10083 	        node= next;
10084 	}
10085 	else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
10086 	    DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
10087                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
10088 	}
10089 	else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
10090 	    assert(next);
10091 	    DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
10092 	}
10093 	else if ( op == PLUS || op == STAR) {
10094 	    DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
10095 	}
10096 	else if (op == ANYOF) {
10097 	    /* arglen 1 + class block */
10098 	    node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
10099 		    ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
10100 	    node = NEXTOPER(node);
10101 	}
10102 	else if (PL_regkind[(U8)op] == EXACT) {
10103             /* Literal string, where present. */
10104 	    node += NODE_SZ_STR(node) - 1;
10105 	    node = NEXTOPER(node);
10106 	}
10107 	else {
10108 	    node = NEXTOPER(node);
10109 	    node += regarglen[(U8)op];
10110 	}
10111 	if (op == CURLYX || op == OPEN)
10112 	    indent++;
10113     }
10114     CLEAR_OPTSTART;
10115 #ifdef DEBUG_DUMPUNTIL
10116     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
10117 #endif
10118     return node;
10119 }
10120 
10121 #endif	/* DEBUGGING */
10122 
10123 /*
10124  * Local variables:
10125  * c-indentation-style: bsd
10126  * c-basic-offset: 4
10127  * indent-tabs-mode: t
10128  * End:
10129  *
10130  * ex: set ts=8 sts=4 sw=4 noet:
10131  */
10132