xref: /onnv-gate/usr/src/cmd/perl/5.8.4/distrib/regcomp.c (revision 6696:a476264b067f)
1 /*    regcomp.c
2  */
3 
4 /*
5  * "A fair jaw-cracker dwarf-language must be."  --Samwise Gamgee
6  */
7 
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9  * confused with the original package (see point 3 below).  Thanks, Henry!
10  */
11 
12 /* Additional note: this code is very heavily munged from Henry's version
13  * in places.  In some spots I've traded clarity for efficiency, so don't
14  * blame Henry for some of the lack of readability.
15  */
16 
17 /* The names of the functions have been changed from regcomp and
18  * regexec to  pregcomp and pregexec in order to avoid conflicts
19  * with the POSIX routines of the same names.
20 */
21 
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 #  ifndef PERL_IN_XSUB_RE
25 #    define PERL_IN_XSUB_RE
26 #  endif
27 /* need access to debugger hooks */
28 #  if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
29 #    define DEBUGGING
30 #  endif
31 #endif
32 
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 #  define Perl_pregcomp my_regcomp
36 #  define Perl_regdump my_regdump
37 #  define Perl_regprop my_regprop
38 #  define Perl_pregfree my_regfree
39 #  define Perl_re_intuit_string my_re_intuit_string
40 /* *These* symbols are masked to allow static link. */
41 #  define Perl_regnext my_regnext
42 #  define Perl_save_re_context my_save_re_context
43 #  define Perl_reginitcolors my_reginitcolors
44 
45 #  define PERL_NO_GET_CONTEXT
46 #endif
47 
48 /*SUPPRESS 112*/
49 /*
50  * pregcomp and pregexec -- regsub and regerror are not used in perl
51  *
52  *	Copyright (c) 1986 by University of Toronto.
53  *	Written by Henry Spencer.  Not derived from licensed software.
54  *
55  *	Permission is granted to anyone to use this software for any
56  *	purpose on any computer system, and to redistribute it freely,
57  *	subject to the following restrictions:
58  *
59  *	1. The author is not responsible for the consequences of use of
60  *		this software, no matter how awful, even if they arise
61  *		from defects in it.
62  *
63  *	2. The origin of this software must not be misrepresented, either
64  *		by explicit claim or by omission.
65  *
66  *	3. Altered versions must be plainly marked as such, and must not
67  *		be misrepresented as being the original software.
68  *
69  *
70  ****    Alterations to Henry's code are...
71  ****
72  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
73  ****    2000, 2001, 2002, 2003, by Larry Wall and others
74  ****
75  ****    You may distribute under the terms of either the GNU General Public
76  ****    License or the Artistic License, as specified in the README file.
77 
78  *
79  * Beware that some of this code is subtly aware of the way operator
80  * precedence is structured in regular expressions.  Serious changes in
81  * regular-expression syntax might require a total rethink.
82  */
83 #include "EXTERN.h"
84 #define PERL_IN_REGCOMP_C
85 #include "perl.h"
86 
87 #ifndef PERL_IN_XSUB_RE
88 #  include "INTERN.h"
89 #endif
90 
91 #define REG_COMP_C
92 #include "regcomp.h"
93 
94 #ifdef op
95 #undef op
96 #endif /* op */
97 
98 #ifdef MSDOS
99 #  if defined(BUGGY_MSC6)
100  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
101 #    pragma optimize("a",off)
102  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
103 #    pragma optimize("w",on )
104 #  endif /* BUGGY_MSC6 */
105 #endif /* MSDOS */
106 
107 #ifndef STATIC
108 #define	STATIC	static
109 #endif
110 
111 typedef struct RExC_state_t {
112     U32		flags;			/* are we folding, multilining? */
113     char	*precomp;		/* uncompiled string. */
114     regexp	*rx;
115     char	*start;			/* Start of input for compile */
116     char	*end;			/* End of input for compile */
117     char	*parse;			/* Input-scan pointer. */
118     I32		whilem_seen;		/* number of WHILEM in this expr */
119     regnode	*emit_start;		/* Start of emitted-code area */
120     regnode	*emit;			/* Code-emit pointer; &regdummy = don't = compiling */
121     I32		naughty;		/* How bad is this pattern? */
122     I32		sawback;		/* Did we see \1, ...? */
123     U32		seen;
124     I32		size;			/* Code size. */
125     I32		npar;			/* () count. */
126     I32		extralen;
127     I32		seen_zerolen;
128     I32		seen_evals;
129     I32		utf8;		/* whether the pattern is utf8 or not */
130     I32		orig_utf8;	/* whether the pattern was originally in utf8 */
131 				/* XXX use this for future optimisation of case
132 				 * where pattern must be upgraded to utf8. */
133 #if ADD_TO_REGEXEC
134     char 	*starttry;		/* -Dr: where regtry was called. */
135 #define RExC_starttry	(pRExC_state->starttry)
136 #endif
137 } RExC_state_t;
138 
139 #define RExC_flags	(pRExC_state->flags)
140 #define RExC_precomp	(pRExC_state->precomp)
141 #define RExC_rx		(pRExC_state->rx)
142 #define RExC_start	(pRExC_state->start)
143 #define RExC_end	(pRExC_state->end)
144 #define RExC_parse	(pRExC_state->parse)
145 #define RExC_whilem_seen	(pRExC_state->whilem_seen)
146 #define RExC_offsets	(pRExC_state->rx->offsets) /* I am not like the others */
147 #define RExC_emit	(pRExC_state->emit)
148 #define RExC_emit_start	(pRExC_state->emit_start)
149 #define RExC_naughty	(pRExC_state->naughty)
150 #define RExC_sawback	(pRExC_state->sawback)
151 #define RExC_seen	(pRExC_state->seen)
152 #define RExC_size	(pRExC_state->size)
153 #define RExC_npar	(pRExC_state->npar)
154 #define RExC_extralen	(pRExC_state->extralen)
155 #define RExC_seen_zerolen	(pRExC_state->seen_zerolen)
156 #define RExC_seen_evals	(pRExC_state->seen_evals)
157 #define RExC_utf8	(pRExC_state->utf8)
158 #define RExC_orig_utf8	(pRExC_state->orig_utf8)
159 
160 #define	ISMULT1(c)	((c) == '*' || (c) == '+' || (c) == '?')
161 #define	ISMULT2(s)	((*s) == '*' || (*s) == '+' || (*s) == '?' || \
162 	((*s) == '{' && regcurly(s)))
163 
164 #ifdef SPSTART
165 #undef SPSTART		/* dratted cpp namespace... */
166 #endif
167 /*
168  * Flags to be passed up and down.
169  */
170 #define	WORST		0	/* Worst case. */
171 #define	HASWIDTH	0x1	/* Known to match non-null strings. */
172 #define	SIMPLE		0x2	/* Simple enough to be STAR/PLUS operand. */
173 #define	SPSTART		0x4	/* Starts with * or +. */
174 #define TRYAGAIN	0x8	/* Weeded out a declaration. */
175 
176 /* Length of a variant. */
177 
178 typedef struct scan_data_t {
179     I32 len_min;
180     I32 len_delta;
181     I32 pos_min;
182     I32 pos_delta;
183     SV *last_found;
184     I32 last_end;			/* min value, <0 unless valid. */
185     I32 last_start_min;
186     I32 last_start_max;
187     SV **longest;			/* Either &l_fixed, or &l_float. */
188     SV *longest_fixed;
189     I32 offset_fixed;
190     SV *longest_float;
191     I32 offset_float_min;
192     I32 offset_float_max;
193     I32 flags;
194     I32 whilem_c;
195     I32 *last_closep;
196     struct regnode_charclass_class *start_class;
197 } scan_data_t;
198 
199 /*
200  * Forward declarations for pregcomp()'s friends.
201  */
202 
203 static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
204 				      0, 0, 0, 0, 0, 0};
205 
206 #define SF_BEFORE_EOL		(SF_BEFORE_SEOL|SF_BEFORE_MEOL)
207 #define SF_BEFORE_SEOL		0x1
208 #define SF_BEFORE_MEOL		0x2
209 #define SF_FIX_BEFORE_EOL	(SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
210 #define SF_FL_BEFORE_EOL	(SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
211 
212 #ifdef NO_UNARY_PLUS
213 #  define SF_FIX_SHIFT_EOL	(0+2)
214 #  define SF_FL_SHIFT_EOL		(0+4)
215 #else
216 #  define SF_FIX_SHIFT_EOL	(+2)
217 #  define SF_FL_SHIFT_EOL		(+4)
218 #endif
219 
220 #define SF_FIX_BEFORE_SEOL	(SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
221 #define SF_FIX_BEFORE_MEOL	(SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
222 
223 #define SF_FL_BEFORE_SEOL	(SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
224 #define SF_FL_BEFORE_MEOL	(SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
225 #define SF_IS_INF		0x40
226 #define SF_HAS_PAR		0x80
227 #define SF_IN_PAR		0x100
228 #define SF_HAS_EVAL		0x200
229 #define SCF_DO_SUBSTR		0x400
230 #define SCF_DO_STCLASS_AND	0x0800
231 #define SCF_DO_STCLASS_OR	0x1000
232 #define SCF_DO_STCLASS		(SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
233 #define SCF_WHILEM_VISITED_POS	0x2000
234 
235 #define UTF (RExC_utf8 != 0)
236 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
237 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
238 
239 #define OOB_UNICODE		12345678
240 #define OOB_NAMEDCLASS		-1
241 
242 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
243 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
244 
245 
246 /* length of regex to show in messages that don't mark a position within */
247 #define RegexLengthToShowInErrorMessages 127
248 
249 /*
250  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
251  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
252  * op/pragma/warn/regcomp.
253  */
254 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
255 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
256 
257 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
258 
259 /*
260  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
261  * arg. Show regex, up to a maximum length. If it's too long, chop and add
262  * "...".
263  */
264 #define	FAIL(msg) STMT_START {						\
265     char *ellipses = "";						\
266     IV len = RExC_end - RExC_precomp;					\
267 									\
268     if (!SIZE_ONLY)							\
269 	SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);			\
270     if (len > RegexLengthToShowInErrorMessages) {			\
271 	/* chop 10 shorter than the max, to ensure meaning of "..." */	\
272 	len = RegexLengthToShowInErrorMessages - 10;			\
273 	ellipses = "...";						\
274     }									\
275     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",				\
276 	    msg, (int)len, RExC_precomp, ellipses);			\
277 } STMT_END
278 
279 /*
280  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
281  * args. Show regex, up to a maximum length. If it's too long, chop and add
282  * "...".
283  */
284 #define	FAIL2(pat,msg) STMT_START {					\
285     char *ellipses = "";						\
286     IV len = RExC_end - RExC_precomp;					\
287 									\
288     if (!SIZE_ONLY)							\
289 	SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);			\
290     if (len > RegexLengthToShowInErrorMessages) {			\
291 	/* chop 10 shorter than the max, to ensure meaning of "..." */	\
292 	len = RegexLengthToShowInErrorMessages - 10;			\
293 	ellipses = "...";						\
294     }									\
295     S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/",			\
296 	    msg, (int)len, RExC_precomp, ellipses);			\
297 } STMT_END
298 
299 
300 /*
301  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
302  */
303 #define	Simple_vFAIL(m) STMT_START {					\
304     IV offset = RExC_parse - RExC_precomp;				\
305     Perl_croak(aTHX_ "%s" REPORT_LOCATION,				\
306 	    m, (int)offset, RExC_precomp, RExC_precomp + offset);	\
307 } STMT_END
308 
309 /*
310  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
311  */
312 #define	vFAIL(m) STMT_START {				\
313     if (!SIZE_ONLY)					\
314 	SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);	\
315     Simple_vFAIL(m);					\
316 } STMT_END
317 
318 /*
319  * Like Simple_vFAIL(), but accepts two arguments.
320  */
321 #define	Simple_vFAIL2(m,a1) STMT_START {			\
322     IV offset = RExC_parse - RExC_precomp;			\
323     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,			\
324 	    (int)offset, RExC_precomp, RExC_precomp + offset);	\
325 } STMT_END
326 
327 /*
328  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
329  */
330 #define	vFAIL2(m,a1) STMT_START {			\
331     if (!SIZE_ONLY)					\
332 	SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);	\
333     Simple_vFAIL2(m, a1);				\
334 } STMT_END
335 
336 
337 /*
338  * Like Simple_vFAIL(), but accepts three arguments.
339  */
340 #define	Simple_vFAIL3(m, a1, a2) STMT_START {			\
341     IV offset = RExC_parse - RExC_precomp;			\
342     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,		\
343 	    (int)offset, RExC_precomp, RExC_precomp + offset);	\
344 } STMT_END
345 
346 /*
347  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
348  */
349 #define	vFAIL3(m,a1,a2) STMT_START {			\
350     if (!SIZE_ONLY)					\
351 	SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);	\
352     Simple_vFAIL3(m, a1, a2);				\
353 } STMT_END
354 
355 /*
356  * Like Simple_vFAIL(), but accepts four arguments.
357  */
358 #define	Simple_vFAIL4(m, a1, a2, a3) STMT_START {		\
359     IV offset = RExC_parse - RExC_precomp;			\
360     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,		\
361 	    (int)offset, RExC_precomp, RExC_precomp + offset);	\
362 } STMT_END
363 
364 /*
365  * Like Simple_vFAIL(), but accepts five arguments.
366  */
367 #define	Simple_vFAIL5(m, a1, a2, a3, a4) STMT_START {		\
368     IV offset = RExC_parse - RExC_precomp;			\
369     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,	\
370 	    (int)offset, RExC_precomp, RExC_precomp + offset);	\
371 } STMT_END
372 
373 
374 #define	vWARN(loc,m) STMT_START {					\
375     IV offset = loc - RExC_precomp;					\
376     Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,	\
377 	    m, (int)offset, RExC_precomp, RExC_precomp + offset);	\
378 } STMT_END
379 
380 #define	vWARNdep(loc,m) STMT_START {					\
381     IV offset = loc - RExC_precomp;					\
382     Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),		\
383 	    "%s" REPORT_LOCATION,					\
384 	    m, (int)offset, RExC_precomp, RExC_precomp + offset);	\
385 } STMT_END
386 
387 
388 #define	vWARN2(loc, m, a1) STMT_START {					\
389     IV offset = loc - RExC_precomp;					\
390     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,		\
391 	    a1, (int)offset, RExC_precomp, RExC_precomp + offset);	\
392 } STMT_END
393 
394 #define	vWARN3(loc, m, a1, a2) STMT_START {				\
395     IV offset = loc - RExC_precomp;					\
396     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,		\
397 	    a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);	\
398 } STMT_END
399 
400 #define	vWARN4(loc, m, a1, a2, a3) STMT_START {				\
401     IV offset = loc - RExC_precomp;					\
402     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,		\
403 	    a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
404 } STMT_END
405 
406 #define	vWARN5(loc, m, a1, a2, a3, a4) STMT_START {			\
407     IV offset = loc - RExC_precomp;					\
408     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,		\
409 	    a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
410 } STMT_END
411 
412 
413 /* Allow for side effects in s */
414 #define REGC(c,s) STMT_START {			\
415     if (!SIZE_ONLY) *(s) = (c); else (void)(s);	\
416 } STMT_END
417 
418 /* Macros for recording node offsets.   20001227 mjd@plover.com
419  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
420  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
421  * Element 0 holds the number n.
422  */
423 
424 #define MJD_OFFSET_DEBUG(x)
425 /* #define MJD_OFFSET_DEBUG(x) Perl_warn_nocontext x */
426 
427 
428 #define Set_Node_Offset_To_R(node,byte) STMT_START {			\
429     if (! SIZE_ONLY) {							\
430 	MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",		\
431 		__LINE__, (node), (byte)));				\
432 	if((node) < 0) {						\
433 	    Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
434 	} else {							\
435 	    RExC_offsets[2*(node)-1] = (byte);				\
436 	}								\
437     }									\
438 } STMT_END
439 
440 #define Set_Node_Offset(node,byte) \
441     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
442 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
443 
444 #define Set_Node_Length_To_R(node,len) STMT_START {			\
445     if (! SIZE_ONLY) {							\
446 	MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",		\
447 		__LINE__, (node), (len)));				\
448 	if((node) < 0) {						\
449 	    Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
450 	} else {							\
451 	    RExC_offsets[2*(node)] = (len);				\
452 	}								\
453     }									\
454 } STMT_END
455 
456 #define Set_Node_Length(node,len) \
457     Set_Node_Length_To_R((node)-RExC_emit_start, len)
458 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
459 #define Set_Node_Cur_Length(node) \
460     Set_Node_Length(node, RExC_parse - parse_start)
461 
462 /* Get offsets and lengths */
463 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
464 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
465 
466 static void clear_re(pTHX_ void *r);
467 
468 /* Mark that we cannot extend a found fixed substring at this point.
469    Updata the longest found anchored substring and the longest found
470    floating substrings if needed. */
471 
472 STATIC void
S_scan_commit(pTHX_ RExC_state_t * pRExC_state,scan_data_t * data)473 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
474 {
475     STRLEN l = CHR_SVLEN(data->last_found);
476     STRLEN old_l = CHR_SVLEN(*data->longest);
477 
478     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
479 	SvSetMagicSV(*data->longest, data->last_found);
480 	if (*data->longest == data->longest_fixed) {
481 	    data->offset_fixed = l ? data->last_start_min : data->pos_min;
482 	    if (data->flags & SF_BEFORE_EOL)
483 		data->flags
484 		    |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
485 	    else
486 		data->flags &= ~SF_FIX_BEFORE_EOL;
487 	}
488 	else {
489 	    data->offset_float_min = l ? data->last_start_min : data->pos_min;
490 	    data->offset_float_max = (l
491 				      ? data->last_start_max
492 				      : data->pos_min + data->pos_delta);
493 	    if ((U32)data->offset_float_max > (U32)I32_MAX)
494 		data->offset_float_max = I32_MAX;
495 	    if (data->flags & SF_BEFORE_EOL)
496 		data->flags
497 		    |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
498 	    else
499 		data->flags &= ~SF_FL_BEFORE_EOL;
500 	}
501     }
502     SvCUR_set(data->last_found, 0);
503     {
504 	SV * sv = data->last_found;
505 	MAGIC *mg =
506 	    SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
507 	if (mg && mg->mg_len > 0)
508 	    mg->mg_len = 0;
509     }
510     data->last_end = -1;
511     data->flags &= ~SF_BEFORE_EOL;
512 }
513 
514 /* Can match anything (initialization) */
515 STATIC void
S_cl_anything(pTHX_ RExC_state_t * pRExC_state,struct regnode_charclass_class * cl)516 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
517 {
518     ANYOF_CLASS_ZERO(cl);
519     ANYOF_BITMAP_SETALL(cl);
520     cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
521     if (LOC)
522 	cl->flags |= ANYOF_LOCALE;
523 }
524 
525 /* Can match anything (initialization) */
526 STATIC int
S_cl_is_anything(pTHX_ struct regnode_charclass_class * cl)527 S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
528 {
529     int value;
530 
531     for (value = 0; value <= ANYOF_MAX; value += 2)
532 	if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
533 	    return 1;
534     if (!(cl->flags & ANYOF_UNICODE_ALL))
535 	return 0;
536     if (!ANYOF_BITMAP_TESTALLSET(cl))
537 	return 0;
538     return 1;
539 }
540 
541 /* Can match anything (initialization) */
542 STATIC void
S_cl_init(pTHX_ RExC_state_t * pRExC_state,struct regnode_charclass_class * cl)543 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
544 {
545     Zero(cl, 1, struct regnode_charclass_class);
546     cl->type = ANYOF;
547     cl_anything(pRExC_state, cl);
548 }
549 
550 STATIC void
S_cl_init_zero(pTHX_ RExC_state_t * pRExC_state,struct regnode_charclass_class * cl)551 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
552 {
553     Zero(cl, 1, struct regnode_charclass_class);
554     cl->type = ANYOF;
555     cl_anything(pRExC_state, cl);
556     if (LOC)
557 	cl->flags |= ANYOF_LOCALE;
558 }
559 
560 /* 'And' a given class with another one.  Can create false positives */
561 /* We assume that cl is not inverted */
562 STATIC void
S_cl_and(pTHX_ struct regnode_charclass_class * cl,struct regnode_charclass_class * and_with)563 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
564 	 struct regnode_charclass_class *and_with)
565 {
566     if (!(and_with->flags & ANYOF_CLASS)
567 	&& !(cl->flags & ANYOF_CLASS)
568 	&& (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
569 	&& !(and_with->flags & ANYOF_FOLD)
570 	&& !(cl->flags & ANYOF_FOLD)) {
571 	int i;
572 
573 	if (and_with->flags & ANYOF_INVERT)
574 	    for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
575 		cl->bitmap[i] &= ~and_with->bitmap[i];
576 	else
577 	    for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
578 		cl->bitmap[i] &= and_with->bitmap[i];
579     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
580     if (!(and_with->flags & ANYOF_EOS))
581 	cl->flags &= ~ANYOF_EOS;
582 
583     if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
584 	!(and_with->flags & ANYOF_INVERT)) {
585 	cl->flags &= ~ANYOF_UNICODE_ALL;
586 	cl->flags |= ANYOF_UNICODE;
587 	ARG_SET(cl, ARG(and_with));
588     }
589     if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
590 	!(and_with->flags & ANYOF_INVERT))
591 	cl->flags &= ~ANYOF_UNICODE_ALL;
592     if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
593 	!(and_with->flags & ANYOF_INVERT))
594 	cl->flags &= ~ANYOF_UNICODE;
595 }
596 
597 /* 'OR' a given class with another one.  Can create false positives */
598 /* We assume that cl is not inverted */
599 STATIC void
S_cl_or(pTHX_ RExC_state_t * pRExC_state,struct regnode_charclass_class * cl,struct regnode_charclass_class * or_with)600 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
601 {
602     if (or_with->flags & ANYOF_INVERT) {
603 	/* We do not use
604 	 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
605 	 *   <= (B1 | !B2) | (CL1 | !CL2)
606 	 * which is wasteful if CL2 is small, but we ignore CL2:
607 	 *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
608 	 * XXXX Can we handle case-fold?  Unclear:
609 	 *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
610 	 *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
611 	 */
612 	if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
613 	     && !(or_with->flags & ANYOF_FOLD)
614 	     && !(cl->flags & ANYOF_FOLD) ) {
615 	    int i;
616 
617 	    for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
618 		cl->bitmap[i] |= ~or_with->bitmap[i];
619 	} /* XXXX: logic is complicated otherwise */
620 	else {
621 	    cl_anything(pRExC_state, cl);
622 	}
623     } else {
624 	/* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
625 	if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
626 	     && (!(or_with->flags & ANYOF_FOLD)
627 		 || (cl->flags & ANYOF_FOLD)) ) {
628 	    int i;
629 
630 	    /* OR char bitmap and class bitmap separately */
631 	    for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
632 		cl->bitmap[i] |= or_with->bitmap[i];
633 	    if (or_with->flags & ANYOF_CLASS) {
634 		for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
635 		    cl->classflags[i] |= or_with->classflags[i];
636 		cl->flags |= ANYOF_CLASS;
637 	    }
638 	}
639 	else { /* XXXX: logic is complicated, leave it along for a moment. */
640 	    cl_anything(pRExC_state, cl);
641 	}
642     }
643     if (or_with->flags & ANYOF_EOS)
644 	cl->flags |= ANYOF_EOS;
645 
646     if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
647 	ARG(cl) != ARG(or_with)) {
648 	cl->flags |= ANYOF_UNICODE_ALL;
649 	cl->flags &= ~ANYOF_UNICODE;
650     }
651     if (or_with->flags & ANYOF_UNICODE_ALL) {
652 	cl->flags |= ANYOF_UNICODE_ALL;
653 	cl->flags &= ~ANYOF_UNICODE;
654     }
655 }
656 
657 /*
658  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
659  * These need to be revisited when a newer toolchain becomes available.
660  */
661 #if defined(__sparc64__) && defined(__GNUC__)
662 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
663 #       undef  SPARC64_GCC_WORKAROUND
664 #       define SPARC64_GCC_WORKAROUND 1
665 #   endif
666 #endif
667 
668 /* REx optimizer.  Converts nodes into quickier variants "in place".
669    Finds fixed substrings.  */
670 
671 /* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
672    to the position after last scanned or to NULL. */
673 
674 STATIC I32
S_study_chunk(pTHX_ RExC_state_t * pRExC_state,regnode ** scanp,I32 * deltap,regnode * last,scan_data_t * data,U32 flags)675 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
676 			/* scanp: Start here (read-write). */
677 			/* deltap: Write maxlen-minlen here. */
678 			/* last: Stop before this one. */
679 {
680     I32 min = 0, pars = 0, code;
681     regnode *scan = *scanp, *next;
682     I32 delta = 0;
683     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
684     int is_inf_internal = 0;		/* The studied chunk is infinite */
685     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
686     scan_data_t data_fake;
687     struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
688 
689     while (scan && OP(scan) != END && scan < last) {
690 	/* Peephole optimizer: */
691 
692 	if (PL_regkind[(U8)OP(scan)] == EXACT) {
693 	    /* Merge several consecutive EXACTish nodes into one. */
694 	    regnode *n = regnext(scan);
695 	    U32 stringok = 1;
696 #ifdef DEBUGGING
697 	    regnode *stop = scan;
698 #endif
699 
700 	    next = scan + NODE_SZ_STR(scan);
701 	    /* Skip NOTHING, merge EXACT*. */
702 	    while (n &&
703 		   ( PL_regkind[(U8)OP(n)] == NOTHING ||
704 		     (stringok && (OP(n) == OP(scan))))
705 		   && NEXT_OFF(n)
706 		   && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
707 		if (OP(n) == TAIL || n > next)
708 		    stringok = 0;
709 		if (PL_regkind[(U8)OP(n)] == NOTHING) {
710 		    NEXT_OFF(scan) += NEXT_OFF(n);
711 		    next = n + NODE_STEP_REGNODE;
712 #ifdef DEBUGGING
713 		    if (stringok)
714 			stop = n;
715 #endif
716 		    n = regnext(n);
717 		}
718 		else if (stringok) {
719 		    int oldl = STR_LEN(scan);
720 		    regnode *nnext = regnext(n);
721 
722 		    if (oldl + STR_LEN(n) > U8_MAX)
723 			break;
724 		    NEXT_OFF(scan) += NEXT_OFF(n);
725 		    STR_LEN(scan) += STR_LEN(n);
726 		    next = n + NODE_SZ_STR(n);
727 		    /* Now we can overwrite *n : */
728 		    Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
729 #ifdef DEBUGGING
730 		    stop = next - 1;
731 #endif
732 		    n = nnext;
733 		}
734 	    }
735 
736 	    if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
737 /*
738   Two problematic code points in Unicode casefolding of EXACT nodes:
739 
740    U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
741    U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
742 
743    which casefold to
744 
745    Unicode			UTF-8
746 
747    U+03B9 U+0308 U+0301		0xCE 0xB9 0xCC 0x88 0xCC 0x81
748    U+03C5 U+0308 U+0301		0xCF 0x85 0xCC 0x88 0xCC 0x81
749 
750    This means that in case-insensitive matching (or "loose matching",
751    as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
752    length of the above casefolded versions) can match a target string
753    of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
754    This would rather mess up the minimum length computation.
755 
756    What we'll do is to look for the tail four bytes, and then peek
757    at the preceding two bytes to see whether we need to decrease
758    the minimum length by four (six minus two).
759 
760    Thanks to the design of UTF-8, there cannot be false matches:
761    A sequence of valid UTF-8 bytes cannot be a subsequence of
762    another valid sequence of UTF-8 bytes.
763 
764 */
765 		 char *s0 = STRING(scan), *s, *t;
766 		 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
767 		 char *t0 = "\xcc\x88\xcc\x81";
768 		 char *t1 = t0 + 3;
769 
770 		 for (s = s0 + 2;
771 		      s < s2 && (t = ninstr(s, s1, t0, t1));
772 		      s = t + 4) {
773 		      if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
774 			  ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
775 			   min -= 4;
776 		 }
777 	    }
778 
779 #ifdef DEBUGGING
780 	    /* Allow dumping */
781 	    n = scan + NODE_SZ_STR(scan);
782 	    while (n <= stop) {
783 		if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
784 		    OP(n) = OPTIMIZED;
785 		    NEXT_OFF(n) = 0;
786 		}
787 		n++;
788 	    }
789 #endif
790 	}
791 	/* Follow the next-chain of the current node and optimize
792 	   away all the NOTHINGs from it.  */
793 	if (OP(scan) != CURLYX) {
794 	    int max = (reg_off_by_arg[OP(scan)]
795 		       ? I32_MAX
796 		       /* I32 may be smaller than U16 on CRAYs! */
797 		       : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
798 	    int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
799 	    int noff;
800 	    regnode *n = scan;
801 
802 	    /* Skip NOTHING and LONGJMP. */
803 	    while ((n = regnext(n))
804 		   && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
805 		       || ((OP(n) == LONGJMP) && (noff = ARG(n))))
806 		   && off + noff < max)
807 		off += noff;
808 	    if (reg_off_by_arg[OP(scan)])
809 		ARG(scan) = off;
810 	    else
811 		NEXT_OFF(scan) = off;
812 	}
813 	/* The principal pseudo-switch.  Cannot be a switch, since we
814 	   look into several different things.  */
815 	if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
816 		   || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
817 	    next = regnext(scan);
818 	    code = OP(scan);
819 
820 	    if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
821 		I32 max1 = 0, min1 = I32_MAX, num = 0;
822 		struct regnode_charclass_class accum;
823 
824 		if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
825 		    scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
826 		if (flags & SCF_DO_STCLASS)
827 		    cl_init_zero(pRExC_state, &accum);
828 		while (OP(scan) == code) {
829 		    I32 deltanext, minnext, f = 0, fake;
830 		    struct regnode_charclass_class this_class;
831 
832 		    num++;
833 		    data_fake.flags = 0;
834 		    if (data) {
835 			data_fake.whilem_c = data->whilem_c;
836 			data_fake.last_closep = data->last_closep;
837 		    }
838 		    else
839 			data_fake.last_closep = &fake;
840 		    next = regnext(scan);
841 		    scan = NEXTOPER(scan);
842 		    if (code != BRANCH)
843 			scan = NEXTOPER(scan);
844 		    if (flags & SCF_DO_STCLASS) {
845 			cl_init(pRExC_state, &this_class);
846 			data_fake.start_class = &this_class;
847 			f = SCF_DO_STCLASS_AND;
848 		    }
849 		    if (flags & SCF_WHILEM_VISITED_POS)
850 			f |= SCF_WHILEM_VISITED_POS;
851 		    /* we suppose the run is continuous, last=next...*/
852 		    minnext = study_chunk(pRExC_state, &scan, &deltanext,
853 					  next, &data_fake, f);
854 		    if (min1 > minnext)
855 			min1 = minnext;
856 		    if (max1 < minnext + deltanext)
857 			max1 = minnext + deltanext;
858 		    if (deltanext == I32_MAX)
859 			is_inf = is_inf_internal = 1;
860 		    scan = next;
861 		    if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
862 			pars++;
863 		    if (data && (data_fake.flags & SF_HAS_EVAL))
864 			data->flags |= SF_HAS_EVAL;
865 		    if (data)
866 			data->whilem_c = data_fake.whilem_c;
867 		    if (flags & SCF_DO_STCLASS)
868 			cl_or(pRExC_state, &accum, &this_class);
869 		    if (code == SUSPEND)
870 			break;
871 		}
872 		if (code == IFTHEN && num < 2) /* Empty ELSE branch */
873 		    min1 = 0;
874 		if (flags & SCF_DO_SUBSTR) {
875 		    data->pos_min += min1;
876 		    data->pos_delta += max1 - min1;
877 		    if (max1 != min1 || is_inf)
878 			data->longest = &(data->longest_float);
879 		}
880 		min += min1;
881 		delta += max1 - min1;
882 		if (flags & SCF_DO_STCLASS_OR) {
883 		    cl_or(pRExC_state, data->start_class, &accum);
884 		    if (min1) {
885 			cl_and(data->start_class, &and_with);
886 			flags &= ~SCF_DO_STCLASS;
887 		    }
888 		}
889 		else if (flags & SCF_DO_STCLASS_AND) {
890 		    if (min1) {
891 			cl_and(data->start_class, &accum);
892 			flags &= ~SCF_DO_STCLASS;
893 		    }
894 		    else {
895 			/* Switch to OR mode: cache the old value of
896 			 * data->start_class */
897 			StructCopy(data->start_class, &and_with,
898 				   struct regnode_charclass_class);
899 			flags &= ~SCF_DO_STCLASS_AND;
900 			StructCopy(&accum, data->start_class,
901 				   struct regnode_charclass_class);
902 			flags |= SCF_DO_STCLASS_OR;
903 			data->start_class->flags |= ANYOF_EOS;
904 		    }
905 		}
906 	    }
907 	    else if (code == BRANCHJ)	/* single branch is optimized. */
908 		scan = NEXTOPER(NEXTOPER(scan));
909 	    else			/* single branch is optimized. */
910 		scan = NEXTOPER(scan);
911 	    continue;
912 	}
913 	else if (OP(scan) == EXACT) {
914 	    I32 l = STR_LEN(scan);
915 	    UV uc = *((U8*)STRING(scan));
916 	    if (UTF) {
917 		U8 *s = (U8*)STRING(scan);
918 		l = utf8_length(s, s + l);
919 		uc = utf8_to_uvchr(s, NULL);
920 	    }
921 	    min += l;
922 	    if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
923 		/* The code below prefers earlier match for fixed
924 		   offset, later match for variable offset.  */
925 		if (data->last_end == -1) { /* Update the start info. */
926 		    data->last_start_min = data->pos_min;
927  		    data->last_start_max = is_inf
928  			? I32_MAX : data->pos_min + data->pos_delta;
929 		}
930 		sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
931 		{
932 		    SV * sv = data->last_found;
933 		    MAGIC *mg = SvUTF8(sv) && SvMAGICAL(sv) ?
934 			mg_find(sv, PERL_MAGIC_utf8) : NULL;
935 		    if (mg && mg->mg_len >= 0)
936 			mg->mg_len += utf8_length((U8*)STRING(scan),
937 						  (U8*)STRING(scan)+STR_LEN(scan));
938 		}
939 		if (UTF)
940 		    SvUTF8_on(data->last_found);
941 		data->last_end = data->pos_min + l;
942 		data->pos_min += l; /* As in the first entry. */
943 		data->flags &= ~SF_BEFORE_EOL;
944 	    }
945 	    if (flags & SCF_DO_STCLASS_AND) {
946 		/* Check whether it is compatible with what we know already! */
947 		int compat = 1;
948 
949 		if (uc >= 0x100 ||
950 		    (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
951 		    && !ANYOF_BITMAP_TEST(data->start_class, uc)
952 		    && (!(data->start_class->flags & ANYOF_FOLD)
953 			|| !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
954                     )
955 		    compat = 0;
956 		ANYOF_CLASS_ZERO(data->start_class);
957 		ANYOF_BITMAP_ZERO(data->start_class);
958 		if (compat)
959 		    ANYOF_BITMAP_SET(data->start_class, uc);
960 		data->start_class->flags &= ~ANYOF_EOS;
961 		if (uc < 0x100)
962 		  data->start_class->flags &= ~ANYOF_UNICODE_ALL;
963 	    }
964 	    else if (flags & SCF_DO_STCLASS_OR) {
965 		/* false positive possible if the class is case-folded */
966 		if (uc < 0x100)
967 		    ANYOF_BITMAP_SET(data->start_class, uc);
968 		else
969 		    data->start_class->flags |= ANYOF_UNICODE_ALL;
970 		data->start_class->flags &= ~ANYOF_EOS;
971 		cl_and(data->start_class, &and_with);
972 	    }
973 	    flags &= ~SCF_DO_STCLASS;
974 	}
975 	else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
976 	    I32 l = STR_LEN(scan);
977 	    UV uc = *((U8*)STRING(scan));
978 
979 	    /* Search for fixed substrings supports EXACT only. */
980 	    if (flags & SCF_DO_SUBSTR)
981 		scan_commit(pRExC_state, data);
982 	    if (UTF) {
983 		U8 *s = (U8 *)STRING(scan);
984 		l = utf8_length(s, s + l);
985 		uc = utf8_to_uvchr(s, NULL);
986 	    }
987 	    min += l;
988 	    if (data && (flags & SCF_DO_SUBSTR))
989 		data->pos_min += l;
990 	    if (flags & SCF_DO_STCLASS_AND) {
991 		/* Check whether it is compatible with what we know already! */
992 		int compat = 1;
993 
994 		if (uc >= 0x100 ||
995 		    (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
996 		    && !ANYOF_BITMAP_TEST(data->start_class, uc)
997 		     && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
998 		    compat = 0;
999 		ANYOF_CLASS_ZERO(data->start_class);
1000 		ANYOF_BITMAP_ZERO(data->start_class);
1001 		if (compat) {
1002 		    ANYOF_BITMAP_SET(data->start_class, uc);
1003 		    data->start_class->flags &= ~ANYOF_EOS;
1004 		    data->start_class->flags |= ANYOF_FOLD;
1005 		    if (OP(scan) == EXACTFL)
1006 			data->start_class->flags |= ANYOF_LOCALE;
1007 		}
1008 	    }
1009 	    else if (flags & SCF_DO_STCLASS_OR) {
1010 		if (data->start_class->flags & ANYOF_FOLD) {
1011 		    /* false positive possible if the class is case-folded.
1012 		       Assume that the locale settings are the same... */
1013 		    if (uc < 0x100)
1014 			ANYOF_BITMAP_SET(data->start_class, uc);
1015 		    data->start_class->flags &= ~ANYOF_EOS;
1016 		}
1017 		cl_and(data->start_class, &and_with);
1018 	    }
1019 	    flags &= ~SCF_DO_STCLASS;
1020 	}
1021 	else if (strchr((char*)PL_varies,OP(scan))) {
1022 	    I32 mincount, maxcount, minnext, deltanext, fl = 0;
1023 	    I32 f = flags, pos_before = 0;
1024 	    regnode *oscan = scan;
1025 	    struct regnode_charclass_class this_class;
1026 	    struct regnode_charclass_class *oclass = NULL;
1027 	    I32 next_is_eval = 0;
1028 
1029 	    switch (PL_regkind[(U8)OP(scan)]) {
1030 	    case WHILEM:		/* End of (?:...)* . */
1031 		scan = NEXTOPER(scan);
1032 		goto finish;
1033 	    case PLUS:
1034 		if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
1035 		    next = NEXTOPER(scan);
1036 		    if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
1037 			mincount = 1;
1038 			maxcount = REG_INFTY;
1039 			next = regnext(scan);
1040 			scan = NEXTOPER(scan);
1041 			goto do_curly;
1042 		    }
1043 		}
1044 		if (flags & SCF_DO_SUBSTR)
1045 		    data->pos_min++;
1046 		min++;
1047 		/* Fall through. */
1048 	    case STAR:
1049 		if (flags & SCF_DO_STCLASS) {
1050 		    mincount = 0;
1051 		    maxcount = REG_INFTY;
1052 		    next = regnext(scan);
1053 		    scan = NEXTOPER(scan);
1054 		    goto do_curly;
1055 		}
1056 		is_inf = is_inf_internal = 1;
1057 		scan = regnext(scan);
1058 		if (flags & SCF_DO_SUBSTR) {
1059 		    scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
1060 		    data->longest = &(data->longest_float);
1061 		}
1062 		goto optimize_curly_tail;
1063 	    case CURLY:
1064 		mincount = ARG1(scan);
1065 		maxcount = ARG2(scan);
1066 		next = regnext(scan);
1067 		if (OP(scan) == CURLYX) {
1068 		    I32 lp = (data ? *(data->last_closep) : 0);
1069 
1070 		    scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1071 		}
1072 		scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1073 		next_is_eval = (OP(scan) == EVAL);
1074 	      do_curly:
1075 		if (flags & SCF_DO_SUBSTR) {
1076 		    if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
1077 		    pos_before = data->pos_min;
1078 		}
1079 		if (data) {
1080 		    fl = data->flags;
1081 		    data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1082 		    if (is_inf)
1083 			data->flags |= SF_IS_INF;
1084 		}
1085 		if (flags & SCF_DO_STCLASS) {
1086 		    cl_init(pRExC_state, &this_class);
1087 		    oclass = data->start_class;
1088 		    data->start_class = &this_class;
1089 		    f |= SCF_DO_STCLASS_AND;
1090 		    f &= ~SCF_DO_STCLASS_OR;
1091 		}
1092 		/* These are the cases when once a subexpression
1093 		   fails at a particular position, it cannot succeed
1094 		   even after backtracking at the enclosing scope.
1095 
1096 		   XXXX what if minimal match and we are at the
1097 		        initial run of {n,m}? */
1098 		if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1099 		    f &= ~SCF_WHILEM_VISITED_POS;
1100 
1101 		/* This will finish on WHILEM, setting scan, or on NULL: */
1102 		minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1103 				      mincount == 0
1104 					? (f & ~SCF_DO_SUBSTR) : f);
1105 
1106 		if (flags & SCF_DO_STCLASS)
1107 		    data->start_class = oclass;
1108 		if (mincount == 0 || minnext == 0) {
1109 		    if (flags & SCF_DO_STCLASS_OR) {
1110 			cl_or(pRExC_state, data->start_class, &this_class);
1111 		    }
1112 		    else if (flags & SCF_DO_STCLASS_AND) {
1113 			/* Switch to OR mode: cache the old value of
1114 			 * data->start_class */
1115 			StructCopy(data->start_class, &and_with,
1116 				   struct regnode_charclass_class);
1117 			flags &= ~SCF_DO_STCLASS_AND;
1118 			StructCopy(&this_class, data->start_class,
1119 				   struct regnode_charclass_class);
1120 			flags |= SCF_DO_STCLASS_OR;
1121 			data->start_class->flags |= ANYOF_EOS;
1122 		    }
1123 		} else {		/* Non-zero len */
1124 		    if (flags & SCF_DO_STCLASS_OR) {
1125 			cl_or(pRExC_state, data->start_class, &this_class);
1126 			cl_and(data->start_class, &and_with);
1127 		    }
1128 		    else if (flags & SCF_DO_STCLASS_AND)
1129 			cl_and(data->start_class, &this_class);
1130 		    flags &= ~SCF_DO_STCLASS;
1131 		}
1132 		if (!scan) 		/* It was not CURLYX, but CURLY. */
1133 		    scan = next;
1134 		if (ckWARN(WARN_REGEXP)
1135 		       /* ? quantifier ok, except for (?{ ... }) */
1136 		    && (next_is_eval || !(mincount == 0 && maxcount == 1))
1137 		    && (minnext == 0) && (deltanext == 0)
1138 		    && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
1139 		    && maxcount <= REG_INFTY/3) /* Complement check for big count */
1140 		{
1141 		    vWARN(RExC_parse,
1142 			  "Quantifier unexpected on zero-length expression");
1143 		}
1144 
1145 		min += minnext * mincount;
1146 		is_inf_internal |= ((maxcount == REG_INFTY
1147 				     && (minnext + deltanext) > 0)
1148 				    || deltanext == I32_MAX);
1149 		is_inf |= is_inf_internal;
1150 		delta += (minnext + deltanext) * maxcount - minnext * mincount;
1151 
1152 		/* Try powerful optimization CURLYX => CURLYN. */
1153 		if (  OP(oscan) == CURLYX && data
1154 		      && data->flags & SF_IN_PAR
1155 		      && !(data->flags & SF_HAS_EVAL)
1156 		      && !deltanext && minnext == 1 ) {
1157 		    /* Try to optimize to CURLYN.  */
1158 		    regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1159 		    regnode *nxt1 = nxt;
1160 #ifdef DEBUGGING
1161 		    regnode *nxt2;
1162 #endif
1163 
1164 		    /* Skip open. */
1165 		    nxt = regnext(nxt);
1166 		    if (!strchr((char*)PL_simple,OP(nxt))
1167 			&& !(PL_regkind[(U8)OP(nxt)] == EXACT
1168 			     && STR_LEN(nxt) == 1))
1169 			goto nogo;
1170 #ifdef DEBUGGING
1171 		    nxt2 = nxt;
1172 #endif
1173 		    nxt = regnext(nxt);
1174 		    if (OP(nxt) != CLOSE)
1175 			goto nogo;
1176 		    /* Now we know that nxt2 is the only contents: */
1177 		    oscan->flags = (U8)ARG(nxt);
1178 		    OP(oscan) = CURLYN;
1179 		    OP(nxt1) = NOTHING;	/* was OPEN. */
1180 #ifdef DEBUGGING
1181 		    OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1182 		    NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1183 		    NEXT_OFF(nxt2) = 0;	/* just for consistancy with CURLY. */
1184 		    OP(nxt) = OPTIMIZED;	/* was CLOSE. */
1185 		    OP(nxt + 1) = OPTIMIZED; /* was count. */
1186 		    NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
1187 #endif
1188 		}
1189 	      nogo:
1190 
1191 		/* Try optimization CURLYX => CURLYM. */
1192 		if (  OP(oscan) == CURLYX && data
1193 		      && !(data->flags & SF_HAS_PAR)
1194 		      && !(data->flags & SF_HAS_EVAL)
1195 		      && !deltanext  ) {
1196 		    /* XXXX How to optimize if data == 0? */
1197 		    /* Optimize to a simpler form.  */
1198 		    regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1199 		    regnode *nxt2;
1200 
1201 		    OP(oscan) = CURLYM;
1202 		    while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
1203 			    && (OP(nxt2) != WHILEM))
1204 			nxt = nxt2;
1205 		    OP(nxt2)  = SUCCEED; /* Whas WHILEM */
1206 		    /* Need to optimize away parenths. */
1207 		    if (data->flags & SF_IN_PAR) {
1208 			/* Set the parenth number.  */
1209 			regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1210 
1211 			if (OP(nxt) != CLOSE)
1212 			    FAIL("Panic opt close");
1213 			oscan->flags = (U8)ARG(nxt);
1214 			OP(nxt1) = OPTIMIZED;	/* was OPEN. */
1215 			OP(nxt) = OPTIMIZED;	/* was CLOSE. */
1216 #ifdef DEBUGGING
1217 			OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1218 			OP(nxt + 1) = OPTIMIZED; /* was count. */
1219 			NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1220 			NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
1221 #endif
1222 #if 0
1223 			while ( nxt1 && (OP(nxt1) != WHILEM)) {
1224 			    regnode *nnxt = regnext(nxt1);
1225 
1226 			    if (nnxt == nxt) {
1227 				if (reg_off_by_arg[OP(nxt1)])
1228 				    ARG_SET(nxt1, nxt2 - nxt1);
1229 				else if (nxt2 - nxt1 < U16_MAX)
1230 				    NEXT_OFF(nxt1) = nxt2 - nxt1;
1231 				else
1232 				    OP(nxt) = NOTHING;	/* Cannot beautify */
1233 			    }
1234 			    nxt1 = nnxt;
1235 			}
1236 #endif
1237 			/* Optimize again: */
1238 			study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
1239 				    NULL, 0);
1240 		    }
1241 		    else
1242 			oscan->flags = 0;
1243 		}
1244 		else if ((OP(oscan) == CURLYX)
1245 			 && (flags & SCF_WHILEM_VISITED_POS)
1246 			 /* See the comment on a similar expression above.
1247 			    However, this time it not a subexpression
1248 			    we care about, but the expression itself. */
1249 			 && (maxcount == REG_INFTY)
1250 			 && data && ++data->whilem_c < 16) {
1251 		    /* This stays as CURLYX, we can put the count/of pair. */
1252 		    /* Find WHILEM (as in regexec.c) */
1253 		    regnode *nxt = oscan + NEXT_OFF(oscan);
1254 
1255 		    if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1256 			nxt += ARG(nxt);
1257 		    PREVOPER(nxt)->flags = (U8)(data->whilem_c
1258 			| (RExC_whilem_seen << 4)); /* On WHILEM */
1259 		}
1260 		if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
1261 		    pars++;
1262 		if (flags & SCF_DO_SUBSTR) {
1263 		    SV *last_str = Nullsv;
1264 		    int counted = mincount != 0;
1265 
1266 		    if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
1267 #if defined(SPARC64_GCC_WORKAROUND)
1268 			I32 b = 0;
1269 			STRLEN l = 0;
1270 			char *s = NULL;
1271 			I32 old = 0;
1272 
1273 			if (pos_before >= data->last_start_min)
1274 			    b = pos_before;
1275 			else
1276 			    b = data->last_start_min;
1277 
1278 			l = 0;
1279 			s = SvPV(data->last_found, l);
1280 			old = b - data->last_start_min;
1281 
1282 #else
1283 			I32 b = pos_before >= data->last_start_min
1284 			    ? pos_before : data->last_start_min;
1285 			STRLEN l;
1286 			char *s = SvPV(data->last_found, l);
1287 			I32 old = b - data->last_start_min;
1288 #endif
1289 
1290 			if (UTF)
1291 			    old = utf8_hop((U8*)s, old) - (U8*)s;
1292 
1293 			l -= old;
1294 			/* Get the added string: */
1295 			last_str = newSVpvn(s  + old, l);
1296 			if (UTF)
1297 			    SvUTF8_on(last_str);
1298 			if (deltanext == 0 && pos_before == b) {
1299 			    /* What was added is a constant string */
1300 			    if (mincount > 1) {
1301 				SvGROW(last_str, (mincount * l) + 1);
1302 				repeatcpy(SvPVX(last_str) + l,
1303 					  SvPVX(last_str), l, mincount - 1);
1304 				SvCUR(last_str) *= mincount;
1305 				/* Add additional parts. */
1306 				SvCUR_set(data->last_found,
1307 					  SvCUR(data->last_found) - l);
1308 				sv_catsv(data->last_found, last_str);
1309 				{
1310 				    SV * sv = data->last_found;
1311 				    MAGIC *mg =
1312 					SvUTF8(sv) && SvMAGICAL(sv) ?
1313 					mg_find(sv, PERL_MAGIC_utf8) : NULL;
1314 				    if (mg && mg->mg_len >= 0)
1315 					mg->mg_len += CHR_SVLEN(last_str);
1316 				}
1317 				data->last_end += l * (mincount - 1);
1318 			    }
1319 			} else {
1320 			    /* start offset must point into the last copy */
1321 			    data->last_start_min += minnext * (mincount - 1);
1322 			    data->last_start_max += is_inf ? I32_MAX
1323 				: (maxcount - 1) * (minnext + data->pos_delta);
1324 			}
1325 		    }
1326 		    /* It is counted once already... */
1327 		    data->pos_min += minnext * (mincount - counted);
1328 		    data->pos_delta += - counted * deltanext +
1329 			(minnext + deltanext) * maxcount - minnext * mincount;
1330 		    if (mincount != maxcount) {
1331 			 /* Cannot extend fixed substrings found inside
1332 			    the group.  */
1333 			scan_commit(pRExC_state,data);
1334 			if (mincount && last_str) {
1335 			    sv_setsv(data->last_found, last_str);
1336 			    data->last_end = data->pos_min;
1337 			    data->last_start_min =
1338 				data->pos_min - CHR_SVLEN(last_str);
1339 			    data->last_start_max = is_inf
1340 				? I32_MAX
1341 				: data->pos_min + data->pos_delta
1342 				- CHR_SVLEN(last_str);
1343 			}
1344 			data->longest = &(data->longest_float);
1345 		    }
1346 		    SvREFCNT_dec(last_str);
1347 		}
1348 		if (data && (fl & SF_HAS_EVAL))
1349 		    data->flags |= SF_HAS_EVAL;
1350 	      optimize_curly_tail:
1351 		if (OP(oscan) != CURLYX) {
1352 		    while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
1353 			   && NEXT_OFF(next))
1354 			NEXT_OFF(oscan) += NEXT_OFF(next);
1355 		}
1356 		continue;
1357 	    default:			/* REF and CLUMP only? */
1358 		if (flags & SCF_DO_SUBSTR) {
1359 		    scan_commit(pRExC_state,data);	/* Cannot expect anything... */
1360 		    data->longest = &(data->longest_float);
1361 		}
1362 		is_inf = is_inf_internal = 1;
1363 		if (flags & SCF_DO_STCLASS_OR)
1364 		    cl_anything(pRExC_state, data->start_class);
1365 		flags &= ~SCF_DO_STCLASS;
1366 		break;
1367 	    }
1368 	}
1369 	else if (strchr((char*)PL_simple,OP(scan))) {
1370 	    int value = 0;
1371 
1372 	    if (flags & SCF_DO_SUBSTR) {
1373 		scan_commit(pRExC_state,data);
1374 		data->pos_min++;
1375 	    }
1376 	    min++;
1377 	    if (flags & SCF_DO_STCLASS) {
1378 		data->start_class->flags &= ~ANYOF_EOS;	/* No match on empty */
1379 
1380 		/* Some of the logic below assumes that switching
1381 		   locale on will only add false positives. */
1382 		switch (PL_regkind[(U8)OP(scan)]) {
1383 		case SANY:
1384 		default:
1385 		  do_default:
1386 		    /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1387 		    if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1388 			cl_anything(pRExC_state, data->start_class);
1389 		    break;
1390 		case REG_ANY:
1391 		    if (OP(scan) == SANY)
1392 			goto do_default;
1393 		    if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1394 			value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1395 				 || (data->start_class->flags & ANYOF_CLASS));
1396 			cl_anything(pRExC_state, data->start_class);
1397 		    }
1398 		    if (flags & SCF_DO_STCLASS_AND || !value)
1399 			ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1400 		    break;
1401 		case ANYOF:
1402 		    if (flags & SCF_DO_STCLASS_AND)
1403 			cl_and(data->start_class,
1404 			       (struct regnode_charclass_class*)scan);
1405 		    else
1406 			cl_or(pRExC_state, data->start_class,
1407 			      (struct regnode_charclass_class*)scan);
1408 		    break;
1409 		case ALNUM:
1410 		    if (flags & SCF_DO_STCLASS_AND) {
1411 			if (!(data->start_class->flags & ANYOF_LOCALE)) {
1412 			    ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1413 			    for (value = 0; value < 256; value++)
1414 				if (!isALNUM(value))
1415 				    ANYOF_BITMAP_CLEAR(data->start_class, value);
1416 			}
1417 		    }
1418 		    else {
1419 			if (data->start_class->flags & ANYOF_LOCALE)
1420 			    ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1421 			else {
1422 			    for (value = 0; value < 256; value++)
1423 				if (isALNUM(value))
1424 				    ANYOF_BITMAP_SET(data->start_class, value);
1425 			}
1426 		    }
1427 		    break;
1428 		case ALNUML:
1429 		    if (flags & SCF_DO_STCLASS_AND) {
1430 			if (data->start_class->flags & ANYOF_LOCALE)
1431 			    ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1432 		    }
1433 		    else {
1434 			ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1435 			data->start_class->flags |= ANYOF_LOCALE;
1436 		    }
1437 		    break;
1438 		case NALNUM:
1439 		    if (flags & SCF_DO_STCLASS_AND) {
1440 			if (!(data->start_class->flags & ANYOF_LOCALE)) {
1441 			    ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1442 			    for (value = 0; value < 256; value++)
1443 				if (isALNUM(value))
1444 				    ANYOF_BITMAP_CLEAR(data->start_class, value);
1445 			}
1446 		    }
1447 		    else {
1448 			if (data->start_class->flags & ANYOF_LOCALE)
1449 			    ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1450 			else {
1451 			    for (value = 0; value < 256; value++)
1452 				if (!isALNUM(value))
1453 				    ANYOF_BITMAP_SET(data->start_class, value);
1454 			}
1455 		    }
1456 		    break;
1457 		case NALNUML:
1458 		    if (flags & SCF_DO_STCLASS_AND) {
1459 			if (data->start_class->flags & ANYOF_LOCALE)
1460 			    ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1461 		    }
1462 		    else {
1463 			data->start_class->flags |= ANYOF_LOCALE;
1464 			ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1465 		    }
1466 		    break;
1467 		case SPACE:
1468 		    if (flags & SCF_DO_STCLASS_AND) {
1469 			if (!(data->start_class->flags & ANYOF_LOCALE)) {
1470 			    ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1471 			    for (value = 0; value < 256; value++)
1472 				if (!isSPACE(value))
1473 				    ANYOF_BITMAP_CLEAR(data->start_class, value);
1474 			}
1475 		    }
1476 		    else {
1477 			if (data->start_class->flags & ANYOF_LOCALE)
1478 			    ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1479 			else {
1480 			    for (value = 0; value < 256; value++)
1481 				if (isSPACE(value))
1482 				    ANYOF_BITMAP_SET(data->start_class, value);
1483 			}
1484 		    }
1485 		    break;
1486 		case SPACEL:
1487 		    if (flags & SCF_DO_STCLASS_AND) {
1488 			if (data->start_class->flags & ANYOF_LOCALE)
1489 			    ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1490 		    }
1491 		    else {
1492 			data->start_class->flags |= ANYOF_LOCALE;
1493 			ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1494 		    }
1495 		    break;
1496 		case NSPACE:
1497 		    if (flags & SCF_DO_STCLASS_AND) {
1498 			if (!(data->start_class->flags & ANYOF_LOCALE)) {
1499 			    ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1500 			    for (value = 0; value < 256; value++)
1501 				if (isSPACE(value))
1502 				    ANYOF_BITMAP_CLEAR(data->start_class, value);
1503 			}
1504 		    }
1505 		    else {
1506 			if (data->start_class->flags & ANYOF_LOCALE)
1507 			    ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1508 			else {
1509 			    for (value = 0; value < 256; value++)
1510 				if (!isSPACE(value))
1511 				    ANYOF_BITMAP_SET(data->start_class, value);
1512 			}
1513 		    }
1514 		    break;
1515 		case NSPACEL:
1516 		    if (flags & SCF_DO_STCLASS_AND) {
1517 			if (data->start_class->flags & ANYOF_LOCALE) {
1518 			    ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1519 			    for (value = 0; value < 256; value++)
1520 				if (!isSPACE(value))
1521 				    ANYOF_BITMAP_CLEAR(data->start_class, value);
1522 			}
1523 		    }
1524 		    else {
1525 			data->start_class->flags |= ANYOF_LOCALE;
1526 			ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1527 		    }
1528 		    break;
1529 		case DIGIT:
1530 		    if (flags & SCF_DO_STCLASS_AND) {
1531 			ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1532 			for (value = 0; value < 256; value++)
1533 			    if (!isDIGIT(value))
1534 				ANYOF_BITMAP_CLEAR(data->start_class, value);
1535 		    }
1536 		    else {
1537 			if (data->start_class->flags & ANYOF_LOCALE)
1538 			    ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1539 			else {
1540 			    for (value = 0; value < 256; value++)
1541 				if (isDIGIT(value))
1542 				    ANYOF_BITMAP_SET(data->start_class, value);
1543 			}
1544 		    }
1545 		    break;
1546 		case NDIGIT:
1547 		    if (flags & SCF_DO_STCLASS_AND) {
1548 			ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1549 			for (value = 0; value < 256; value++)
1550 			    if (isDIGIT(value))
1551 				ANYOF_BITMAP_CLEAR(data->start_class, value);
1552 		    }
1553 		    else {
1554 			if (data->start_class->flags & ANYOF_LOCALE)
1555 			    ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1556 			else {
1557 			    for (value = 0; value < 256; value++)
1558 				if (!isDIGIT(value))
1559 				    ANYOF_BITMAP_SET(data->start_class, value);
1560 			}
1561 		    }
1562 		    break;
1563 		}
1564 		if (flags & SCF_DO_STCLASS_OR)
1565 		    cl_and(data->start_class, &and_with);
1566 		flags &= ~SCF_DO_STCLASS;
1567 	    }
1568 	}
1569 	else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
1570 	    data->flags |= (OP(scan) == MEOL
1571 			    ? SF_BEFORE_MEOL
1572 			    : SF_BEFORE_SEOL);
1573 	}
1574 	else if (  PL_regkind[(U8)OP(scan)] == BRANCHJ
1575 		 /* Lookbehind, or need to calculate parens/evals/stclass: */
1576 		   && (scan->flags || data || (flags & SCF_DO_STCLASS))
1577 		   && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1578 	    /* Lookahead/lookbehind */
1579 	    I32 deltanext, minnext, fake = 0;
1580 	    regnode *nscan;
1581 	    struct regnode_charclass_class intrnl;
1582 	    int f = 0;
1583 
1584 	    data_fake.flags = 0;
1585 	    if (data) {
1586 		data_fake.whilem_c = data->whilem_c;
1587 		data_fake.last_closep = data->last_closep;
1588 	    }
1589 	    else
1590 		data_fake.last_closep = &fake;
1591 	    if ( flags & SCF_DO_STCLASS && !scan->flags
1592 		 && OP(scan) == IFMATCH ) { /* Lookahead */
1593 		cl_init(pRExC_state, &intrnl);
1594 		data_fake.start_class = &intrnl;
1595 		f |= SCF_DO_STCLASS_AND;
1596 	    }
1597 	    if (flags & SCF_WHILEM_VISITED_POS)
1598 		f |= SCF_WHILEM_VISITED_POS;
1599 	    next = regnext(scan);
1600 	    nscan = NEXTOPER(NEXTOPER(scan));
1601 	    minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
1602 	    if (scan->flags) {
1603 		if (deltanext) {
1604 		    vFAIL("Variable length lookbehind not implemented");
1605 		}
1606 		else if (minnext > U8_MAX) {
1607 		    vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
1608 		}
1609 		scan->flags = (U8)minnext;
1610 	    }
1611 	    if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1612 		pars++;
1613 	    if (data && (data_fake.flags & SF_HAS_EVAL))
1614 		data->flags |= SF_HAS_EVAL;
1615 	    if (data)
1616 		data->whilem_c = data_fake.whilem_c;
1617 	    if (f & SCF_DO_STCLASS_AND) {
1618 		int was = (data->start_class->flags & ANYOF_EOS);
1619 
1620 		cl_and(data->start_class, &intrnl);
1621 		if (was)
1622 		    data->start_class->flags |= ANYOF_EOS;
1623 	    }
1624 	}
1625 	else if (OP(scan) == OPEN) {
1626 	    pars++;
1627 	}
1628 	else if (OP(scan) == CLOSE) {
1629 	    if ((I32)ARG(scan) == is_par) {
1630 		next = regnext(scan);
1631 
1632 		if ( next && (OP(next) != WHILEM) && next < last)
1633 		    is_par = 0;		/* Disable optimization */
1634 	    }
1635 	    if (data)
1636 		*(data->last_closep) = ARG(scan);
1637 	}
1638 	else if (OP(scan) == EVAL) {
1639 		if (data)
1640 		    data->flags |= SF_HAS_EVAL;
1641 	}
1642 	else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
1643 		if (flags & SCF_DO_SUBSTR) {
1644 		    scan_commit(pRExC_state,data);
1645 		    data->longest = &(data->longest_float);
1646 		}
1647 		is_inf = is_inf_internal = 1;
1648 		if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1649 		    cl_anything(pRExC_state, data->start_class);
1650 		flags &= ~SCF_DO_STCLASS;
1651 	}
1652 	/* Else: zero-length, ignore. */
1653 	scan = regnext(scan);
1654     }
1655 
1656   finish:
1657     *scanp = scan;
1658     *deltap = is_inf_internal ? I32_MAX : delta;
1659     if (flags & SCF_DO_SUBSTR && is_inf)
1660 	data->pos_delta = I32_MAX - data->pos_min;
1661     if (is_par > U8_MAX)
1662 	is_par = 0;
1663     if (is_par && pars==1 && data) {
1664 	data->flags |= SF_IN_PAR;
1665 	data->flags &= ~SF_HAS_PAR;
1666     }
1667     else if (pars && data) {
1668 	data->flags |= SF_HAS_PAR;
1669 	data->flags &= ~SF_IN_PAR;
1670     }
1671     if (flags & SCF_DO_STCLASS_OR)
1672 	cl_and(data->start_class, &and_with);
1673     return min;
1674 }
1675 
1676 STATIC I32
S_add_data(pTHX_ RExC_state_t * pRExC_state,I32 n,char * s)1677 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
1678 {
1679     if (RExC_rx->data) {
1680 	Renewc(RExC_rx->data,
1681 	       sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
1682 	       char, struct reg_data);
1683 	Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1684 	RExC_rx->data->count += n;
1685     }
1686     else {
1687 	Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
1688 	     char, struct reg_data);
1689 	New(1208, RExC_rx->data->what, n, U8);
1690 	RExC_rx->data->count = n;
1691     }
1692     Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1693     return RExC_rx->data->count - n;
1694 }
1695 
1696 void
Perl_reginitcolors(pTHX)1697 Perl_reginitcolors(pTHX)
1698 {
1699     int i = 0;
1700     char *s = PerlEnv_getenv("PERL_RE_COLORS");
1701 
1702     if (s) {
1703 	PL_colors[0] = s = savepv(s);
1704 	while (++i < 6) {
1705 	    s = strchr(s, '\t');
1706 	    if (s) {
1707 		*s = '\0';
1708 		PL_colors[i] = ++s;
1709 	    }
1710 	    else
1711 		PL_colors[i] = s = "";
1712 	}
1713     } else {
1714 	while (i < 6)
1715 	    PL_colors[i++] = "";
1716     }
1717     PL_colorset = 1;
1718 }
1719 
1720 
1721 /*
1722  - pregcomp - compile a regular expression into internal code
1723  *
1724  * We can't allocate space until we know how big the compiled form will be,
1725  * but we can't compile it (and thus know how big it is) until we've got a
1726  * place to put the code.  So we cheat:  we compile it twice, once with code
1727  * generation turned off and size counting turned on, and once "for real".
1728  * This also means that we don't allocate space until we are sure that the
1729  * thing really will compile successfully, and we never have to move the
1730  * code and thus invalidate pointers into it.  (Note that it has to be in
1731  * one piece because free() must be able to free it all.) [NB: not true in perl]
1732  *
1733  * Beware that the optimization-preparation code in here knows about some
1734  * of the structure of the compiled regexp.  [I'll say.]
1735  */
1736 regexp *
Perl_pregcomp(pTHX_ char * exp,char * xend,PMOP * pm)1737 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
1738 {
1739     register regexp *r;
1740     regnode *scan;
1741     regnode *first;
1742     I32 flags;
1743     I32 minlen = 0;
1744     I32 sawplus = 0;
1745     I32 sawopen = 0;
1746     scan_data_t data;
1747     RExC_state_t RExC_state;
1748     RExC_state_t *pRExC_state = &RExC_state;
1749 
1750     if (exp == NULL)
1751 	FAIL("NULL regexp argument");
1752 
1753     RExC_utf8 = RExC_orig_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
1754 
1755     DEBUG_r({
1756 	 if (!PL_colorset) reginitcolors();
1757 	 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1758 		       PL_colors[4],PL_colors[5],PL_colors[0],
1759 		       (int)(xend - exp), exp, PL_colors[1]);
1760     });
1761 
1762 redo_first_pass:
1763     RExC_precomp = exp;
1764     RExC_flags = pm->op_pmflags;
1765     RExC_sawback = 0;
1766 
1767     RExC_seen = 0;
1768     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1769     RExC_seen_evals = 0;
1770     RExC_extralen = 0;
1771 
1772     /* First pass: determine size, legality. */
1773     RExC_parse = exp;
1774     RExC_start = exp;
1775     RExC_end = xend;
1776     RExC_naughty = 0;
1777     RExC_npar = 1;
1778     RExC_size = 0L;
1779     RExC_emit = &PL_regdummy;
1780     RExC_whilem_seen = 0;
1781 #if 0 /* REGC() is (currently) a NOP at the first pass.
1782        * Clever compilers notice this and complain. --jhi */
1783     REGC((U8)REG_MAGIC, (char*)RExC_emit);
1784 #endif
1785     if (reg(pRExC_state, 0, &flags) == NULL) {
1786 	RExC_precomp = Nullch;
1787 	return(NULL);
1788     }
1789 
1790     if (RExC_utf8 && !RExC_orig_utf8) {
1791         /* It's possible to write a regexp in ascii that represents unicode
1792         codepoints outside of the byte range, such as via \x{100}. If we
1793         detect such a sequence we have to convert the entire pattern to utf8
1794         and then recompile, as our sizing calculation will have been based
1795         on 1 byte == 1 character, but we will need to use utf8 to encode
1796         at least some part of the pattern, and therefore must convert the whole
1797         thing.
1798         XXX: somehow figure out how to make this less expensive...
1799         -- dmq */
1800         STRLEN len = xend-exp;
1801         DEBUG_r(PerlIO_printf(Perl_debug_log,
1802 	    "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
1803         exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len);
1804         xend = exp + len;
1805         RExC_orig_utf8 = RExC_utf8;
1806         SAVEFREEPV(exp);
1807         goto redo_first_pass;
1808     }
1809     DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
1810 
1811     /* Small enough for pointer-storage convention?
1812        If extralen==0, this means that we will not need long jumps. */
1813     if (RExC_size >= 0x10000L && RExC_extralen)
1814         RExC_size += RExC_extralen;
1815     else
1816 	RExC_extralen = 0;
1817     if (RExC_whilem_seen > 15)
1818 	RExC_whilem_seen = 15;
1819 
1820     /* Allocate space and initialize. */
1821     Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
1822 	 char, regexp);
1823     if (r == NULL)
1824 	FAIL("Regexp out of space");
1825 
1826 #ifdef DEBUGGING
1827     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
1828     Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
1829 #endif
1830     r->refcnt = 1;
1831     r->prelen = xend - exp;
1832     r->precomp = savepvn(RExC_precomp, r->prelen);
1833     r->subbeg = NULL;
1834     r->reganch = pm->op_pmflags & PMf_COMPILETIME;
1835     r->nparens = RExC_npar - 1;	/* set early to validate backrefs */
1836 
1837     r->substrs = 0;			/* Useful during FAIL. */
1838     r->startp = 0;			/* Useful during FAIL. */
1839     r->endp = 0;			/* Useful during FAIL. */
1840 
1841     Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1842     if (r->offsets) {
1843       r->offsets[0] = RExC_size;
1844     }
1845     DEBUG_r(PerlIO_printf(Perl_debug_log,
1846                           "%s %"UVuf" bytes for offset annotations.\n",
1847                           r->offsets ? "Got" : "Couldn't get",
1848                           (UV)((2*RExC_size+1) * sizeof(U32))));
1849 
1850     RExC_rx = r;
1851 
1852     /* Second pass: emit code. */
1853     RExC_flags = pm->op_pmflags;	/* don't let top level (?i) bleed */
1854     RExC_parse = exp;
1855     RExC_end = xend;
1856     RExC_naughty = 0;
1857     RExC_npar = 1;
1858     RExC_emit_start = r->program;
1859     RExC_emit = r->program;
1860     /* Store the count of eval-groups for security checks: */
1861     RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1862     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
1863     r->data = 0;
1864     if (reg(pRExC_state, 0, &flags) == NULL)
1865 	return(NULL);
1866 
1867     /* Dig out information for optimizations. */
1868     r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
1869     pm->op_pmflags = RExC_flags;
1870     if (UTF)
1871         r->reganch |= ROPT_UTF8;	/* Unicode in it? */
1872     r->regstclass = NULL;
1873     if (RExC_naughty >= 10)	/* Probably an expensive pattern. */
1874 	r->reganch |= ROPT_NAUGHTY;
1875     scan = r->program + 1;		/* First BRANCH. */
1876 
1877     /* XXXX To minimize changes to RE engine we always allocate
1878        3-units-long substrs field. */
1879     Newz(1004, r->substrs, 1, struct reg_substr_data);
1880 
1881     StructCopy(&zero_scan_data, &data, scan_data_t);
1882     /* XXXX Should not we check for something else?  Usually it is OPEN1... */
1883     if (OP(scan) != BRANCH) {	/* Only one top-level choice. */
1884 	I32 fake;
1885 	STRLEN longest_float_length, longest_fixed_length;
1886 	struct regnode_charclass_class ch_class;
1887 	int stclass_flag;
1888 	I32 last_close = 0;
1889 
1890 	first = scan;
1891 	/* Skip introductions and multiplicators >= 1. */
1892 	while ((OP(first) == OPEN && (sawopen = 1)) ||
1893 	       /* An OR of *one* alternative - should not happen now. */
1894 	    (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1895 	    (OP(first) == PLUS) ||
1896 	    (OP(first) == MINMOD) ||
1897 	       /* An {n,m} with n>0 */
1898 	    (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
1899 		if (OP(first) == PLUS)
1900 		    sawplus = 1;
1901 		else
1902 		    first += regarglen[(U8)OP(first)];
1903 		first = NEXTOPER(first);
1904 	}
1905 
1906 	/* Starting-point info. */
1907       again:
1908 	if (PL_regkind[(U8)OP(first)] == EXACT) {
1909 	    if (OP(first) == EXACT)
1910 	        ;	/* Empty, get anchored substr later. */
1911 	    else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
1912 		r->regstclass = first;
1913 	}
1914 	else if (strchr((char*)PL_simple,OP(first)))
1915 	    r->regstclass = first;
1916 	else if (PL_regkind[(U8)OP(first)] == BOUND ||
1917 		 PL_regkind[(U8)OP(first)] == NBOUND)
1918 	    r->regstclass = first;
1919 	else if (PL_regkind[(U8)OP(first)] == BOL) {
1920 	    r->reganch |= (OP(first) == MBOL
1921 			   ? ROPT_ANCH_MBOL
1922 			   : (OP(first) == SBOL
1923 			      ? ROPT_ANCH_SBOL
1924 			      : ROPT_ANCH_BOL));
1925 	    first = NEXTOPER(first);
1926 	    goto again;
1927 	}
1928 	else if (OP(first) == GPOS) {
1929 	    r->reganch |= ROPT_ANCH_GPOS;
1930 	    first = NEXTOPER(first);
1931 	    goto again;
1932 	}
1933 	else if (!sawopen && (OP(first) == STAR &&
1934 	    PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
1935 	    !(r->reganch & ROPT_ANCH) )
1936 	{
1937 	    /* turn .* into ^.* with an implied $*=1 */
1938 	    int type = OP(NEXTOPER(first));
1939 
1940 	    if (type == REG_ANY)
1941 		type = ROPT_ANCH_MBOL;
1942 	    else
1943 		type = ROPT_ANCH_SBOL;
1944 
1945 	    r->reganch |= type | ROPT_IMPLICIT;
1946 	    first = NEXTOPER(first);
1947 	    goto again;
1948 	}
1949 	if (sawplus && (!sawopen || !RExC_sawback)
1950 	    && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
1951 	    /* x+ must match at the 1st pos of run of x's */
1952 	    r->reganch |= ROPT_SKIP;
1953 
1954 	/* Scan is after the zeroth branch, first is atomic matcher. */
1955 	DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
1956 			      (IV)(first - scan + 1)));
1957 	/*
1958 	* If there's something expensive in the r.e., find the
1959 	* longest literal string that must appear and make it the
1960 	* regmust.  Resolve ties in favor of later strings, since
1961 	* the regstart check works with the beginning of the r.e.
1962 	* and avoiding duplication strengthens checking.  Not a
1963 	* strong reason, but sufficient in the absence of others.
1964 	* [Now we resolve ties in favor of the earlier string if
1965 	* it happens that c_offset_min has been invalidated, since the
1966 	* earlier string may buy us something the later one won't.]
1967 	*/
1968 	minlen = 0;
1969 
1970 	data.longest_fixed = newSVpvn("",0);
1971 	data.longest_float = newSVpvn("",0);
1972 	data.last_found = newSVpvn("",0);
1973 	data.longest = &(data.longest_fixed);
1974 	first = scan;
1975 	if (!r->regstclass) {
1976 	    cl_init(pRExC_state, &ch_class);
1977 	    data.start_class = &ch_class;
1978 	    stclass_flag = SCF_DO_STCLASS_AND;
1979 	} else				/* XXXX Check for BOUND? */
1980 	    stclass_flag = 0;
1981 	data.last_closep = &last_close;
1982 
1983 	minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
1984 			     &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
1985 	if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
1986 	     && data.last_start_min == 0 && data.last_end > 0
1987 	     && !RExC_seen_zerolen
1988 	     && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
1989 	    r->reganch |= ROPT_CHECK_ALL;
1990 	scan_commit(pRExC_state, &data);
1991 	SvREFCNT_dec(data.last_found);
1992 
1993 	longest_float_length = CHR_SVLEN(data.longest_float);
1994 	if (longest_float_length
1995 	    || (data.flags & SF_FL_BEFORE_EOL
1996 		&& (!(data.flags & SF_FL_BEFORE_MEOL)
1997 		    || (RExC_flags & PMf_MULTILINE)))) {
1998 	    int t;
1999 
2000 	    if (SvCUR(data.longest_fixed) 			/* ok to leave SvCUR */
2001 		&& data.offset_fixed == data.offset_float_min
2002 		&& SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
2003 		    goto remove_float;		/* As in (a)+. */
2004 
2005 	    if (SvUTF8(data.longest_float)) {
2006 		r->float_utf8 = data.longest_float;
2007 		r->float_substr = Nullsv;
2008 	    } else {
2009 		r->float_substr = data.longest_float;
2010 		r->float_utf8 = Nullsv;
2011 	    }
2012 	    r->float_min_offset = data.offset_float_min;
2013 	    r->float_max_offset = data.offset_float_max;
2014 	    t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
2015 		       && (!(data.flags & SF_FL_BEFORE_MEOL)
2016 			   || (RExC_flags & PMf_MULTILINE)));
2017 	    fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
2018 	}
2019 	else {
2020 	  remove_float:
2021 	    r->float_substr = r->float_utf8 = Nullsv;
2022 	    SvREFCNT_dec(data.longest_float);
2023 	    longest_float_length = 0;
2024 	}
2025 
2026 	longest_fixed_length = CHR_SVLEN(data.longest_fixed);
2027 	if (longest_fixed_length
2028 	    || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
2029 		&& (!(data.flags & SF_FIX_BEFORE_MEOL)
2030 		    || (RExC_flags & PMf_MULTILINE)))) {
2031 	    int t;
2032 
2033 	    if (SvUTF8(data.longest_fixed)) {
2034 		r->anchored_utf8 = data.longest_fixed;
2035 		r->anchored_substr = Nullsv;
2036 	    } else {
2037 		r->anchored_substr = data.longest_fixed;
2038 		r->anchored_utf8 = Nullsv;
2039 	    }
2040 	    r->anchored_offset = data.offset_fixed;
2041 	    t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
2042 		 && (!(data.flags & SF_FIX_BEFORE_MEOL)
2043 		     || (RExC_flags & PMf_MULTILINE)));
2044 	    fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
2045 	}
2046 	else {
2047 	    r->anchored_substr = r->anchored_utf8 = Nullsv;
2048 	    SvREFCNT_dec(data.longest_fixed);
2049 	    longest_fixed_length = 0;
2050 	}
2051 	if (r->regstclass
2052 	    && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
2053 	    r->regstclass = NULL;
2054 	if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2055 	    && stclass_flag
2056 	    && !(data.start_class->flags & ANYOF_EOS)
2057 	    && !cl_is_anything(data.start_class))
2058 	{
2059 	    I32 n = add_data(pRExC_state, 1, "f");
2060 
2061 	    New(1006, RExC_rx->data->data[n], 1,
2062 		struct regnode_charclass_class);
2063 	    StructCopy(data.start_class,
2064 		       (struct regnode_charclass_class*)RExC_rx->data->data[n],
2065 		       struct regnode_charclass_class);
2066 	    r->regstclass = (regnode*)RExC_rx->data->data[n];
2067 	    r->reganch &= ~ROPT_SKIP;	/* Used in find_byclass(). */
2068 	    PL_regdata = r->data; /* for regprop() */
2069 	    DEBUG_r({ SV *sv = sv_newmortal();
2070 	              regprop(sv, (regnode*)data.start_class);
2071 		      PerlIO_printf(Perl_debug_log,
2072 				    "synthetic stclass `%s'.\n",
2073 				    SvPVX(sv));});
2074 	}
2075 
2076 	/* A temporary algorithm prefers floated substr to fixed one to dig more info. */
2077 	if (longest_fixed_length > longest_float_length) {
2078 	    r->check_substr = r->anchored_substr;
2079 	    r->check_utf8 = r->anchored_utf8;
2080 	    r->check_offset_min = r->check_offset_max = r->anchored_offset;
2081 	    if (r->reganch & ROPT_ANCH_SINGLE)
2082 		r->reganch |= ROPT_NOSCAN;
2083 	}
2084 	else {
2085 	    r->check_substr = r->float_substr;
2086 	    r->check_utf8 = r->float_utf8;
2087 	    r->check_offset_min = data.offset_float_min;
2088 	    r->check_offset_max = data.offset_float_max;
2089 	}
2090 	/* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2091 	   This should be changed ASAP!  */
2092 	if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
2093 	    r->reganch |= RE_USE_INTUIT;
2094 	    if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
2095 		r->reganch |= RE_INTUIT_TAIL;
2096 	}
2097     }
2098     else {
2099 	/* Several toplevels. Best we can is to set minlen. */
2100 	I32 fake;
2101 	struct regnode_charclass_class ch_class;
2102 	I32 last_close = 0;
2103 
2104 	DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2105 	scan = r->program + 1;
2106 	cl_init(pRExC_state, &ch_class);
2107 	data.start_class = &ch_class;
2108 	data.last_closep = &last_close;
2109 	minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
2110 	r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2111 		= r->float_substr = r->float_utf8 = Nullsv;
2112 	if (!(data.start_class->flags & ANYOF_EOS)
2113 	    && !cl_is_anything(data.start_class))
2114 	{
2115 	    I32 n = add_data(pRExC_state, 1, "f");
2116 
2117 	    New(1006, RExC_rx->data->data[n], 1,
2118 		struct regnode_charclass_class);
2119 	    StructCopy(data.start_class,
2120 		       (struct regnode_charclass_class*)RExC_rx->data->data[n],
2121 		       struct regnode_charclass_class);
2122 	    r->regstclass = (regnode*)RExC_rx->data->data[n];
2123 	    r->reganch &= ~ROPT_SKIP;	/* Used in find_byclass(). */
2124 	    DEBUG_r({ SV* sv = sv_newmortal();
2125 	              regprop(sv, (regnode*)data.start_class);
2126 		      PerlIO_printf(Perl_debug_log,
2127 				    "synthetic stclass `%s'.\n",
2128 				    SvPVX(sv));});
2129 	}
2130     }
2131 
2132     r->minlen = minlen;
2133     if (RExC_seen & REG_SEEN_GPOS)
2134 	r->reganch |= ROPT_GPOS_SEEN;
2135     if (RExC_seen & REG_SEEN_LOOKBEHIND)
2136 	r->reganch |= ROPT_LOOKBEHIND_SEEN;
2137     if (RExC_seen & REG_SEEN_EVAL)
2138 	r->reganch |= ROPT_EVAL_SEEN;
2139     if (RExC_seen & REG_SEEN_CANY)
2140 	r->reganch |= ROPT_CANY_SEEN;
2141     Newz(1002, r->startp, RExC_npar, I32);
2142     Newz(1002, r->endp, RExC_npar, I32);
2143     PL_regdata = r->data; /* for regprop() */
2144     DEBUG_r(regdump(r));
2145     return(r);
2146 }
2147 
2148 /*
2149  - reg - regular expression, i.e. main body or parenthesized thing
2150  *
2151  * Caller must absorb opening parenthesis.
2152  *
2153  * Combining parenthesis handling with the base level of regular expression
2154  * is a trifle forced, but the need to tie the tails of the branches to what
2155  * follows makes it hard to avoid.
2156  */
2157 STATIC regnode *
S_reg(pTHX_ RExC_state_t * pRExC_state,I32 paren,I32 * flagp)2158 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
2159     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
2160 {
2161     register regnode *ret;		/* Will be the head of the group. */
2162     register regnode *br;
2163     register regnode *lastbr;
2164     register regnode *ender = 0;
2165     register I32 parno = 0;
2166     I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
2167 
2168     /* for (?g), (?gc), and (?o) warnings; warning
2169        about (?c) will warn about (?g) -- japhy    */
2170 
2171     I32 wastedflags = 0x00,
2172         wasted_o    = 0x01,
2173         wasted_g    = 0x02,
2174         wasted_gc   = 0x02 | 0x04,
2175         wasted_c    = 0x04;
2176 
2177     char * parse_start = RExC_parse; /* MJD */
2178     char *oregcomp_parse = RExC_parse;
2179     char c;
2180 
2181     *flagp = 0;				/* Tentatively. */
2182 
2183 
2184     /* Make an OPEN node, if parenthesized. */
2185     if (paren) {
2186 	if (*RExC_parse == '?') { /* (?...) */
2187 	    U32 posflags = 0, negflags = 0;
2188 	    U32 *flagsp = &posflags;
2189 	    int logical = 0;
2190 	    char *seqstart = RExC_parse;
2191 
2192 	    RExC_parse++;
2193 	    paren = *RExC_parse++;
2194 	    ret = NULL;			/* For look-ahead/behind. */
2195 	    switch (paren) {
2196 	    case '<':           /* (?<...) */
2197 		RExC_seen |= REG_SEEN_LOOKBEHIND;
2198 		if (*RExC_parse == '!')
2199 		    paren = ',';
2200 		if (*RExC_parse != '=' && *RExC_parse != '!')
2201 		    goto unknown;
2202 		RExC_parse++;
2203 	    case '=':           /* (?=...) */
2204 	    case '!':           /* (?!...) */
2205 		RExC_seen_zerolen++;
2206 	    case ':':           /* (?:...) */
2207 	    case '>':           /* (?>...) */
2208 		break;
2209 	    case '$':           /* (?$...) */
2210 	    case '@':           /* (?@...) */
2211 		vFAIL2("Sequence (?%c...) not implemented", (int)paren);
2212 		break;
2213 	    case '#':           /* (?#...) */
2214 		while (*RExC_parse && *RExC_parse != ')')
2215 		    RExC_parse++;
2216 		if (*RExC_parse != ')')
2217 		    FAIL("Sequence (?#... not terminated");
2218 		nextchar(pRExC_state);
2219 		*flagp = TRYAGAIN;
2220 		return NULL;
2221 	    case 'p':           /* (?p...) */
2222 		if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2223 		    vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
2224 		/* FALL THROUGH*/
2225 	    case '?':           /* (??...) */
2226 		logical = 1;
2227 		if (*RExC_parse != '{')
2228 		    goto unknown;
2229 		paren = *RExC_parse++;
2230 		/* FALL THROUGH */
2231 	    case '{':           /* (?{...}) */
2232 	    {
2233 		I32 count = 1, n = 0;
2234 		char c;
2235 		char *s = RExC_parse;
2236 		SV *sv;
2237 		OP_4tree *sop, *rop;
2238 
2239 		RExC_seen_zerolen++;
2240 		RExC_seen |= REG_SEEN_EVAL;
2241 		while (count && (c = *RExC_parse)) {
2242 		    if (c == '\\' && RExC_parse[1])
2243 			RExC_parse++;
2244 		    else if (c == '{')
2245 			count++;
2246 		    else if (c == '}')
2247 			count--;
2248 		    RExC_parse++;
2249 		}
2250 		if (*RExC_parse != ')')
2251 		{
2252 		    RExC_parse = s;
2253 		    vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2254 		}
2255 		if (!SIZE_ONLY) {
2256 		    PAD *pad;
2257 
2258 		    if (RExC_parse - 1 - s)
2259 			sv = newSVpvn(s, RExC_parse - 1 - s);
2260 		    else
2261 			sv = newSVpvn("", 0);
2262 
2263 		    ENTER;
2264 		    Perl_save_re_context(aTHX);
2265 		    rop = sv_compile_2op(sv, &sop, "re", &pad);
2266 		    sop->op_private |= OPpREFCOUNTED;
2267 		    /* re_dup will OpREFCNT_inc */
2268 		    OpREFCNT_set(sop, 1);
2269 		    LEAVE;
2270 
2271 		    n = add_data(pRExC_state, 3, "nop");
2272 		    RExC_rx->data->data[n] = (void*)rop;
2273 		    RExC_rx->data->data[n+1] = (void*)sop;
2274 		    RExC_rx->data->data[n+2] = (void*)pad;
2275 		    SvREFCNT_dec(sv);
2276 		}
2277 		else {						/* First pass */
2278 		    if (PL_reginterp_cnt < ++RExC_seen_evals
2279 			&& IN_PERL_RUNTIME)
2280 			/* No compiled RE interpolated, has runtime
2281 			   components ===> unsafe.  */
2282 			FAIL("Eval-group not allowed at runtime, use re 'eval'");
2283 		    if (PL_tainting && PL_tainted)
2284 			FAIL("Eval-group in insecure regular expression");
2285 		}
2286 
2287 		nextchar(pRExC_state);
2288 		if (logical) {
2289 		    ret = reg_node(pRExC_state, LOGICAL);
2290 		    if (!SIZE_ONLY)
2291 			ret->flags = 2;
2292 		    regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
2293                     /* deal with the length of this later - MJD */
2294 		    return ret;
2295 		}
2296 		ret = reganode(pRExC_state, EVAL, n);
2297 		Set_Node_Length(ret, RExC_parse - parse_start + 1);
2298 		Set_Node_Offset(ret, parse_start);
2299 		return ret;
2300 	    }
2301 	    case '(':           /* (?(?{...})...) and (?(?=...)...) */
2302 	    {
2303 		if (RExC_parse[0] == '?') {        /* (?(?...)) */
2304 		    if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2305 			|| RExC_parse[1] == '<'
2306 			|| RExC_parse[1] == '{') { /* Lookahead or eval. */
2307 			I32 flag;
2308 
2309 			ret = reg_node(pRExC_state, LOGICAL);
2310 			if (!SIZE_ONLY)
2311 			    ret->flags = 1;
2312 			regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
2313 			goto insert_if;
2314 		    }
2315 		}
2316 		else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
2317                     /* (?(1)...) */
2318 		    parno = atoi(RExC_parse++);
2319 
2320 		    while (isDIGIT(*RExC_parse))
2321 			RExC_parse++;
2322                     ret = reganode(pRExC_state, GROUPP, parno);
2323 
2324 		    if ((c = *nextchar(pRExC_state)) != ')')
2325 			vFAIL("Switch condition not recognized");
2326 		  insert_if:
2327 		    regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2328 		    br = regbranch(pRExC_state, &flags, 1);
2329 		    if (br == NULL)
2330 			br = reganode(pRExC_state, LONGJMP, 0);
2331 		    else
2332 			regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2333 		    c = *nextchar(pRExC_state);
2334 		    if (flags&HASWIDTH)
2335 			*flagp |= HASWIDTH;
2336 		    if (c == '|') {
2337 			lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2338 			regbranch(pRExC_state, &flags, 1);
2339 			regtail(pRExC_state, ret, lastbr);
2340 		 	if (flags&HASWIDTH)
2341 			    *flagp |= HASWIDTH;
2342 			c = *nextchar(pRExC_state);
2343 		    }
2344 		    else
2345 			lastbr = NULL;
2346 		    if (c != ')')
2347 			vFAIL("Switch (?(condition)... contains too many branches");
2348 		    ender = reg_node(pRExC_state, TAIL);
2349 		    regtail(pRExC_state, br, ender);
2350 		    if (lastbr) {
2351 			regtail(pRExC_state, lastbr, ender);
2352 			regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
2353 		    }
2354 		    else
2355 			regtail(pRExC_state, ret, ender);
2356 		    return ret;
2357 		}
2358 		else {
2359 		    vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
2360 		}
2361 	    }
2362             case 0:
2363 		RExC_parse--; /* for vFAIL to print correctly */
2364                 vFAIL("Sequence (? incomplete");
2365                 break;
2366 	    default:
2367 		--RExC_parse;
2368 	      parse_flags:      /* (?i) */
2369 		while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2370 		    /* (?g), (?gc) and (?o) are useless here
2371 		       and must be globally applied -- japhy */
2372 
2373 		    if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2374 			if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2375 			    I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2376 			    if (! (wastedflags & wflagbit) ) {
2377 				wastedflags |= wflagbit;
2378 				vWARN5(
2379 				    RExC_parse + 1,
2380 				    "Useless (%s%c) - %suse /%c modifier",
2381 				    flagsp == &negflags ? "?-" : "?",
2382 				    *RExC_parse,
2383 				    flagsp == &negflags ? "don't " : "",
2384 				    *RExC_parse
2385 				);
2386 			    }
2387 			}
2388 		    }
2389 		    else if (*RExC_parse == 'c') {
2390 			if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2391 			    if (! (wastedflags & wasted_c) ) {
2392 				wastedflags |= wasted_gc;
2393 				vWARN3(
2394 				    RExC_parse + 1,
2395 				    "Useless (%sc) - %suse /gc modifier",
2396 				    flagsp == &negflags ? "?-" : "?",
2397 				    flagsp == &negflags ? "don't " : ""
2398 				);
2399 			    }
2400 			}
2401 		    }
2402 		    else { pmflag(flagsp, *RExC_parse); }
2403 
2404 		    ++RExC_parse;
2405 		}
2406 		if (*RExC_parse == '-') {
2407 		    flagsp = &negflags;
2408 		    wastedflags = 0;  /* reset so (?g-c) warns twice */
2409 		    ++RExC_parse;
2410 		    goto parse_flags;
2411 		}
2412 		RExC_flags |= posflags;
2413 		RExC_flags &= ~negflags;
2414 		if (*RExC_parse == ':') {
2415 		    RExC_parse++;
2416 		    paren = ':';
2417 		    break;
2418 		}
2419 	      unknown:
2420 		if (*RExC_parse != ')') {
2421 		    RExC_parse++;
2422 		    vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
2423 		}
2424 		nextchar(pRExC_state);
2425 		*flagp = TRYAGAIN;
2426 		return NULL;
2427 	    }
2428 	}
2429 	else {                  /* (...) */
2430 	    parno = RExC_npar;
2431 	    RExC_npar++;
2432 	    ret = reganode(pRExC_state, OPEN, parno);
2433             Set_Node_Length(ret, 1); /* MJD */
2434             Set_Node_Offset(ret, RExC_parse); /* MJD */
2435 	    open = 1;
2436 	}
2437     }
2438     else                        /* ! paren */
2439 	ret = NULL;
2440 
2441     /* Pick up the branches, linking them together. */
2442     parse_start = RExC_parse;   /* MJD */
2443     br = regbranch(pRExC_state, &flags, 1);
2444     /*     branch_len = (paren != 0); */
2445 
2446     if (br == NULL)
2447 	return(NULL);
2448     if (*RExC_parse == '|') {
2449 	if (!SIZE_ONLY && RExC_extralen) {
2450 	    reginsert(pRExC_state, BRANCHJ, br);
2451 	}
2452 	else {                  /* MJD */
2453 	    reginsert(pRExC_state, BRANCH, br);
2454             Set_Node_Length(br, paren != 0);
2455             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2456         }
2457 	have_branch = 1;
2458 	if (SIZE_ONLY)
2459 	    RExC_extralen += 1;		/* For BRANCHJ-BRANCH. */
2460     }
2461     else if (paren == ':') {
2462 	*flagp |= flags&SIMPLE;
2463     }
2464     if (open) {				/* Starts with OPEN. */
2465 	regtail(pRExC_state, ret, br);		/* OPEN -> first. */
2466     }
2467     else if (paren != '?')		/* Not Conditional */
2468 	ret = br;
2469     *flagp |= flags & (SPSTART | HASWIDTH);
2470     lastbr = br;
2471     while (*RExC_parse == '|') {
2472 	if (!SIZE_ONLY && RExC_extralen) {
2473 	    ender = reganode(pRExC_state, LONGJMP,0);
2474 	    regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
2475 	}
2476 	if (SIZE_ONLY)
2477 	    RExC_extralen += 2;		/* Account for LONGJMP. */
2478 	nextchar(pRExC_state);
2479 	br = regbranch(pRExC_state, &flags, 0);
2480 
2481 	if (br == NULL)
2482 	    return(NULL);
2483 	regtail(pRExC_state, lastbr, br);		/* BRANCH -> BRANCH. */
2484 	lastbr = br;
2485 	if (flags&HASWIDTH)
2486 	    *flagp |= HASWIDTH;
2487 	*flagp |= flags&SPSTART;
2488     }
2489 
2490     if (have_branch || paren != ':') {
2491 	/* Make a closing node, and hook it on the end. */
2492 	switch (paren) {
2493 	case ':':
2494 	    ender = reg_node(pRExC_state, TAIL);
2495 	    break;
2496 	case 1:
2497 	    ender = reganode(pRExC_state, CLOSE, parno);
2498             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2499             Set_Node_Length(ender,1); /* MJD */
2500 	    break;
2501 	case '<':
2502 	case ',':
2503 	case '=':
2504 	case '!':
2505 	    *flagp &= ~HASWIDTH;
2506 	    /* FALL THROUGH */
2507 	case '>':
2508 	    ender = reg_node(pRExC_state, SUCCEED);
2509 	    break;
2510 	case 0:
2511 	    ender = reg_node(pRExC_state, END);
2512 	    break;
2513 	}
2514 	regtail(pRExC_state, lastbr, ender);
2515 
2516 	if (have_branch) {
2517 	    /* Hook the tails of the branches to the closing node. */
2518 	    for (br = ret; br != NULL; br = regnext(br)) {
2519 		regoptail(pRExC_state, br, ender);
2520 	    }
2521 	}
2522     }
2523 
2524     {
2525 	char *p;
2526 	static char parens[] = "=!<,>";
2527 
2528 	if (paren && (p = strchr(parens, paren))) {
2529 	    U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2530 	    int flag = (p - parens) > 1;
2531 
2532 	    if (paren == '>')
2533 		node = SUSPEND, flag = 0;
2534 	    reginsert(pRExC_state, node,ret);
2535 	    Set_Node_Cur_Length(ret);
2536 	    Set_Node_Offset(ret, parse_start + 1);
2537 	    ret->flags = flag;
2538 	    regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
2539 	}
2540     }
2541 
2542     /* Check for proper termination. */
2543     if (paren) {
2544 	RExC_flags = oregflags;
2545 	if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2546 	    RExC_parse = oregcomp_parse;
2547 	    vFAIL("Unmatched (");
2548 	}
2549     }
2550     else if (!paren && RExC_parse < RExC_end) {
2551 	if (*RExC_parse == ')') {
2552 	    RExC_parse++;
2553 	    vFAIL("Unmatched )");
2554 	}
2555 	else
2556 	    FAIL("Junk on end of regexp");	/* "Can't happen". */
2557 	/* NOTREACHED */
2558     }
2559 
2560     return(ret);
2561 }
2562 
2563 /*
2564  - regbranch - one alternative of an | operator
2565  *
2566  * Implements the concatenation operator.
2567  */
2568 STATIC regnode *
S_regbranch(pTHX_ RExC_state_t * pRExC_state,I32 * flagp,I32 first)2569 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
2570 {
2571     register regnode *ret;
2572     register regnode *chain = NULL;
2573     register regnode *latest;
2574     I32 flags = 0, c = 0;
2575 
2576     if (first)
2577 	ret = NULL;
2578     else {
2579 	if (!SIZE_ONLY && RExC_extralen)
2580 	    ret = reganode(pRExC_state, BRANCHJ,0);
2581 	else {
2582 	    ret = reg_node(pRExC_state, BRANCH);
2583             Set_Node_Length(ret, 1);
2584         }
2585     }
2586 
2587     if (!first && SIZE_ONLY)
2588 	RExC_extralen += 1;			/* BRANCHJ */
2589 
2590     *flagp = WORST;			/* Tentatively. */
2591 
2592     RExC_parse--;
2593     nextchar(pRExC_state);
2594     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
2595 	flags &= ~TRYAGAIN;
2596 	latest = regpiece(pRExC_state, &flags);
2597 	if (latest == NULL) {
2598 	    if (flags & TRYAGAIN)
2599 		continue;
2600 	    return(NULL);
2601 	}
2602 	else if (ret == NULL)
2603 	    ret = latest;
2604 	*flagp |= flags&HASWIDTH;
2605 	if (chain == NULL) 	/* First piece. */
2606 	    *flagp |= flags&SPSTART;
2607 	else {
2608 	    RExC_naughty++;
2609 	    regtail(pRExC_state, chain, latest);
2610 	}
2611 	chain = latest;
2612 	c++;
2613     }
2614     if (chain == NULL) {	/* Loop ran zero times. */
2615 	chain = reg_node(pRExC_state, NOTHING);
2616 	if (ret == NULL)
2617 	    ret = chain;
2618     }
2619     if (c == 1) {
2620 	*flagp |= flags&SIMPLE;
2621     }
2622 
2623     return(ret);
2624 }
2625 
2626 /*
2627  - regpiece - something followed by possible [*+?]
2628  *
2629  * Note that the branching code sequences used for ? and the general cases
2630  * of * and + are somewhat optimized:  they use the same NOTHING node as
2631  * both the endmarker for their branch list and the body of the last branch.
2632  * It might seem that this node could be dispensed with entirely, but the
2633  * endmarker role is not redundant.
2634  */
2635 STATIC regnode *
S_regpiece(pTHX_ RExC_state_t * pRExC_state,I32 * flagp)2636 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2637 {
2638     register regnode *ret;
2639     register char op;
2640     register char *next;
2641     I32 flags;
2642     char *origparse = RExC_parse;
2643     char *maxpos;
2644     I32 min;
2645     I32 max = REG_INFTY;
2646     char *parse_start;
2647 
2648     ret = regatom(pRExC_state, &flags);
2649     if (ret == NULL) {
2650 	if (flags & TRYAGAIN)
2651 	    *flagp |= TRYAGAIN;
2652 	return(NULL);
2653     }
2654 
2655     op = *RExC_parse;
2656 
2657     if (op == '{' && regcurly(RExC_parse)) {
2658         parse_start = RExC_parse; /* MJD */
2659 	next = RExC_parse + 1;
2660 	maxpos = Nullch;
2661 	while (isDIGIT(*next) || *next == ',') {
2662 	    if (*next == ',') {
2663 		if (maxpos)
2664 		    break;
2665 		else
2666 		    maxpos = next;
2667 	    }
2668 	    next++;
2669 	}
2670 	if (*next == '}') {		/* got one */
2671 	    if (!maxpos)
2672 		maxpos = next;
2673 	    RExC_parse++;
2674 	    min = atoi(RExC_parse);
2675 	    if (*maxpos == ',')
2676 		maxpos++;
2677 	    else
2678 		maxpos = RExC_parse;
2679 	    max = atoi(maxpos);
2680 	    if (!max && *maxpos != '0')
2681 		max = REG_INFTY;		/* meaning "infinity" */
2682 	    else if (max >= REG_INFTY)
2683 		vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
2684 	    RExC_parse = next;
2685 	    nextchar(pRExC_state);
2686 
2687 	do_curly:
2688 	    if ((flags&SIMPLE)) {
2689 		RExC_naughty += 2 + RExC_naughty / 2;
2690 		reginsert(pRExC_state, CURLY, ret);
2691                 Set_Node_Offset(ret, parse_start+1); /* MJD */
2692                 Set_Node_Cur_Length(ret);
2693 	    }
2694 	    else {
2695 		regnode *w = reg_node(pRExC_state, WHILEM);
2696 
2697 		w->flags = 0;
2698 		regtail(pRExC_state, ret, w);
2699 		if (!SIZE_ONLY && RExC_extralen) {
2700 		    reginsert(pRExC_state, LONGJMP,ret);
2701 		    reginsert(pRExC_state, NOTHING,ret);
2702 		    NEXT_OFF(ret) = 3;	/* Go over LONGJMP. */
2703 		}
2704 		reginsert(pRExC_state, CURLYX,ret);
2705                                 /* MJD hk */
2706                 Set_Node_Offset(ret, parse_start+1);
2707                 Set_Node_Length(ret,
2708                                 op == '{' ? (RExC_parse - parse_start) : 1);
2709 
2710 		if (!SIZE_ONLY && RExC_extralen)
2711 		    NEXT_OFF(ret) = 3;	/* Go over NOTHING to LONGJMP. */
2712 		regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
2713 		if (SIZE_ONLY)
2714 		    RExC_whilem_seen++, RExC_extralen += 3;
2715 		RExC_naughty += 4 + RExC_naughty;	/* compound interest */
2716 	    }
2717 	    ret->flags = 0;
2718 
2719 	    if (min > 0)
2720 		*flagp = WORST;
2721 	    if (max > 0)
2722 		*flagp |= HASWIDTH;
2723 	    if (max && max < min)
2724 		vFAIL("Can't do {n,m} with n > m");
2725 	    if (!SIZE_ONLY) {
2726 		ARG1_SET(ret, (U16)min);
2727 		ARG2_SET(ret, (U16)max);
2728 	    }
2729 
2730 	    goto nest_check;
2731 	}
2732     }
2733 
2734     if (!ISMULT1(op)) {
2735 	*flagp = flags;
2736 	return(ret);
2737     }
2738 
2739 #if 0				/* Now runtime fix should be reliable. */
2740 
2741     /* if this is reinstated, don't forget to put this back into perldiag:
2742 
2743 	    =item Regexp *+ operand could be empty at {#} in regex m/%s/
2744 
2745 	   (F) The part of the regexp subject to either the * or + quantifier
2746            could match an empty string. The {#} shows in the regular
2747            expression about where the problem was discovered.
2748 
2749     */
2750 
2751     if (!(flags&HASWIDTH) && op != '?')
2752       vFAIL("Regexp *+ operand could be empty");
2753 #endif
2754 
2755     parse_start = RExC_parse;
2756     nextchar(pRExC_state);
2757 
2758     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
2759 
2760     if (op == '*' && (flags&SIMPLE)) {
2761 	reginsert(pRExC_state, STAR, ret);
2762 	ret->flags = 0;
2763 	RExC_naughty += 4;
2764     }
2765     else if (op == '*') {
2766 	min = 0;
2767 	goto do_curly;
2768     }
2769     else if (op == '+' && (flags&SIMPLE)) {
2770 	reginsert(pRExC_state, PLUS, ret);
2771 	ret->flags = 0;
2772 	RExC_naughty += 3;
2773     }
2774     else if (op == '+') {
2775 	min = 1;
2776 	goto do_curly;
2777     }
2778     else if (op == '?') {
2779 	min = 0; max = 1;
2780 	goto do_curly;
2781     }
2782   nest_check:
2783     if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
2784 	vWARN3(RExC_parse,
2785 	       "%.*s matches null string many times",
2786 	       RExC_parse - origparse,
2787 	       origparse);
2788     }
2789 
2790     if (*RExC_parse == '?') {
2791 	nextchar(pRExC_state);
2792 	reginsert(pRExC_state, MINMOD, ret);
2793 	regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
2794     }
2795     if (ISMULT2(RExC_parse)) {
2796 	RExC_parse++;
2797 	vFAIL("Nested quantifiers");
2798     }
2799 
2800     return(ret);
2801 }
2802 
2803 /*
2804  - regatom - the lowest level
2805  *
2806  * Optimization:  gobbles an entire sequence of ordinary characters so that
2807  * it can turn them into a single node, which is smaller to store and
2808  * faster to run.  Backslashed characters are exceptions, each becoming a
2809  * separate node; the code is simpler that way and it's not worth fixing.
2810  *
2811  * [Yes, it is worth fixing, some scripts can run twice the speed.] */
2812 STATIC regnode *
S_regatom(pTHX_ RExC_state_t * pRExC_state,I32 * flagp)2813 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2814 {
2815     register regnode *ret = 0;
2816     I32 flags;
2817     char *parse_start = RExC_parse;
2818 
2819     *flagp = WORST;		/* Tentatively. */
2820 
2821 tryagain:
2822     switch (*RExC_parse) {
2823     case '^':
2824 	RExC_seen_zerolen++;
2825 	nextchar(pRExC_state);
2826 	if (RExC_flags & PMf_MULTILINE)
2827 	    ret = reg_node(pRExC_state, MBOL);
2828 	else if (RExC_flags & PMf_SINGLELINE)
2829 	    ret = reg_node(pRExC_state, SBOL);
2830 	else
2831 	    ret = reg_node(pRExC_state, BOL);
2832         Set_Node_Length(ret, 1); /* MJD */
2833 	break;
2834     case '$':
2835 	nextchar(pRExC_state);
2836 	if (*RExC_parse)
2837 	    RExC_seen_zerolen++;
2838 	if (RExC_flags & PMf_MULTILINE)
2839 	    ret = reg_node(pRExC_state, MEOL);
2840 	else if (RExC_flags & PMf_SINGLELINE)
2841 	    ret = reg_node(pRExC_state, SEOL);
2842 	else
2843 	    ret = reg_node(pRExC_state, EOL);
2844         Set_Node_Length(ret, 1); /* MJD */
2845 	break;
2846     case '.':
2847 	nextchar(pRExC_state);
2848 	if (RExC_flags & PMf_SINGLELINE)
2849 	    ret = reg_node(pRExC_state, SANY);
2850 	else
2851 	    ret = reg_node(pRExC_state, REG_ANY);
2852 	*flagp |= HASWIDTH|SIMPLE;
2853 	RExC_naughty++;
2854         Set_Node_Length(ret, 1); /* MJD */
2855 	break;
2856     case '[':
2857     {
2858 	char *oregcomp_parse = ++RExC_parse;
2859 	ret = regclass(pRExC_state);
2860 	if (*RExC_parse != ']') {
2861 	    RExC_parse = oregcomp_parse;
2862 	    vFAIL("Unmatched [");
2863 	}
2864 	nextchar(pRExC_state);
2865 	*flagp |= HASWIDTH|SIMPLE;
2866         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
2867 	break;
2868     }
2869     case '(':
2870 	nextchar(pRExC_state);
2871 	ret = reg(pRExC_state, 1, &flags);
2872 	if (ret == NULL) {
2873 		if (flags & TRYAGAIN) {
2874 		    if (RExC_parse == RExC_end) {
2875 			 /* Make parent create an empty node if needed. */
2876 			*flagp |= TRYAGAIN;
2877 			return(NULL);
2878 		    }
2879 		    goto tryagain;
2880 		}
2881 		return(NULL);
2882 	}
2883 	*flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
2884 	break;
2885     case '|':
2886     case ')':
2887 	if (flags & TRYAGAIN) {
2888 	    *flagp |= TRYAGAIN;
2889 	    return NULL;
2890 	}
2891 	vFAIL("Internal urp");
2892 				/* Supposed to be caught earlier. */
2893 	break;
2894     case '{':
2895 	if (!regcurly(RExC_parse)) {
2896 	    RExC_parse++;
2897 	    goto defchar;
2898 	}
2899 	/* FALL THROUGH */
2900     case '?':
2901     case '+':
2902     case '*':
2903 	RExC_parse++;
2904 	vFAIL("Quantifier follows nothing");
2905 	break;
2906     case '\\':
2907 	switch (*++RExC_parse) {
2908 	case 'A':
2909 	    RExC_seen_zerolen++;
2910 	    ret = reg_node(pRExC_state, SBOL);
2911 	    *flagp |= SIMPLE;
2912 	    nextchar(pRExC_state);
2913             Set_Node_Length(ret, 2); /* MJD */
2914 	    break;
2915 	case 'G':
2916 	    ret = reg_node(pRExC_state, GPOS);
2917 	    RExC_seen |= REG_SEEN_GPOS;
2918 	    *flagp |= SIMPLE;
2919 	    nextchar(pRExC_state);
2920             Set_Node_Length(ret, 2); /* MJD */
2921 	    break;
2922 	case 'Z':
2923 	    ret = reg_node(pRExC_state, SEOL);
2924 	    *flagp |= SIMPLE;
2925 	    RExC_seen_zerolen++;		/* Do not optimize RE away */
2926 	    nextchar(pRExC_state);
2927 	    break;
2928 	case 'z':
2929 	    ret = reg_node(pRExC_state, EOS);
2930 	    *flagp |= SIMPLE;
2931 	    RExC_seen_zerolen++;		/* Do not optimize RE away */
2932 	    nextchar(pRExC_state);
2933             Set_Node_Length(ret, 2); /* MJD */
2934 	    break;
2935 	case 'C':
2936 	    ret = reg_node(pRExC_state, CANY);
2937 	    RExC_seen |= REG_SEEN_CANY;
2938 	    *flagp |= HASWIDTH|SIMPLE;
2939 	    nextchar(pRExC_state);
2940             Set_Node_Length(ret, 2); /* MJD */
2941 	    break;
2942 	case 'X':
2943 	    ret = reg_node(pRExC_state, CLUMP);
2944 	    *flagp |= HASWIDTH;
2945 	    nextchar(pRExC_state);
2946             Set_Node_Length(ret, 2); /* MJD */
2947 	    break;
2948 	case 'w':
2949 	    ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML     : ALNUM));
2950 	    *flagp |= HASWIDTH|SIMPLE;
2951 	    nextchar(pRExC_state);
2952             Set_Node_Length(ret, 2); /* MJD */
2953 	    break;
2954 	case 'W':
2955 	    ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML    : NALNUM));
2956 	    *flagp |= HASWIDTH|SIMPLE;
2957 	    nextchar(pRExC_state);
2958             Set_Node_Length(ret, 2); /* MJD */
2959 	    break;
2960 	case 'b':
2961 	    RExC_seen_zerolen++;
2962 	    RExC_seen |= REG_SEEN_LOOKBEHIND;
2963 	    ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL     : BOUND));
2964 	    *flagp |= SIMPLE;
2965 	    nextchar(pRExC_state);
2966             Set_Node_Length(ret, 2); /* MJD */
2967 	    break;
2968 	case 'B':
2969 	    RExC_seen_zerolen++;
2970 	    RExC_seen |= REG_SEEN_LOOKBEHIND;
2971 	    ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL    : NBOUND));
2972 	    *flagp |= SIMPLE;
2973 	    nextchar(pRExC_state);
2974             Set_Node_Length(ret, 2); /* MJD */
2975 	    break;
2976 	case 's':
2977 	    ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL     : SPACE));
2978 	    *flagp |= HASWIDTH|SIMPLE;
2979 	    nextchar(pRExC_state);
2980             Set_Node_Length(ret, 2); /* MJD */
2981 	    break;
2982 	case 'S':
2983 	    ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL    : NSPACE));
2984 	    *flagp |= HASWIDTH|SIMPLE;
2985 	    nextchar(pRExC_state);
2986             Set_Node_Length(ret, 2); /* MJD */
2987 	    break;
2988 	case 'd':
2989 	    ret = reg_node(pRExC_state, DIGIT);
2990 	    *flagp |= HASWIDTH|SIMPLE;
2991 	    nextchar(pRExC_state);
2992             Set_Node_Length(ret, 2); /* MJD */
2993 	    break;
2994 	case 'D':
2995 	    ret = reg_node(pRExC_state, NDIGIT);
2996 	    *flagp |= HASWIDTH|SIMPLE;
2997 	    nextchar(pRExC_state);
2998             Set_Node_Length(ret, 2); /* MJD */
2999 	    break;
3000 	case 'p':
3001 	case 'P':
3002 	    {
3003 		char* oldregxend = RExC_end;
3004 		char* parse_start = RExC_parse - 2;
3005 
3006 		if (RExC_parse[1] == '{') {
3007 		  /* a lovely hack--pretend we saw [\pX] instead */
3008 		    RExC_end = strchr(RExC_parse, '}');
3009 		    if (!RExC_end) {
3010 		        U8 c = (U8)*RExC_parse;
3011 			RExC_parse += 2;
3012 			RExC_end = oldregxend;
3013 			vFAIL2("Missing right brace on \\%c{}", c);
3014 		    }
3015 		    RExC_end++;
3016 		}
3017 		else {
3018 		    RExC_end = RExC_parse + 2;
3019 		    if (RExC_end > oldregxend)
3020 			RExC_end = oldregxend;
3021 		}
3022 		RExC_parse--;
3023 
3024 		ret = regclass(pRExC_state);
3025 
3026 		RExC_end = oldregxend;
3027 		RExC_parse--;
3028 
3029 		Set_Node_Offset(ret, parse_start + 2);
3030 		Set_Node_Cur_Length(ret);
3031 		nextchar(pRExC_state);
3032 		*flagp |= HASWIDTH|SIMPLE;
3033 	    }
3034 	    break;
3035 	case 'n':
3036 	case 'r':
3037 	case 't':
3038 	case 'f':
3039 	case 'e':
3040 	case 'a':
3041 	case 'x':
3042 	case 'c':
3043 	case '0':
3044 	    goto defchar;
3045 	case '1': case '2': case '3': case '4':
3046 	case '5': case '6': case '7': case '8': case '9':
3047 	    {
3048 		I32 num = atoi(RExC_parse);
3049 
3050 		if (num > 9 && num >= RExC_npar)
3051 		    goto defchar;
3052 		else {
3053                     char * parse_start = RExC_parse - 1; /* MJD */
3054 		    while (isDIGIT(*RExC_parse))
3055 			RExC_parse++;
3056 
3057 		    if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
3058 			vFAIL("Reference to nonexistent group");
3059 		    RExC_sawback = 1;
3060 		    ret = reganode(pRExC_state,
3061 				   (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
3062 				   num);
3063 		    *flagp |= HASWIDTH;
3064 
3065                     /* override incorrect value set in reganode MJD */
3066                     Set_Node_Offset(ret, parse_start+1);
3067                     Set_Node_Cur_Length(ret); /* MJD */
3068 		    RExC_parse--;
3069 		    nextchar(pRExC_state);
3070 		}
3071 	    }
3072 	    break;
3073 	case '\0':
3074 	    if (RExC_parse >= RExC_end)
3075 		FAIL("Trailing \\");
3076 	    /* FALL THROUGH */
3077 	default:
3078 	    /* Do not generate `unrecognized' warnings here, we fall
3079 	       back into the quick-grab loop below */
3080 	    parse_start--;
3081 	    goto defchar;
3082 	}
3083 	break;
3084 
3085     case '#':
3086 	if (RExC_flags & PMf_EXTENDED) {
3087 	    while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3088 	    if (RExC_parse < RExC_end)
3089 		goto tryagain;
3090 	}
3091 	/* FALL THROUGH */
3092 
3093     default: {
3094 	    register STRLEN len;
3095 	    register UV ender;
3096 	    register char *p;
3097 	    char *oldp, *s;
3098 	    STRLEN numlen;
3099 	    STRLEN foldlen;
3100 	    U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
3101 
3102             parse_start = RExC_parse - 1;
3103 
3104 	    RExC_parse++;
3105 
3106 	defchar:
3107 	    ender = 0;
3108 	    ret = reg_node(pRExC_state,
3109 			   (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
3110 	    s = STRING(ret);
3111 	    for (len = 0, p = RExC_parse - 1;
3112 	      len < 127 && p < RExC_end;
3113 	      len++)
3114 	    {
3115 		oldp = p;
3116 
3117 		if (RExC_flags & PMf_EXTENDED)
3118 		    p = regwhite(p, RExC_end);
3119 		switch (*p) {
3120 		case '^':
3121 		case '$':
3122 		case '.':
3123 		case '[':
3124 		case '(':
3125 		case ')':
3126 		case '|':
3127 		    goto loopdone;
3128 		case '\\':
3129 		    switch (*++p) {
3130 		    case 'A':
3131 		    case 'C':
3132 		    case 'X':
3133 		    case 'G':
3134 		    case 'Z':
3135 		    case 'z':
3136 		    case 'w':
3137 		    case 'W':
3138 		    case 'b':
3139 		    case 'B':
3140 		    case 's':
3141 		    case 'S':
3142 		    case 'd':
3143 		    case 'D':
3144 		    case 'p':
3145 		    case 'P':
3146 			--p;
3147 			goto loopdone;
3148 		    case 'n':
3149 			ender = '\n';
3150 			p++;
3151 			break;
3152 		    case 'r':
3153 			ender = '\r';
3154 			p++;
3155 			break;
3156 		    case 't':
3157 			ender = '\t';
3158 			p++;
3159 			break;
3160 		    case 'f':
3161 			ender = '\f';
3162 			p++;
3163 			break;
3164 		    case 'e':
3165 			  ender = ASCII_TO_NATIVE('\033');
3166 			p++;
3167 			break;
3168 		    case 'a':
3169 			  ender = ASCII_TO_NATIVE('\007');
3170 			p++;
3171 			break;
3172 		    case 'x':
3173 			if (*++p == '{') {
3174 			    char* e = strchr(p, '}');
3175 
3176 			    if (!e) {
3177 				RExC_parse = p + 1;
3178 				vFAIL("Missing right brace on \\x{}");
3179 			    }
3180 			    else {
3181                                 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3182                                     | PERL_SCAN_DISALLOW_PREFIX;
3183                                 numlen = e - p - 1;
3184 				ender = grok_hex(p + 1, &numlen, &flags, NULL);
3185 				if (ender > 0xff)
3186 				    RExC_utf8 = 1;
3187 				p = e + 1;
3188 			    }
3189 			}
3190 			else {
3191                             I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3192 			    numlen = 2;
3193 			    ender = grok_hex(p, &numlen, &flags, NULL);
3194 			    p += numlen;
3195 			}
3196 			break;
3197 		    case 'c':
3198 			p++;
3199 			ender = UCHARAT(p++);
3200 			ender = toCTRL(ender);
3201 			break;
3202 		    case '0': case '1': case '2': case '3':case '4':
3203 		    case '5': case '6': case '7': case '8':case '9':
3204 			if (*p == '0' ||
3205 			  (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
3206                             I32 flags = 0;
3207 			    numlen = 3;
3208 			    ender = grok_oct(p, &numlen, &flags, NULL);
3209 			    p += numlen;
3210 			}
3211 			else {
3212 			    --p;
3213 			    goto loopdone;
3214 			}
3215 			break;
3216 		    case '\0':
3217 			if (p >= RExC_end)
3218 			    FAIL("Trailing \\");
3219 			/* FALL THROUGH */
3220 		    default:
3221 			if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
3222 			    vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
3223 			goto normal_default;
3224 		    }
3225 		    break;
3226 		default:
3227 		  normal_default:
3228 		    if (UTF8_IS_START(*p) && UTF) {
3229 			ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
3230 					       &numlen, 0);
3231 			p += numlen;
3232 		    }
3233 		    else
3234 			ender = *p++;
3235 		    break;
3236 		}
3237 		if (RExC_flags & PMf_EXTENDED)
3238 		    p = regwhite(p, RExC_end);
3239 		if (UTF && FOLD) {
3240 		    /* Prime the casefolded buffer. */
3241 		    ender = toFOLD_uni(ender, tmpbuf, &foldlen);
3242 		}
3243 		if (ISMULT2(p)) { /* Back off on ?+*. */
3244 		    if (len)
3245 			p = oldp;
3246 		    else if (UTF) {
3247 		         STRLEN unilen;
3248 
3249 			 if (FOLD) {
3250 			      /* Emit all the Unicode characters. */
3251 			      for (foldbuf = tmpbuf;
3252 				   foldlen;
3253 				   foldlen -= numlen) {
3254 				   ender = utf8_to_uvchr(foldbuf, &numlen);
3255 				   if (numlen > 0) {
3256 					reguni(pRExC_state, ender, s, &unilen);
3257 					s       += unilen;
3258 					len     += unilen;
3259 					/* In EBCDIC the numlen
3260 					 * and unilen can differ. */
3261 					foldbuf += numlen;
3262 					if (numlen >= foldlen)
3263 					     break;
3264 				   }
3265 				   else
3266 					break; /* "Can't happen." */
3267 			      }
3268 			 }
3269 			 else {
3270 			      reguni(pRExC_state, ender, s, &unilen);
3271 			      if (unilen > 0) {
3272 				   s   += unilen;
3273 				   len += unilen;
3274 			      }
3275 			 }
3276 		    }
3277 		    else {
3278 			len++;
3279 			REGC((char)ender, s++);
3280 		    }
3281 		    break;
3282 		}
3283 		if (UTF) {
3284 		     STRLEN unilen;
3285 
3286 		     if (FOLD) {
3287 		          /* Emit all the Unicode characters. */
3288 			  for (foldbuf = tmpbuf;
3289 			       foldlen;
3290 			       foldlen -= numlen) {
3291 			       ender = utf8_to_uvchr(foldbuf, &numlen);
3292 			       if (numlen > 0) {
3293 				    reguni(pRExC_state, ender, s, &unilen);
3294 				    len     += unilen;
3295 				    s       += unilen;
3296 				    /* In EBCDIC the numlen
3297 				     * and unilen can differ. */
3298 				    foldbuf += numlen;
3299 				    if (numlen >= foldlen)
3300 					 break;
3301 			       }
3302 			       else
3303 				    break;
3304 			  }
3305 		     }
3306 		     else {
3307 			  reguni(pRExC_state, ender, s, &unilen);
3308 			  if (unilen > 0) {
3309 			       s   += unilen;
3310 			       len += unilen;
3311 			  }
3312 		     }
3313 		     len--;
3314 		}
3315 		else
3316 		    REGC((char)ender, s++);
3317 	    }
3318 	loopdone:
3319 	    RExC_parse = p - 1;
3320             Set_Node_Cur_Length(ret); /* MJD */
3321 	    nextchar(pRExC_state);
3322 	    {
3323 		/* len is STRLEN which is unsigned, need to copy to signed */
3324 		IV iv = len;
3325 		if (iv < 0)
3326 		    vFAIL("Internal disaster");
3327 	    }
3328 	    if (len > 0)
3329 		*flagp |= HASWIDTH;
3330 	    if (len == 1 && UNI_IS_INVARIANT(ender))
3331 		*flagp |= SIMPLE;
3332 	    if (!SIZE_ONLY)
3333 		STR_LEN(ret) = len;
3334 	    if (SIZE_ONLY)
3335 		RExC_size += STR_SZ(len);
3336 	    else
3337 		RExC_emit += STR_SZ(len);
3338 	}
3339 	break;
3340     }
3341 
3342     /* If the encoding pragma is in effect recode the text of
3343      * any EXACT-kind nodes. */
3344     if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
3345 	STRLEN oldlen = STR_LEN(ret);
3346 	SV *sv        = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3347 
3348 	if (RExC_utf8)
3349 	    SvUTF8_on(sv);
3350 	if (sv_utf8_downgrade(sv, TRUE)) {
3351 	    char *s       = sv_recode_to_utf8(sv, PL_encoding);
3352 	    STRLEN newlen = SvCUR(sv);
3353 
3354 	    if (SvUTF8(sv))
3355 		RExC_utf8 = 1;
3356 	    if (!SIZE_ONLY) {
3357 		DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3358 				      (int)oldlen, STRING(ret),
3359 				      (int)newlen, s));
3360 		Copy(s, STRING(ret), newlen, char);
3361 		STR_LEN(ret) += newlen - oldlen;
3362 		RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3363 	    } else
3364 		RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3365 	}
3366     }
3367 
3368     return(ret);
3369 }
3370 
3371 STATIC char *
S_regwhite(pTHX_ char * p,char * e)3372 S_regwhite(pTHX_ char *p, char *e)
3373 {
3374     while (p < e) {
3375 	if (isSPACE(*p))
3376 	    ++p;
3377 	else if (*p == '#') {
3378 	    do {
3379 		p++;
3380 	    } while (p < e && *p != '\n');
3381 	}
3382 	else
3383 	    break;
3384     }
3385     return p;
3386 }
3387 
3388 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3389    Character classes ([:foo:]) can also be negated ([:^foo:]).
3390    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3391    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3392    but trigger failures because they are currently unimplemented. */
3393 
3394 #define POSIXCC_DONE(c)   ((c) == ':')
3395 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3396 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3397 
3398 STATIC I32
S_regpposixcc(pTHX_ RExC_state_t * pRExC_state,I32 value)3399 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
3400 {
3401     char *posixcc = 0;
3402     I32 namedclass = OOB_NAMEDCLASS;
3403 
3404     if (value == '[' && RExC_parse + 1 < RExC_end &&
3405 	/* I smell either [: or [= or [. -- POSIX has been here, right? */
3406 	POSIXCC(UCHARAT(RExC_parse))) {
3407 	char  c = UCHARAT(RExC_parse);
3408 	char* s = RExC_parse++;
3409 
3410 	while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
3411 	    RExC_parse++;
3412 	if (RExC_parse == RExC_end)
3413 	    /* Grandfather lone [:, [=, [. */
3414 	    RExC_parse = s;
3415 	else {
3416 	    char* t = RExC_parse++; /* skip over the c */
3417 
3418   	    if (UCHARAT(RExC_parse) == ']') {
3419   		RExC_parse++; /* skip over the ending ] */
3420   		posixcc = s + 1;
3421 		if (*s == ':') {
3422 		    I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3423 		    I32 skip = 5; /* the most common skip */
3424 
3425 		    switch (*posixcc) {
3426 		    case 'a':
3427 			if (strnEQ(posixcc, "alnum", 5))
3428 			    namedclass =
3429 				complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3430 			else if (strnEQ(posixcc, "alpha", 5))
3431 			    namedclass =
3432 				complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3433 			else if (strnEQ(posixcc, "ascii", 5))
3434 			    namedclass =
3435 				complement ? ANYOF_NASCII : ANYOF_ASCII;
3436 			break;
3437 		    case 'b':
3438 			if (strnEQ(posixcc, "blank", 5))
3439 			    namedclass =
3440 				complement ? ANYOF_NBLANK : ANYOF_BLANK;
3441 			break;
3442 		    case 'c':
3443 			if (strnEQ(posixcc, "cntrl", 5))
3444 			    namedclass =
3445 				complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3446 			break;
3447 		    case 'd':
3448 			if (strnEQ(posixcc, "digit", 5))
3449 			    namedclass =
3450 				complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3451 			break;
3452 		    case 'g':
3453 			if (strnEQ(posixcc, "graph", 5))
3454 			    namedclass =
3455 				complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3456 			break;
3457 		    case 'l':
3458 			if (strnEQ(posixcc, "lower", 5))
3459 			    namedclass =
3460 				complement ? ANYOF_NLOWER : ANYOF_LOWER;
3461 			break;
3462 		    case 'p':
3463 			if (strnEQ(posixcc, "print", 5))
3464 			    namedclass =
3465 				complement ? ANYOF_NPRINT : ANYOF_PRINT;
3466 			else if (strnEQ(posixcc, "punct", 5))
3467 			    namedclass =
3468 				complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3469 			break;
3470 		    case 's':
3471 			if (strnEQ(posixcc, "space", 5))
3472 			    namedclass =
3473 				complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
3474 			break;
3475 		    case 'u':
3476 			if (strnEQ(posixcc, "upper", 5))
3477 			    namedclass =
3478 				complement ? ANYOF_NUPPER : ANYOF_UPPER;
3479  			break;
3480 		    case 'w': /* this is not POSIX, this is the Perl \w */
3481 			if (strnEQ(posixcc, "word", 4)) {
3482 			    namedclass =
3483 				complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3484 			    skip = 4;
3485 			}
3486 			break;
3487 		    case 'x':
3488 			if (strnEQ(posixcc, "xdigit", 6)) {
3489 			    namedclass =
3490 				complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3491 			    skip = 6;
3492 			}
3493 			break;
3494 		    }
3495 		    if (namedclass == OOB_NAMEDCLASS ||
3496 			posixcc[skip] != ':' ||
3497 			posixcc[skip+1] != ']')
3498 		    {
3499 			Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3500 				      t - s - 1, s + 1);
3501 		    }
3502 		} else if (!SIZE_ONLY) {
3503 		    /* [[=foo=]] and [[.foo.]] are still future. */
3504 
3505 		    /* adjust RExC_parse so the warning shows after
3506 		       the class closes */
3507 		    while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
3508 			RExC_parse++;
3509 		    Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3510 		}
3511 	    } else {
3512 		/* Maternal grandfather:
3513 		 * "[:" ending in ":" but not in ":]" */
3514 		RExC_parse = s;
3515 	    }
3516 	}
3517     }
3518 
3519     return namedclass;
3520 }
3521 
3522 STATIC void
S_checkposixcc(pTHX_ RExC_state_t * pRExC_state)3523 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
3524 {
3525     if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
3526 	char *s = RExC_parse;
3527  	char  c = *s++;
3528 
3529 	while(*s && isALNUM(*s))
3530 	    s++;
3531 	if (*s && c == *s && s[1] == ']') {
3532 	    if (ckWARN(WARN_REGEXP))
3533 		vWARN3(s+2,
3534 			"POSIX syntax [%c %c] belongs inside character classes",
3535 			c, c);
3536 
3537 	    /* [[=foo=]] and [[.foo.]] are still future. */
3538 	    if (POSIXCC_NOTYET(c)) {
3539 		/* adjust RExC_parse so the error shows after
3540 		   the class closes */
3541 		while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3542 		    ;
3543 		Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3544 	    }
3545 	}
3546     }
3547 }
3548 
3549 STATIC regnode *
S_regclass(pTHX_ RExC_state_t * pRExC_state)3550 S_regclass(pTHX_ RExC_state_t *pRExC_state)
3551 {
3552     register UV value;
3553     register UV nextvalue;
3554     register IV prevvalue = OOB_UNICODE;
3555     register IV range = 0;
3556     register regnode *ret;
3557     STRLEN numlen;
3558     IV namedclass;
3559     char *rangebegin = 0;
3560     bool need_class = 0;
3561     SV *listsv = Nullsv;
3562     register char *e;
3563     UV n;
3564     bool optimize_invert   = TRUE;
3565     AV* unicode_alternate  = 0;
3566 #ifdef EBCDIC
3567     UV literal_endpoint = 0;
3568 #endif
3569 
3570     ret = reganode(pRExC_state, ANYOF, 0);
3571 
3572     if (!SIZE_ONLY)
3573 	ANYOF_FLAGS(ret) = 0;
3574 
3575     if (UCHARAT(RExC_parse) == '^') {	/* Complement of range. */
3576 	RExC_naughty++;
3577 	RExC_parse++;
3578 	if (!SIZE_ONLY)
3579 	    ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3580     }
3581 
3582     if (SIZE_ONLY)
3583 	RExC_size += ANYOF_SKIP;
3584     else {
3585  	RExC_emit += ANYOF_SKIP;
3586 	if (FOLD)
3587 	    ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3588 	if (LOC)
3589 	    ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3590 	ANYOF_BITMAP_ZERO(ret);
3591 	listsv = newSVpvn("# comment\n", 10);
3592     }
3593 
3594     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3595 
3596     if (!SIZE_ONLY && POSIXCC(nextvalue))
3597 	checkposixcc(pRExC_state);
3598 
3599     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3600     if (UCHARAT(RExC_parse) == ']')
3601 	goto charclassloop;
3602 
3603     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
3604 
3605     charclassloop:
3606 
3607 	namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3608 
3609 	if (!range)
3610 	    rangebegin = RExC_parse;
3611 	if (UTF) {
3612 	    value = utf8n_to_uvchr((U8*)RExC_parse,
3613 				   RExC_end - RExC_parse,
3614 				   &numlen, 0);
3615 	    RExC_parse += numlen;
3616 	}
3617 	else
3618 	    value = UCHARAT(RExC_parse++);
3619 	nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3620 	if (value == '[' && POSIXCC(nextvalue))
3621 	    namedclass = regpposixcc(pRExC_state, value);
3622 	else if (value == '\\') {
3623 	    if (UTF) {
3624 		value = utf8n_to_uvchr((U8*)RExC_parse,
3625 				   RExC_end - RExC_parse,
3626 				   &numlen, 0);
3627 		RExC_parse += numlen;
3628 	    }
3629 	    else
3630 		value = UCHARAT(RExC_parse++);
3631 	    /* Some compilers cannot handle switching on 64-bit integer
3632 	     * values, therefore value cannot be an UV.  Yes, this will
3633 	     * be a problem later if we want switch on Unicode.
3634 	     * A similar issue a little bit later when switching on
3635 	     * namedclass. --jhi */
3636 	    switch ((I32)value) {
3637 	    case 'w':	namedclass = ANYOF_ALNUM;	break;
3638 	    case 'W':	namedclass = ANYOF_NALNUM;	break;
3639 	    case 's':	namedclass = ANYOF_SPACE;	break;
3640 	    case 'S':	namedclass = ANYOF_NSPACE;	break;
3641 	    case 'd':	namedclass = ANYOF_DIGIT;	break;
3642 	    case 'D':	namedclass = ANYOF_NDIGIT;	break;
3643 	    case 'p':
3644 	    case 'P':
3645 		if (RExC_parse >= RExC_end)
3646 		    vFAIL2("Empty \\%c{}", (U8)value);
3647 		if (*RExC_parse == '{') {
3648 		    U8 c = (U8)value;
3649 		    e = strchr(RExC_parse++, '}');
3650                     if (!e)
3651                         vFAIL2("Missing right brace on \\%c{}", c);
3652 		    while (isSPACE(UCHARAT(RExC_parse)))
3653 		        RExC_parse++;
3654                     if (e == RExC_parse)
3655                         vFAIL2("Empty \\%c{}", c);
3656 		    n = e - RExC_parse;
3657 		    while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3658 		        n--;
3659 		}
3660 		else {
3661 		    e = RExC_parse;
3662 		    n = 1;
3663 		}
3664 		if (!SIZE_ONLY) {
3665 		    if (UCHARAT(RExC_parse) == '^') {
3666 			 RExC_parse++;
3667 			 n--;
3668 			 value = value == 'p' ? 'P' : 'p'; /* toggle */
3669 			 while (isSPACE(UCHARAT(RExC_parse))) {
3670 			      RExC_parse++;
3671 			      n--;
3672 			 }
3673 		    }
3674 		    if (value == 'p')
3675 			 Perl_sv_catpvf(aTHX_ listsv,
3676 					"+utf8::%.*s\n", (int)n, RExC_parse);
3677 		    else
3678 			 Perl_sv_catpvf(aTHX_ listsv,
3679 					"!utf8::%.*s\n", (int)n, RExC_parse);
3680 		}
3681 		RExC_parse = e + 1;
3682 		ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3683 		continue;
3684 	    case 'n':	value = '\n';			break;
3685 	    case 'r':	value = '\r';			break;
3686 	    case 't':	value = '\t';			break;
3687 	    case 'f':	value = '\f';			break;
3688 	    case 'b':	value = '\b';			break;
3689 	    case 'e':	value = ASCII_TO_NATIVE('\033');break;
3690 	    case 'a':	value = ASCII_TO_NATIVE('\007');break;
3691 	    case 'x':
3692 		if (*RExC_parse == '{') {
3693                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3694                         | PERL_SCAN_DISALLOW_PREFIX;
3695 		    e = strchr(RExC_parse++, '}');
3696                     if (!e)
3697                         vFAIL("Missing right brace on \\x{}");
3698 
3699 		    numlen = e - RExC_parse;
3700 		    value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3701 		    RExC_parse = e + 1;
3702 		}
3703 		else {
3704                     I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3705 		    numlen = 2;
3706 		    value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3707 		    RExC_parse += numlen;
3708 		}
3709 		break;
3710 	    case 'c':
3711 		value = UCHARAT(RExC_parse++);
3712 		value = toCTRL(value);
3713 		break;
3714 	    case '0': case '1': case '2': case '3': case '4':
3715 	    case '5': case '6': case '7': case '8': case '9':
3716             {
3717                 I32 flags = 0;
3718 		numlen = 3;
3719 		value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
3720 		RExC_parse += numlen;
3721 		break;
3722             }
3723 	    default:
3724 		if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
3725 		    vWARN2(RExC_parse,
3726 			   "Unrecognized escape \\%c in character class passed through",
3727 			   (int)value);
3728 		break;
3729 	    }
3730 	} /* end of \blah */
3731 #ifdef EBCDIC
3732 	else
3733 	    literal_endpoint++;
3734 #endif
3735 
3736 	if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3737 
3738 	    if (!SIZE_ONLY && !need_class)
3739 		ANYOF_CLASS_ZERO(ret);
3740 
3741 	    need_class = 1;
3742 
3743 	    /* a bad range like a-\d, a-[:digit:] ? */
3744 	    if (range) {
3745 		if (!SIZE_ONLY) {
3746 		    if (ckWARN(WARN_REGEXP))
3747 			vWARN4(RExC_parse,
3748 			       "False [] range \"%*.*s\"",
3749 			       RExC_parse - rangebegin,
3750 			       RExC_parse - rangebegin,
3751 			       rangebegin);
3752 		    if (prevvalue < 256) {
3753 			ANYOF_BITMAP_SET(ret, prevvalue);
3754 			ANYOF_BITMAP_SET(ret, '-');
3755 		    }
3756 		    else {
3757 			ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3758 			Perl_sv_catpvf(aTHX_ listsv,
3759 				       "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
3760 		    }
3761 		}
3762 
3763 		range = 0; /* this was not a true range */
3764 	    }
3765 
3766 	    if (!SIZE_ONLY) {
3767 	        if (namedclass > OOB_NAMEDCLASS)
3768 		    optimize_invert = FALSE;
3769 		/* Possible truncation here but in some 64-bit environments
3770 		 * the compiler gets heartburn about switch on 64-bit values.
3771 		 * A similar issue a little earlier when switching on value.
3772 		 * --jhi */
3773 		switch ((I32)namedclass) {
3774 		case ANYOF_ALNUM:
3775 		    if (LOC)
3776 			ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
3777 		    else {
3778 			for (value = 0; value < 256; value++)
3779 			    if (isALNUM(value))
3780 				ANYOF_BITMAP_SET(ret, value);
3781 		    }
3782 		    Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
3783 		    break;
3784 		case ANYOF_NALNUM:
3785 		    if (LOC)
3786 			ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
3787 		    else {
3788 			for (value = 0; value < 256; value++)
3789 			    if (!isALNUM(value))
3790 				ANYOF_BITMAP_SET(ret, value);
3791 		    }
3792 		    Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
3793 		    break;
3794 		case ANYOF_ALNUMC:
3795 		    if (LOC)
3796 			ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
3797 		    else {
3798 			for (value = 0; value < 256; value++)
3799 			    if (isALNUMC(value))
3800 				ANYOF_BITMAP_SET(ret, value);
3801 		    }
3802 		    Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
3803 		    break;
3804 		case ANYOF_NALNUMC:
3805 		    if (LOC)
3806 			ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
3807 		    else {
3808 			for (value = 0; value < 256; value++)
3809 			    if (!isALNUMC(value))
3810 				ANYOF_BITMAP_SET(ret, value);
3811 		    }
3812 		    Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
3813 		    break;
3814 		case ANYOF_ALPHA:
3815 		    if (LOC)
3816 			ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
3817 		    else {
3818 			for (value = 0; value < 256; value++)
3819 			    if (isALPHA(value))
3820 				ANYOF_BITMAP_SET(ret, value);
3821 		    }
3822 		    Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
3823 		    break;
3824 		case ANYOF_NALPHA:
3825 		    if (LOC)
3826 			ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
3827 		    else {
3828 			for (value = 0; value < 256; value++)
3829 			    if (!isALPHA(value))
3830 				ANYOF_BITMAP_SET(ret, value);
3831 		    }
3832 		    Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
3833 		    break;
3834 		case ANYOF_ASCII:
3835 		    if (LOC)
3836 			ANYOF_CLASS_SET(ret, ANYOF_ASCII);
3837 		    else {
3838 #ifndef EBCDIC
3839 			for (value = 0; value < 128; value++)
3840 			    ANYOF_BITMAP_SET(ret, value);
3841 #else  /* EBCDIC */
3842 			for (value = 0; value < 256; value++) {
3843 			    if (isASCII(value))
3844 			        ANYOF_BITMAP_SET(ret, value);
3845 			}
3846 #endif /* EBCDIC */
3847 		    }
3848 		    Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
3849 		    break;
3850 		case ANYOF_NASCII:
3851 		    if (LOC)
3852 			ANYOF_CLASS_SET(ret, ANYOF_NASCII);
3853 		    else {
3854 #ifndef EBCDIC
3855 			for (value = 128; value < 256; value++)
3856 			    ANYOF_BITMAP_SET(ret, value);
3857 #else  /* EBCDIC */
3858 			for (value = 0; value < 256; value++) {
3859 			    if (!isASCII(value))
3860 			        ANYOF_BITMAP_SET(ret, value);
3861 			}
3862 #endif /* EBCDIC */
3863 		    }
3864 		    Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
3865 		    break;
3866 		case ANYOF_BLANK:
3867 		    if (LOC)
3868 			ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3869 		    else {
3870 			for (value = 0; value < 256; value++)
3871 			    if (isBLANK(value))
3872 				ANYOF_BITMAP_SET(ret, value);
3873 		    }
3874 		    Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
3875 		    break;
3876 		case ANYOF_NBLANK:
3877 		    if (LOC)
3878 			ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3879 		    else {
3880 			for (value = 0; value < 256; value++)
3881 			    if (!isBLANK(value))
3882 				ANYOF_BITMAP_SET(ret, value);
3883 		    }
3884 		    Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
3885 		    break;
3886 		case ANYOF_CNTRL:
3887 		    if (LOC)
3888 			ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
3889 		    else {
3890 			for (value = 0; value < 256; value++)
3891 			    if (isCNTRL(value))
3892 				ANYOF_BITMAP_SET(ret, value);
3893 		    }
3894 		    Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
3895 		    break;
3896 		case ANYOF_NCNTRL:
3897 		    if (LOC)
3898 			ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
3899 		    else {
3900 			for (value = 0; value < 256; value++)
3901 			    if (!isCNTRL(value))
3902 				ANYOF_BITMAP_SET(ret, value);
3903 		    }
3904 		    Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3905 		    break;
3906 		case ANYOF_DIGIT:
3907 		    if (LOC)
3908 			ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3909 		    else {
3910 			/* consecutive digits assumed */
3911 			for (value = '0'; value <= '9'; value++)
3912 			    ANYOF_BITMAP_SET(ret, value);
3913 		    }
3914 		    Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3915 		    break;
3916 		case ANYOF_NDIGIT:
3917 		    if (LOC)
3918 			ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3919 		    else {
3920 			/* consecutive digits assumed */
3921 			for (value = 0; value < '0'; value++)
3922 			    ANYOF_BITMAP_SET(ret, value);
3923 			for (value = '9' + 1; value < 256; value++)
3924 			    ANYOF_BITMAP_SET(ret, value);
3925 		    }
3926 		    Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
3927 		    break;
3928 		case ANYOF_GRAPH:
3929 		    if (LOC)
3930 			ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
3931 		    else {
3932 			for (value = 0; value < 256; value++)
3933 			    if (isGRAPH(value))
3934 				ANYOF_BITMAP_SET(ret, value);
3935 		    }
3936 		    Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
3937 		    break;
3938 		case ANYOF_NGRAPH:
3939 		    if (LOC)
3940 			ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
3941 		    else {
3942 			for (value = 0; value < 256; value++)
3943 			    if (!isGRAPH(value))
3944 				ANYOF_BITMAP_SET(ret, value);
3945 		    }
3946 		    Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
3947 		    break;
3948 		case ANYOF_LOWER:
3949 		    if (LOC)
3950 			ANYOF_CLASS_SET(ret, ANYOF_LOWER);
3951 		    else {
3952 			for (value = 0; value < 256; value++)
3953 			    if (isLOWER(value))
3954 				ANYOF_BITMAP_SET(ret, value);
3955 		    }
3956 		    Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
3957 		    break;
3958 		case ANYOF_NLOWER:
3959 		    if (LOC)
3960 			ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
3961 		    else {
3962 			for (value = 0; value < 256; value++)
3963 			    if (!isLOWER(value))
3964 				ANYOF_BITMAP_SET(ret, value);
3965 		    }
3966 		    Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
3967 		    break;
3968 		case ANYOF_PRINT:
3969 		    if (LOC)
3970 			ANYOF_CLASS_SET(ret, ANYOF_PRINT);
3971 		    else {
3972 			for (value = 0; value < 256; value++)
3973 			    if (isPRINT(value))
3974 				ANYOF_BITMAP_SET(ret, value);
3975 		    }
3976 		    Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
3977 		    break;
3978 		case ANYOF_NPRINT:
3979 		    if (LOC)
3980 			ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
3981 		    else {
3982 			for (value = 0; value < 256; value++)
3983 			    if (!isPRINT(value))
3984 				ANYOF_BITMAP_SET(ret, value);
3985 		    }
3986 		    Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
3987 		    break;
3988 		case ANYOF_PSXSPC:
3989 		    if (LOC)
3990 			ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3991 		    else {
3992 			for (value = 0; value < 256; value++)
3993 			    if (isPSXSPC(value))
3994 				ANYOF_BITMAP_SET(ret, value);
3995 		    }
3996 		    Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
3997 		    break;
3998 		case ANYOF_NPSXSPC:
3999 		    if (LOC)
4000 			ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
4001 		    else {
4002 			for (value = 0; value < 256; value++)
4003 			    if (!isPSXSPC(value))
4004 				ANYOF_BITMAP_SET(ret, value);
4005 		    }
4006 		    Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
4007 		    break;
4008 		case ANYOF_PUNCT:
4009 		    if (LOC)
4010 			ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
4011 		    else {
4012 			for (value = 0; value < 256; value++)
4013 			    if (isPUNCT(value))
4014 				ANYOF_BITMAP_SET(ret, value);
4015 		    }
4016 		    Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
4017 		    break;
4018 		case ANYOF_NPUNCT:
4019 		    if (LOC)
4020 			ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
4021 		    else {
4022 			for (value = 0; value < 256; value++)
4023 			    if (!isPUNCT(value))
4024 				ANYOF_BITMAP_SET(ret, value);
4025 		    }
4026 		    Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
4027 		    break;
4028 		case ANYOF_SPACE:
4029 		    if (LOC)
4030 			ANYOF_CLASS_SET(ret, ANYOF_SPACE);
4031 		    else {
4032 			for (value = 0; value < 256; value++)
4033 			    if (isSPACE(value))
4034 				ANYOF_BITMAP_SET(ret, value);
4035 		    }
4036 		    Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
4037 		    break;
4038 		case ANYOF_NSPACE:
4039 		    if (LOC)
4040 			ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
4041 		    else {
4042 			for (value = 0; value < 256; value++)
4043 			    if (!isSPACE(value))
4044 				ANYOF_BITMAP_SET(ret, value);
4045 		    }
4046 		    Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
4047 		    break;
4048 		case ANYOF_UPPER:
4049 		    if (LOC)
4050 			ANYOF_CLASS_SET(ret, ANYOF_UPPER);
4051 		    else {
4052 			for (value = 0; value < 256; value++)
4053 			    if (isUPPER(value))
4054 				ANYOF_BITMAP_SET(ret, value);
4055 		    }
4056 		    Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
4057 		    break;
4058 		case ANYOF_NUPPER:
4059 		    if (LOC)
4060 			ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
4061 		    else {
4062 			for (value = 0; value < 256; value++)
4063 			    if (!isUPPER(value))
4064 				ANYOF_BITMAP_SET(ret, value);
4065 		    }
4066 		    Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
4067 		    break;
4068 		case ANYOF_XDIGIT:
4069 		    if (LOC)
4070 			ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
4071 		    else {
4072 			for (value = 0; value < 256; value++)
4073 			    if (isXDIGIT(value))
4074 				ANYOF_BITMAP_SET(ret, value);
4075 		    }
4076 		    Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
4077 		    break;
4078 		case ANYOF_NXDIGIT:
4079 		    if (LOC)
4080 			ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
4081 		    else {
4082 			for (value = 0; value < 256; value++)
4083 			    if (!isXDIGIT(value))
4084 				ANYOF_BITMAP_SET(ret, value);
4085 		    }
4086 		    Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
4087 		    break;
4088 		default:
4089 		    vFAIL("Invalid [::] class");
4090 		    break;
4091 		}
4092 		if (LOC)
4093 		    ANYOF_FLAGS(ret) |= ANYOF_CLASS;
4094 		continue;
4095 	    }
4096 	} /* end of namedclass \blah */
4097 
4098 	if (range) {
4099 	    if (prevvalue > (IV)value) /* b-a */ {
4100 		Simple_vFAIL4("Invalid [] range \"%*.*s\"",
4101 			      RExC_parse - rangebegin,
4102 			      RExC_parse - rangebegin,
4103 			      rangebegin);
4104 		range = 0; /* not a valid range */
4105 	    }
4106 	}
4107 	else {
4108 	    prevvalue = value; /* save the beginning of the range */
4109 	    if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4110 		RExC_parse[1] != ']') {
4111 		RExC_parse++;
4112 
4113 		/* a bad range like \w-, [:word:]- ? */
4114 		if (namedclass > OOB_NAMEDCLASS) {
4115 		    if (ckWARN(WARN_REGEXP))
4116 			vWARN4(RExC_parse,
4117 			       "False [] range \"%*.*s\"",
4118 			       RExC_parse - rangebegin,
4119 			       RExC_parse - rangebegin,
4120 			       rangebegin);
4121 		    if (!SIZE_ONLY)
4122 			ANYOF_BITMAP_SET(ret, '-');
4123 		} else
4124 		    range = 1;	/* yeah, it's a range! */
4125 		continue;	/* but do it the next time */
4126 	    }
4127 	}
4128 
4129 	/* now is the next time */
4130 	if (!SIZE_ONLY) {
4131 	    IV i;
4132 
4133 	    if (prevvalue < 256) {
4134 	        IV ceilvalue = value < 256 ? value : 255;
4135 
4136 #ifdef EBCDIC
4137 		/* In EBCDIC [\x89-\x91] should include
4138 		 * the \x8e but [i-j] should not. */
4139 		if (literal_endpoint == 2 &&
4140 		    ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4141 		     (isUPPER(prevvalue) && isUPPER(ceilvalue))))
4142 		{
4143 		    if (isLOWER(prevvalue)) {
4144 			for (i = prevvalue; i <= ceilvalue; i++)
4145 			    if (isLOWER(i))
4146 				ANYOF_BITMAP_SET(ret, i);
4147 		    } else {
4148 			for (i = prevvalue; i <= ceilvalue; i++)
4149 			    if (isUPPER(i))
4150 				ANYOF_BITMAP_SET(ret, i);
4151 		    }
4152 		}
4153 		else
4154 #endif
4155 		      for (i = prevvalue; i <= ceilvalue; i++)
4156 			  ANYOF_BITMAP_SET(ret, i);
4157 	  }
4158 	  if (value > 255 || UTF) {
4159 	        UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
4160 		UV natvalue      = NATIVE_TO_UNI(value);
4161 
4162 		ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4163 		if (prevnatvalue < natvalue) { /* what about > ? */
4164 		    Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
4165 				   prevnatvalue, natvalue);
4166 		}
4167 		else if (prevnatvalue == natvalue) {
4168 		    Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
4169 		    if (FOLD) {
4170 			 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
4171 			 STRLEN foldlen;
4172 			 UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
4173 
4174 			 /* If folding and foldable and a single
4175 			  * character, insert also the folded version
4176 			  * to the charclass. */
4177 			 if (f != value) {
4178 			      if (foldlen == (STRLEN)UNISKIP(f))
4179 				  Perl_sv_catpvf(aTHX_ listsv,
4180 						 "%04"UVxf"\n", f);
4181 			      else {
4182 				  /* Any multicharacter foldings
4183 				   * require the following transform:
4184 				   * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4185 				   * where E folds into "pq" and F folds
4186 				   * into "rst", all other characters
4187 				   * fold to single characters.  We save
4188 				   * away these multicharacter foldings,
4189 				   * to be later saved as part of the
4190 				   * additional "s" data. */
4191 				  SV *sv;
4192 
4193 				  if (!unicode_alternate)
4194 				      unicode_alternate = newAV();
4195 				  sv = newSVpvn((char*)foldbuf, foldlen);
4196 				  SvUTF8_on(sv);
4197 				  av_push(unicode_alternate, sv);
4198 			      }
4199 			 }
4200 
4201 			 /* If folding and the value is one of the Greek
4202 			  * sigmas insert a few more sigmas to make the
4203 			  * folding rules of the sigmas to work right.
4204 			  * Note that not all the possible combinations
4205 			  * are handled here: some of them are handled
4206 			  * by the standard folding rules, and some of
4207 			  * them (literal or EXACTF cases) are handled
4208 			  * during runtime in regexec.c:S_find_byclass(). */
4209 			 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4210 			      Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4211 					     (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
4212 			      Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4213 					     (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4214 			 }
4215 			 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4216 			      Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4217 					     (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4218 		    }
4219 		}
4220 	    }
4221 #ifdef EBCDIC
4222 	    literal_endpoint = 0;
4223 #endif
4224         }
4225 
4226 	range = 0; /* this range (if it was one) is done now */
4227     }
4228 
4229     if (need_class) {
4230 	ANYOF_FLAGS(ret) |= ANYOF_LARGE;
4231 	if (SIZE_ONLY)
4232 	    RExC_size += ANYOF_CLASS_ADD_SKIP;
4233 	else
4234 	    RExC_emit += ANYOF_CLASS_ADD_SKIP;
4235     }
4236 
4237     /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
4238     if (!SIZE_ONLY &&
4239 	 /* If the only flag is folding (plus possibly inversion). */
4240 	((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4241        ) {
4242 	for (value = 0; value < 256; ++value) {
4243 	    if (ANYOF_BITMAP_TEST(ret, value)) {
4244 		UV fold = PL_fold[value];
4245 
4246 		if (fold != value)
4247 		    ANYOF_BITMAP_SET(ret, fold);
4248 	    }
4249 	}
4250 	ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
4251     }
4252 
4253     /* optimize inverted simple patterns (e.g. [^a-z]) */
4254     if (!SIZE_ONLY && optimize_invert &&
4255 	/* If the only flag is inversion. */
4256 	(ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) ==	ANYOF_INVERT) {
4257 	for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
4258 	    ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
4259 	ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
4260     }
4261 
4262     if (!SIZE_ONLY) {
4263 	AV *av = newAV();
4264 	SV *rv;
4265 
4266 	/* The 0th element stores the character class description
4267 	 * in its textual form: used later (regexec.c:Perl_regclass_swash())
4268 	 * to initialize the appropriate swash (which gets stored in
4269 	 * the 1st element), and also useful for dumping the regnode.
4270 	 * The 2nd element stores the multicharacter foldings,
4271 	 * used later (regexec.c:S_reginclass()). */
4272 	av_store(av, 0, listsv);
4273 	av_store(av, 1, NULL);
4274 	av_store(av, 2, (SV*)unicode_alternate);
4275 	rv = newRV_noinc((SV*)av);
4276 	n = add_data(pRExC_state, 1, "s");
4277 	RExC_rx->data->data[n] = (void*)rv;
4278 	ARG_SET(ret, n);
4279     }
4280 
4281     return ret;
4282 }
4283 
4284 STATIC char*
S_nextchar(pTHX_ RExC_state_t * pRExC_state)4285 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
4286 {
4287     char* retval = RExC_parse++;
4288 
4289     for (;;) {
4290 	if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4291 		RExC_parse[2] == '#') {
4292 	    while (*RExC_parse != ')') {
4293 		if (RExC_parse == RExC_end)
4294 		    FAIL("Sequence (?#... not terminated");
4295 		RExC_parse++;
4296 	    }
4297 	    RExC_parse++;
4298 	    continue;
4299 	}
4300 	if (RExC_flags & PMf_EXTENDED) {
4301 	    if (isSPACE(*RExC_parse)) {
4302 		RExC_parse++;
4303 		continue;
4304 	    }
4305 	    else if (*RExC_parse == '#') {
4306 		while (RExC_parse < RExC_end)
4307 		    if (*RExC_parse++ == '\n') break;
4308 		continue;
4309 	    }
4310 	}
4311 	return retval;
4312     }
4313 }
4314 
4315 /*
4316 - reg_node - emit a node
4317 */
4318 STATIC regnode *			/* Location. */
S_reg_node(pTHX_ RExC_state_t * pRExC_state,U8 op)4319 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
4320 {
4321     register regnode *ret;
4322     register regnode *ptr;
4323 
4324     ret = RExC_emit;
4325     if (SIZE_ONLY) {
4326 	SIZE_ALIGN(RExC_size);
4327 	RExC_size += 1;
4328 	return(ret);
4329     }
4330 
4331     NODE_ALIGN_FILL(ret);
4332     ptr = ret;
4333     FILL_ADVANCE_NODE(ptr, op);
4334     if (RExC_offsets) {         /* MJD */
4335 	MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
4336               "reg_node", __LINE__,
4337               reg_name[op],
4338               RExC_emit - RExC_emit_start > RExC_offsets[0]
4339               ? "Overwriting end of array!\n" : "OK",
4340               RExC_emit - RExC_emit_start,
4341               RExC_parse - RExC_start,
4342               RExC_offsets[0]));
4343 	Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4344     }
4345 
4346     RExC_emit = ptr;
4347 
4348     return(ret);
4349 }
4350 
4351 /*
4352 - reganode - emit a node with an argument
4353 */
4354 STATIC regnode *			/* Location. */
S_reganode(pTHX_ RExC_state_t * pRExC_state,U8 op,U32 arg)4355 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
4356 {
4357     register regnode *ret;
4358     register regnode *ptr;
4359 
4360     ret = RExC_emit;
4361     if (SIZE_ONLY) {
4362 	SIZE_ALIGN(RExC_size);
4363 	RExC_size += 2;
4364 	return(ret);
4365     }
4366 
4367     NODE_ALIGN_FILL(ret);
4368     ptr = ret;
4369     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
4370     if (RExC_offsets) {         /* MJD */
4371 	MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
4372               "reganode",
4373 	      __LINE__,
4374 	      reg_name[op],
4375               RExC_emit - RExC_emit_start > RExC_offsets[0] ?
4376               "Overwriting end of array!\n" : "OK",
4377               RExC_emit - RExC_emit_start,
4378               RExC_parse - RExC_start,
4379               RExC_offsets[0]));
4380 	Set_Cur_Node_Offset;
4381     }
4382 
4383     RExC_emit = ptr;
4384 
4385     return(ret);
4386 }
4387 
4388 /*
4389 - reguni - emit (if appropriate) a Unicode character
4390 */
4391 STATIC void
S_reguni(pTHX_ RExC_state_t * pRExC_state,UV uv,char * s,STRLEN * lenp)4392 S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
4393 {
4394     *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
4395 }
4396 
4397 /*
4398 - reginsert - insert an operator in front of already-emitted operand
4399 *
4400 * Means relocating the operand.
4401 */
4402 STATIC void
S_reginsert(pTHX_ RExC_state_t * pRExC_state,U8 op,regnode * opnd)4403 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
4404 {
4405     register regnode *src;
4406     register regnode *dst;
4407     register regnode *place;
4408     register int offset = regarglen[(U8)op];
4409 
4410 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
4411 
4412     if (SIZE_ONLY) {
4413 	RExC_size += NODE_STEP_REGNODE + offset;
4414 	return;
4415     }
4416 
4417     src = RExC_emit;
4418     RExC_emit += NODE_STEP_REGNODE + offset;
4419     dst = RExC_emit;
4420     while (src > opnd) {
4421 	StructCopy(--src, --dst, regnode);
4422         if (RExC_offsets) {     /* MJD 20010112 */
4423 	    MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
4424                   "reg_insert",
4425 		  __LINE__,
4426 		  reg_name[op],
4427                   dst - RExC_emit_start > RExC_offsets[0]
4428                   ? "Overwriting end of array!\n" : "OK",
4429                   src - RExC_emit_start,
4430                   dst - RExC_emit_start,
4431                   RExC_offsets[0]));
4432 	    Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4433 	    Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4434         }
4435     }
4436 
4437 
4438     place = opnd;		/* Op node, where operand used to be. */
4439     if (RExC_offsets) {         /* MJD */
4440 	MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
4441               "reginsert",
4442 	      __LINE__,
4443 	      reg_name[op],
4444               place - RExC_emit_start > RExC_offsets[0]
4445               ? "Overwriting end of array!\n" : "OK",
4446               place - RExC_emit_start,
4447               RExC_parse - RExC_start,
4448               RExC_offsets[0]));
4449 	Set_Node_Offset(place, RExC_parse);
4450 	Set_Node_Length(place, 1);
4451     }
4452     src = NEXTOPER(place);
4453     FILL_ADVANCE_NODE(place, op);
4454     Zero(src, offset, regnode);
4455 }
4456 
4457 /*
4458 - regtail - set the next-pointer at the end of a node chain of p to val.
4459 */
4460 STATIC void
S_regtail(pTHX_ RExC_state_t * pRExC_state,regnode * p,regnode * val)4461 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4462 {
4463     register regnode *scan;
4464     register regnode *temp;
4465 
4466     if (SIZE_ONLY)
4467 	return;
4468 
4469     /* Find last node. */
4470     scan = p;
4471     for (;;) {
4472 	temp = regnext(scan);
4473 	if (temp == NULL)
4474 	    break;
4475 	scan = temp;
4476     }
4477 
4478     if (reg_off_by_arg[OP(scan)]) {
4479 	ARG_SET(scan, val - scan);
4480     }
4481     else {
4482 	NEXT_OFF(scan) = val - scan;
4483     }
4484 }
4485 
4486 /*
4487 - regoptail - regtail on operand of first argument; nop if operandless
4488 */
4489 STATIC void
S_regoptail(pTHX_ RExC_state_t * pRExC_state,regnode * p,regnode * val)4490 S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4491 {
4492     /* "Operandless" and "op != BRANCH" are synonymous in practice. */
4493     if (p == NULL || SIZE_ONLY)
4494 	return;
4495     if (PL_regkind[(U8)OP(p)] == BRANCH) {
4496 	regtail(pRExC_state, NEXTOPER(p), val);
4497     }
4498     else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
4499 	regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
4500     }
4501     else
4502 	return;
4503 }
4504 
4505 /*
4506  - regcurly - a little FSA that accepts {\d+,?\d*}
4507  */
4508 STATIC I32
S_regcurly(pTHX_ register char * s)4509 S_regcurly(pTHX_ register char *s)
4510 {
4511     if (*s++ != '{')
4512 	return FALSE;
4513     if (!isDIGIT(*s))
4514 	return FALSE;
4515     while (isDIGIT(*s))
4516 	s++;
4517     if (*s == ',')
4518 	s++;
4519     while (isDIGIT(*s))
4520 	s++;
4521     if (*s != '}')
4522 	return FALSE;
4523     return TRUE;
4524 }
4525 
4526 
4527 #ifdef DEBUGGING
4528 
4529 STATIC regnode *
S_dumpuntil(pTHX_ regnode * start,regnode * node,regnode * last,SV * sv,I32 l)4530 S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
4531 {
4532     register U8 op = EXACT;	/* Arbitrary non-END op. */
4533     register regnode *next;
4534 
4535     while (op != END && (!last || node < last)) {
4536 	/* While that wasn't END last time... */
4537 
4538 	NODE_ALIGN(node);
4539 	op = OP(node);
4540 	if (op == CLOSE)
4541 	    l--;
4542 	next = regnext(node);
4543 	/* Where, what. */
4544 	if (OP(node) == OPTIMIZED)
4545 	    goto after_print;
4546 	regprop(sv, node);
4547 	PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
4548 		      (int)(2*l + 1), "", SvPVX(sv));
4549 	if (next == NULL)		/* Next ptr. */
4550 	    PerlIO_printf(Perl_debug_log, "(0)");
4551 	else
4552 	    PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
4553 	(void)PerlIO_putc(Perl_debug_log, '\n');
4554       after_print:
4555 	if (PL_regkind[(U8)op] == BRANCHJ) {
4556 	    register regnode *nnode = (OP(next) == LONGJMP
4557 				       ? regnext(next)
4558 				       : next);
4559 	    if (last && nnode > last)
4560 		nnode = last;
4561 	    node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
4562 	}
4563 	else if (PL_regkind[(U8)op] == BRANCH) {
4564 	    node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
4565 	}
4566 	else if ( op == CURLY) {   /* `next' might be very big: optimizer */
4567 	    node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4568 			     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
4569 	}
4570 	else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
4571 	    node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4572 			     next, sv, l + 1);
4573 	}
4574 	else if ( op == PLUS || op == STAR) {
4575 	    node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
4576 	}
4577 	else if (op == ANYOF) {
4578 	    /* arglen 1 + class block */
4579 	    node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4580 		    ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4581 	    node = NEXTOPER(node);
4582 	}
4583 	else if (PL_regkind[(U8)op] == EXACT) {
4584             /* Literal string, where present. */
4585 	    node += NODE_SZ_STR(node) - 1;
4586 	    node = NEXTOPER(node);
4587 	}
4588 	else {
4589 	    node = NEXTOPER(node);
4590 	    node += regarglen[(U8)op];
4591 	}
4592 	if (op == CURLYX || op == OPEN)
4593 	    l++;
4594 	else if (op == WHILEM)
4595 	    l--;
4596     }
4597     return node;
4598 }
4599 
4600 #endif	/* DEBUGGING */
4601 
4602 /*
4603  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
4604  */
4605 void
Perl_regdump(pTHX_ regexp * r)4606 Perl_regdump(pTHX_ regexp *r)
4607 {
4608 #ifdef DEBUGGING
4609     SV *sv = sv_newmortal();
4610 
4611     (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
4612 
4613     /* Header fields of interest. */
4614     if (r->anchored_substr)
4615 	PerlIO_printf(Perl_debug_log,
4616 		      "anchored `%s%.*s%s'%s at %"IVdf" ",
4617 		      PL_colors[0],
4618 		      (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
4619 		      SvPVX(r->anchored_substr),
4620 		      PL_colors[1],
4621 		      SvTAIL(r->anchored_substr) ? "$" : "",
4622 		      (IV)r->anchored_offset);
4623     else if (r->anchored_utf8)
4624 	PerlIO_printf(Perl_debug_log,
4625 		      "anchored utf8 `%s%.*s%s'%s at %"IVdf" ",
4626 		      PL_colors[0],
4627 		      (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
4628 		      SvPVX(r->anchored_utf8),
4629 		      PL_colors[1],
4630 		      SvTAIL(r->anchored_utf8) ? "$" : "",
4631 		      (IV)r->anchored_offset);
4632     if (r->float_substr)
4633 	PerlIO_printf(Perl_debug_log,
4634 		      "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4635 		      PL_colors[0],
4636 		      (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
4637 		      SvPVX(r->float_substr),
4638 		      PL_colors[1],
4639 		      SvTAIL(r->float_substr) ? "$" : "",
4640 		      (IV)r->float_min_offset, (UV)r->float_max_offset);
4641     else if (r->float_utf8)
4642 	PerlIO_printf(Perl_debug_log,
4643 		      "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4644 		      PL_colors[0],
4645 		      (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
4646 		      SvPVX(r->float_utf8),
4647 		      PL_colors[1],
4648 		      SvTAIL(r->float_utf8) ? "$" : "",
4649 		      (IV)r->float_min_offset, (UV)r->float_max_offset);
4650     if (r->check_substr || r->check_utf8)
4651 	PerlIO_printf(Perl_debug_log,
4652 		      r->check_substr == r->float_substr
4653 		      && r->check_utf8 == r->float_utf8
4654 		      ? "(checking floating" : "(checking anchored");
4655     if (r->reganch & ROPT_NOSCAN)
4656 	PerlIO_printf(Perl_debug_log, " noscan");
4657     if (r->reganch & ROPT_CHECK_ALL)
4658 	PerlIO_printf(Perl_debug_log, " isall");
4659     if (r->check_substr || r->check_utf8)
4660 	PerlIO_printf(Perl_debug_log, ") ");
4661 
4662     if (r->regstclass) {
4663 	regprop(sv, r->regstclass);
4664 	PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4665     }
4666     if (r->reganch & ROPT_ANCH) {
4667 	PerlIO_printf(Perl_debug_log, "anchored");
4668 	if (r->reganch & ROPT_ANCH_BOL)
4669 	    PerlIO_printf(Perl_debug_log, "(BOL)");
4670 	if (r->reganch & ROPT_ANCH_MBOL)
4671 	    PerlIO_printf(Perl_debug_log, "(MBOL)");
4672 	if (r->reganch & ROPT_ANCH_SBOL)
4673 	    PerlIO_printf(Perl_debug_log, "(SBOL)");
4674 	if (r->reganch & ROPT_ANCH_GPOS)
4675 	    PerlIO_printf(Perl_debug_log, "(GPOS)");
4676 	PerlIO_putc(Perl_debug_log, ' ');
4677     }
4678     if (r->reganch & ROPT_GPOS_SEEN)
4679 	PerlIO_printf(Perl_debug_log, "GPOS ");
4680     if (r->reganch & ROPT_SKIP)
4681 	PerlIO_printf(Perl_debug_log, "plus ");
4682     if (r->reganch & ROPT_IMPLICIT)
4683 	PerlIO_printf(Perl_debug_log, "implicit ");
4684     PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
4685     if (r->reganch & ROPT_EVAL_SEEN)
4686 	PerlIO_printf(Perl_debug_log, "with eval ");
4687     PerlIO_printf(Perl_debug_log, "\n");
4688     if (r->offsets) {
4689       U32 i;
4690       U32 len = r->offsets[0];
4691       PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4692       for (i = 1; i <= len; i++)
4693         PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
4694                       (UV)r->offsets[i*2-1],
4695                       (UV)r->offsets[i*2]);
4696       PerlIO_printf(Perl_debug_log, "\n");
4697     }
4698 #endif	/* DEBUGGING */
4699 }
4700 
4701 #ifdef DEBUGGING
4702 
4703 STATIC void
S_put_byte(pTHX_ SV * sv,int c)4704 S_put_byte(pTHX_ SV *sv, int c)
4705 {
4706     if (isCNTRL(c) || c == 255 || !isPRINT(c))
4707 	Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4708     else if (c == '-' || c == ']' || c == '\\' || c == '^')
4709 	Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4710     else
4711 	Perl_sv_catpvf(aTHX_ sv, "%c", c);
4712 }
4713 
4714 #endif	/* DEBUGGING */
4715 
4716 /*
4717 - regprop - printable representation of opcode
4718 */
4719 void
Perl_regprop(pTHX_ SV * sv,regnode * o)4720 Perl_regprop(pTHX_ SV *sv, regnode *o)
4721 {
4722 #ifdef DEBUGGING
4723     register int k;
4724 
4725     sv_setpvn(sv, "", 0);
4726     if (OP(o) >= reg_num)		/* regnode.type is unsigned */
4727 	/* It would be nice to FAIL() here, but this may be called from
4728 	   regexec.c, and it would be hard to supply pRExC_state. */
4729 	Perl_croak(aTHX_ "Corrupted regexp opcode");
4730     sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4731 
4732     k = PL_regkind[(U8)OP(o)];
4733 
4734     if (k == EXACT) {
4735         SV *dsv = sv_2mortal(newSVpvn("", 0));
4736 	/* Using is_utf8_string() is a crude hack but it may
4737 	 * be the best for now since we have no flag "this EXACTish
4738 	 * node was UTF-8" --jhi */
4739 	bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
4740 	char *s    = do_utf8 ?
4741 	  pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4742 			 UNI_DISPLAY_REGEX) :
4743 	  STRING(o);
4744 	int len = do_utf8 ?
4745 	  strlen(s) :
4746 	  STR_LEN(o);
4747 	Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4748 		       PL_colors[0],
4749 		       len, s,
4750 		       PL_colors[1]);
4751     }
4752     else if (k == CURLY) {
4753 	if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
4754 	    Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4755 	Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
4756     }
4757     else if (k == WHILEM && o->flags)			/* Ordinal/of */
4758 	Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
4759     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
4760 	Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));	/* Parenth number */
4761     else if (k == LOGICAL)
4762 	Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);	/* 2: embedded, otherwise 1 */
4763     else if (k == ANYOF) {
4764 	int i, rangestart = -1;
4765 	U8 flags = ANYOF_FLAGS(o);
4766 	const char * const anyofs[] = {	/* Should be synchronized with
4767 					 * ANYOF_ #xdefines in regcomp.h */
4768 	    "\\w",
4769 	    "\\W",
4770 	    "\\s",
4771 	    "\\S",
4772 	    "\\d",
4773 	    "\\D",
4774 	    "[:alnum:]",
4775 	    "[:^alnum:]",
4776 	    "[:alpha:]",
4777 	    "[:^alpha:]",
4778 	    "[:ascii:]",
4779 	    "[:^ascii:]",
4780 	    "[:ctrl:]",
4781 	    "[:^ctrl:]",
4782 	    "[:graph:]",
4783 	    "[:^graph:]",
4784 	    "[:lower:]",
4785 	    "[:^lower:]",
4786 	    "[:print:]",
4787 	    "[:^print:]",
4788 	    "[:punct:]",
4789 	    "[:^punct:]",
4790 	    "[:upper:]",
4791 	    "[:^upper:]",
4792 	    "[:xdigit:]",
4793 	    "[:^xdigit:]",
4794 	    "[:space:]",
4795 	    "[:^space:]",
4796 	    "[:blank:]",
4797 	    "[:^blank:]"
4798 	};
4799 
4800 	if (flags & ANYOF_LOCALE)
4801 	    sv_catpv(sv, "{loc}");
4802 	if (flags & ANYOF_FOLD)
4803 	    sv_catpv(sv, "{i}");
4804 	Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
4805 	if (flags & ANYOF_INVERT)
4806 	    sv_catpv(sv, "^");
4807 	for (i = 0; i <= 256; i++) {
4808 	    if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4809 		if (rangestart == -1)
4810 		    rangestart = i;
4811 	    } else if (rangestart != -1) {
4812 		if (i <= rangestart + 3)
4813 		    for (; rangestart < i; rangestart++)
4814 			put_byte(sv, rangestart);
4815 		else {
4816 		    put_byte(sv, rangestart);
4817 		    sv_catpv(sv, "-");
4818 		    put_byte(sv, i - 1);
4819 		}
4820 		rangestart = -1;
4821 	    }
4822 	}
4823 
4824 	if (o->flags & ANYOF_CLASS)
4825 	    for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4826 		if (ANYOF_CLASS_TEST(o,i))
4827 		    sv_catpv(sv, anyofs[i]);
4828 
4829 	if (flags & ANYOF_UNICODE)
4830 	    sv_catpv(sv, "{unicode}");
4831 	else if (flags & ANYOF_UNICODE_ALL)
4832 	    sv_catpv(sv, "{unicode_all}");
4833 
4834 	{
4835 	    SV *lv;
4836 	    SV *sw = regclass_swash(o, FALSE, &lv, 0);
4837 
4838 	    if (lv) {
4839 		if (sw) {
4840 		    U8 s[UTF8_MAXLEN+1];
4841 
4842 		    for (i = 0; i <= 256; i++) { /* just the first 256 */
4843 			U8 *e = uvchr_to_utf8(s, i);
4844 
4845 			if (i < 256 && swash_fetch(sw, s, TRUE)) {
4846 			    if (rangestart == -1)
4847 				rangestart = i;
4848 			} else if (rangestart != -1) {
4849 			    U8 *p;
4850 
4851 			    if (i <= rangestart + 3)
4852 				for (; rangestart < i; rangestart++) {
4853 				    for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4854 					put_byte(sv, *p);
4855 				}
4856 			    else {
4857 				for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4858 				    put_byte(sv, *p);
4859 				sv_catpv(sv, "-");
4860 				    for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
4861 					put_byte(sv, *p);
4862 				}
4863 				rangestart = -1;
4864 			    }
4865 			}
4866 
4867 		    sv_catpv(sv, "..."); /* et cetera */
4868 		}
4869 
4870 		{
4871 		    char *s = savepv(SvPVX(lv));
4872 		    char *origs = s;
4873 
4874 		    while(*s && *s != '\n') s++;
4875 
4876 		    if (*s == '\n') {
4877 			char *t = ++s;
4878 
4879 			while (*s) {
4880 			    if (*s == '\n')
4881 				*s = ' ';
4882 			    s++;
4883 			}
4884 			if (s[-1] == ' ')
4885 			    s[-1] = 0;
4886 
4887 			sv_catpv(sv, t);
4888 		    }
4889 
4890 		    Safefree(origs);
4891 		}
4892 	    }
4893 	}
4894 
4895 	Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4896     }
4897     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
4898 	Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
4899 #endif	/* DEBUGGING */
4900 }
4901 
4902 SV *
Perl_re_intuit_string(pTHX_ regexp * prog)4903 Perl_re_intuit_string(pTHX_ regexp *prog)
4904 {				/* Assume that RE_INTUIT is set */
4905     DEBUG_r(
4906 	{   STRLEN n_a;
4907 	    char *s = SvPV(prog->check_substr
4908 		      ? prog->check_substr : prog->check_utf8, n_a);
4909 
4910 	    if (!PL_colorset) reginitcolors();
4911 	    PerlIO_printf(Perl_debug_log,
4912 		      "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n",
4913 		      PL_colors[4],
4914 		      prog->check_substr ? "" : "utf8 ",
4915 		      PL_colors[5],PL_colors[0],
4916 		      s,
4917 		      PL_colors[1],
4918 		      (strlen(s) > 60 ? "..." : ""));
4919 	} );
4920 
4921     return prog->check_substr ? prog->check_substr : prog->check_utf8;
4922 }
4923 
4924 void
Perl_pregfree(pTHX_ struct regexp * r)4925 Perl_pregfree(pTHX_ struct regexp *r)
4926 {
4927 #ifdef DEBUGGING
4928     SV *dsv = PERL_DEBUG_PAD_ZERO(0);
4929 #endif
4930 
4931     if (!r || (--r->refcnt > 0))
4932 	return;
4933     DEBUG_r({
4934 	 int len;
4935          char *s;
4936 
4937 	 s = (r->reganch & ROPT_UTF8) ? pv_uni_display(dsv, (U8*)r->precomp,
4938 		r->prelen, 60, UNI_DISPLAY_REGEX)
4939             : pv_display(dsv, r->precomp, r->prelen, 0, 60);
4940 	 len = SvCUR(dsv);
4941 	 if (!PL_colorset)
4942 	      reginitcolors();
4943 	 PerlIO_printf(Perl_debug_log,
4944 		       "%sFreeing REx:%s `%s%*.*s%s%s'\n",
4945 		       PL_colors[4],PL_colors[5],PL_colors[0],
4946 		       len, len, s,
4947 		       PL_colors[1],
4948 		       len > 60 ? "..." : "");
4949     });
4950 
4951     if (r->precomp)
4952 	Safefree(r->precomp);
4953     if (r->offsets)             /* 20010421 MJD */
4954 	Safefree(r->offsets);
4955     if (RX_MATCH_COPIED(r))
4956 	Safefree(r->subbeg);
4957     if (r->substrs) {
4958 	if (r->anchored_substr)
4959 	    SvREFCNT_dec(r->anchored_substr);
4960 	if (r->anchored_utf8)
4961 	    SvREFCNT_dec(r->anchored_utf8);
4962 	if (r->float_substr)
4963 	    SvREFCNT_dec(r->float_substr);
4964 	if (r->float_utf8)
4965 	    SvREFCNT_dec(r->float_utf8);
4966 	Safefree(r->substrs);
4967     }
4968     if (r->data) {
4969 	int n = r->data->count;
4970 	PAD* new_comppad = NULL;
4971 	PAD* old_comppad;
4972 
4973 	while (--n >= 0) {
4974           /* If you add a ->what type here, update the comment in regcomp.h */
4975 	    switch (r->data->what[n]) {
4976 	    case 's':
4977 		SvREFCNT_dec((SV*)r->data->data[n]);
4978 		break;
4979 	    case 'f':
4980 		Safefree(r->data->data[n]);
4981 		break;
4982 	    case 'p':
4983 		new_comppad = (AV*)r->data->data[n];
4984 		break;
4985 	    case 'o':
4986 		if (new_comppad == NULL)
4987 		    Perl_croak(aTHX_ "panic: pregfree comppad");
4988 		PAD_SAVE_LOCAL(old_comppad,
4989 		    /* Watch out for global destruction's random ordering. */
4990 		    (SvTYPE(new_comppad) == SVt_PVAV) ?
4991 		    		new_comppad : Null(PAD *)
4992 		);
4993 		if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) {
4994                     op_free((OP_4tree*)r->data->data[n]);
4995 		}
4996 
4997 		PAD_RESTORE_LOCAL(old_comppad);
4998 		SvREFCNT_dec((SV*)new_comppad);
4999 		new_comppad = NULL;
5000 		break;
5001 	    case 'n':
5002 	        break;
5003 	    default:
5004 		Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
5005 	    }
5006 	}
5007 	Safefree(r->data->what);
5008 	Safefree(r->data);
5009     }
5010     Safefree(r->startp);
5011     Safefree(r->endp);
5012     Safefree(r);
5013 }
5014 
5015 /*
5016  - regnext - dig the "next" pointer out of a node
5017  *
5018  * [Note, when REGALIGN is defined there are two places in regmatch()
5019  * that bypass this code for speed.]
5020  */
5021 regnode *
Perl_regnext(pTHX_ register regnode * p)5022 Perl_regnext(pTHX_ register regnode *p)
5023 {
5024     register I32 offset;
5025 
5026     if (p == &PL_regdummy)
5027 	return(NULL);
5028 
5029     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
5030     if (offset == 0)
5031 	return(NULL);
5032 
5033     return(p+offset);
5034 }
5035 
5036 STATIC void
S_re_croak2(pTHX_ const char * pat1,const char * pat2,...)5037 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
5038 {
5039     va_list args;
5040     STRLEN l1 = strlen(pat1);
5041     STRLEN l2 = strlen(pat2);
5042     char buf[512];
5043     SV *msv;
5044     char *message;
5045 
5046     if (l1 > 510)
5047 	l1 = 510;
5048     if (l1 + l2 > 510)
5049 	l2 = 510 - l1;
5050     Copy(pat1, buf, l1 , char);
5051     Copy(pat2, buf + l1, l2 , char);
5052     buf[l1 + l2] = '\n';
5053     buf[l1 + l2 + 1] = '\0';
5054 #ifdef I_STDARG
5055     /* ANSI variant takes additional second argument */
5056     va_start(args, pat2);
5057 #else
5058     va_start(args);
5059 #endif
5060     msv = vmess(buf, &args);
5061     va_end(args);
5062     message = SvPV(msv,l1);
5063     if (l1 > 512)
5064 	l1 = 512;
5065     Copy(message, buf, l1 , char);
5066     buf[l1-1] = '\0';			/* Overwrite \n */
5067     Perl_croak(aTHX_ "%s", buf);
5068 }
5069 
5070 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
5071 
5072 void
Perl_save_re_context(pTHX)5073 Perl_save_re_context(pTHX)
5074 {
5075     SAVEI32(PL_reg_flags);		/* from regexec.c */
5076     SAVEPPTR(PL_bostr);
5077     SAVEPPTR(PL_reginput);		/* String-input pointer. */
5078     SAVEPPTR(PL_regbol);		/* Beginning of input, for ^ check. */
5079     SAVEPPTR(PL_regeol);		/* End of input, for $ check. */
5080     SAVEVPTR(PL_regstartp);		/* Pointer to startp array. */
5081     SAVEVPTR(PL_regendp);		/* Ditto for endp. */
5082     SAVEVPTR(PL_reglastparen);		/* Similarly for lastparen. */
5083     SAVEVPTR(PL_reglastcloseparen);	/* Similarly for lastcloseparen. */
5084     SAVEPPTR(PL_regtill);		/* How far we are required to go. */
5085     SAVEGENERICPV(PL_reg_start_tmp);		/* from regexec.c */
5086     PL_reg_start_tmp = 0;
5087     SAVEI32(PL_reg_start_tmpl);		/* from regexec.c */
5088     PL_reg_start_tmpl = 0;
5089     SAVEVPTR(PL_regdata);
5090     SAVEI32(PL_reg_eval_set);		/* from regexec.c */
5091     SAVEI32(PL_regnarrate);		/* from regexec.c */
5092     SAVEVPTR(PL_regprogram);		/* from regexec.c */
5093     SAVEINT(PL_regindent);		/* from regexec.c */
5094     SAVEVPTR(PL_regcc);			/* from regexec.c */
5095     SAVEVPTR(PL_curcop);
5096     SAVEVPTR(PL_reg_call_cc);		/* from regexec.c */
5097     SAVEVPTR(PL_reg_re);		/* from regexec.c */
5098     SAVEPPTR(PL_reg_ganch);		/* from regexec.c */
5099     SAVESPTR(PL_reg_sv);		/* from regexec.c */
5100     SAVEBOOL(PL_reg_match_utf8);	/* from regexec.c */
5101     SAVEVPTR(PL_reg_magic);		/* from regexec.c */
5102     SAVEI32(PL_reg_oldpos);			/* from regexec.c */
5103     SAVEVPTR(PL_reg_oldcurpm);		/* from regexec.c */
5104     SAVEVPTR(PL_reg_curpm);		/* from regexec.c */
5105     SAVEPPTR(PL_reg_oldsaved);		/* old saved substr during match */
5106     PL_reg_oldsaved = Nullch;
5107     SAVEI32(PL_reg_oldsavedlen);	/* old length of saved substr during match */
5108     PL_reg_oldsavedlen = 0;
5109     SAVEI32(PL_reg_maxiter);		/* max wait until caching pos */
5110     PL_reg_maxiter = 0;
5111     SAVEI32(PL_reg_leftiter);		/* wait until caching pos */
5112     PL_reg_leftiter = 0;
5113     SAVEGENERICPV(PL_reg_poscache);	/* cache of pos of WHILEM */
5114     PL_reg_poscache = Nullch;
5115     SAVEI32(PL_reg_poscache_size);	/* size of pos cache of WHILEM */
5116     PL_reg_poscache_size = 0;
5117     SAVEPPTR(PL_regprecomp);		/* uncompiled string. */
5118     SAVEI32(PL_regnpar);		/* () count. */
5119     SAVEI32(PL_regsize);		/* from regexec.c */
5120 
5121     {
5122 	/* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
5123 	U32 i;
5124 	GV *mgv;
5125 	REGEXP *rx;
5126 	char digits[16];
5127 
5128 	if (PL_curpm && (rx = PM_GETRE(PL_curpm))) {
5129 	    for (i = 1; i <= rx->nparens; i++) {
5130 		sprintf(digits, "%lu", (long)i);
5131 		if ((mgv = gv_fetchpv(digits, FALSE, SVt_PV)))
5132 		    save_scalar(mgv);
5133 	    }
5134 	}
5135     }
5136 
5137 #ifdef DEBUGGING
5138     SAVEPPTR(PL_reg_starttry);		/* from regexec.c */
5139 #endif
5140 }
5141 
5142 static void
clear_re(pTHX_ void * r)5143 clear_re(pTHX_ void *r)
5144 {
5145     ReREFCNT_dec((regexp *)r);
5146 }
5147 
5148