1 /* regcomp.c
2 */
3
4 /*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12 /* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
17 /* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20 */
21
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 # ifndef PERL_IN_XSUB_RE
25 # define PERL_IN_XSUB_RE
26 # endif
27 /* need access to debugger hooks */
28 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
29 # define DEBUGGING
30 # endif
31 #endif
32
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 # define Perl_pregcomp my_regcomp
36 # define Perl_regdump my_regdump
37 # define Perl_regprop my_regprop
38 # define Perl_pregfree my_regfree
39 # define Perl_re_intuit_string my_re_intuit_string
40 /* *These* symbols are masked to allow static link. */
41 # define Perl_regnext my_regnext
42 # define Perl_save_re_context my_save_re_context
43 # define Perl_reginitcolors my_reginitcolors
44
45 # define PERL_NO_GET_CONTEXT
46 #endif
47
48 /*SUPPRESS 112*/
49 /*
50 * pregcomp and pregexec -- regsub and regerror are not used in perl
51 *
52 * Copyright (c) 1986 by University of Toronto.
53 * Written by Henry Spencer. Not derived from licensed software.
54 *
55 * Permission is granted to anyone to use this software for any
56 * purpose on any computer system, and to redistribute it freely,
57 * subject to the following restrictions:
58 *
59 * 1. The author is not responsible for the consequences of use of
60 * this software, no matter how awful, even if they arise
61 * from defects in it.
62 *
63 * 2. The origin of this software must not be misrepresented, either
64 * by explicit claim or by omission.
65 *
66 * 3. Altered versions must be plainly marked as such, and must not
67 * be misrepresented as being the original software.
68 *
69 *
70 **** Alterations to Henry's code are...
71 ****
72 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
73 **** 2000, 2001, 2002, 2003, by Larry Wall and others
74 ****
75 **** You may distribute under the terms of either the GNU General Public
76 **** License or the Artistic License, as specified in the README file.
77
78 *
79 * Beware that some of this code is subtly aware of the way operator
80 * precedence is structured in regular expressions. Serious changes in
81 * regular-expression syntax might require a total rethink.
82 */
83 #include "EXTERN.h"
84 #define PERL_IN_REGCOMP_C
85 #include "perl.h"
86
87 #ifndef PERL_IN_XSUB_RE
88 # include "INTERN.h"
89 #endif
90
91 #define REG_COMP_C
92 #include "regcomp.h"
93
94 #ifdef op
95 #undef op
96 #endif /* op */
97
98 #ifdef MSDOS
99 # if defined(BUGGY_MSC6)
100 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
101 # pragma optimize("a",off)
102 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
103 # pragma optimize("w",on )
104 # endif /* BUGGY_MSC6 */
105 #endif /* MSDOS */
106
107 #ifndef STATIC
108 #define STATIC static
109 #endif
110
111 typedef struct RExC_state_t {
112 U32 flags; /* are we folding, multilining? */
113 char *precomp; /* uncompiled string. */
114 regexp *rx;
115 char *start; /* Start of input for compile */
116 char *end; /* End of input for compile */
117 char *parse; /* Input-scan pointer. */
118 I32 whilem_seen; /* number of WHILEM in this expr */
119 regnode *emit_start; /* Start of emitted-code area */
120 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
121 I32 naughty; /* How bad is this pattern? */
122 I32 sawback; /* Did we see \1, ...? */
123 U32 seen;
124 I32 size; /* Code size. */
125 I32 npar; /* () count. */
126 I32 extralen;
127 I32 seen_zerolen;
128 I32 seen_evals;
129 I32 utf8; /* whether the pattern is utf8 or not */
130 I32 orig_utf8; /* whether the pattern was originally in utf8 */
131 /* XXX use this for future optimisation of case
132 * where pattern must be upgraded to utf8. */
133 #if ADD_TO_REGEXEC
134 char *starttry; /* -Dr: where regtry was called. */
135 #define RExC_starttry (pRExC_state->starttry)
136 #endif
137 } RExC_state_t;
138
139 #define RExC_flags (pRExC_state->flags)
140 #define RExC_precomp (pRExC_state->precomp)
141 #define RExC_rx (pRExC_state->rx)
142 #define RExC_start (pRExC_state->start)
143 #define RExC_end (pRExC_state->end)
144 #define RExC_parse (pRExC_state->parse)
145 #define RExC_whilem_seen (pRExC_state->whilem_seen)
146 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
147 #define RExC_emit (pRExC_state->emit)
148 #define RExC_emit_start (pRExC_state->emit_start)
149 #define RExC_naughty (pRExC_state->naughty)
150 #define RExC_sawback (pRExC_state->sawback)
151 #define RExC_seen (pRExC_state->seen)
152 #define RExC_size (pRExC_state->size)
153 #define RExC_npar (pRExC_state->npar)
154 #define RExC_extralen (pRExC_state->extralen)
155 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
156 #define RExC_seen_evals (pRExC_state->seen_evals)
157 #define RExC_utf8 (pRExC_state->utf8)
158 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
159
160 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
161 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
162 ((*s) == '{' && regcurly(s)))
163
164 #ifdef SPSTART
165 #undef SPSTART /* dratted cpp namespace... */
166 #endif
167 /*
168 * Flags to be passed up and down.
169 */
170 #define WORST 0 /* Worst case. */
171 #define HASWIDTH 0x1 /* Known to match non-null strings. */
172 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
173 #define SPSTART 0x4 /* Starts with * or +. */
174 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
175
176 /* Length of a variant. */
177
178 typedef struct scan_data_t {
179 I32 len_min;
180 I32 len_delta;
181 I32 pos_min;
182 I32 pos_delta;
183 SV *last_found;
184 I32 last_end; /* min value, <0 unless valid. */
185 I32 last_start_min;
186 I32 last_start_max;
187 SV **longest; /* Either &l_fixed, or &l_float. */
188 SV *longest_fixed;
189 I32 offset_fixed;
190 SV *longest_float;
191 I32 offset_float_min;
192 I32 offset_float_max;
193 I32 flags;
194 I32 whilem_c;
195 I32 *last_closep;
196 struct regnode_charclass_class *start_class;
197 } scan_data_t;
198
199 /*
200 * Forward declarations for pregcomp()'s friends.
201 */
202
203 static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
204 0, 0, 0, 0, 0, 0};
205
206 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
207 #define SF_BEFORE_SEOL 0x1
208 #define SF_BEFORE_MEOL 0x2
209 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
210 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
211
212 #ifdef NO_UNARY_PLUS
213 # define SF_FIX_SHIFT_EOL (0+2)
214 # define SF_FL_SHIFT_EOL (0+4)
215 #else
216 # define SF_FIX_SHIFT_EOL (+2)
217 # define SF_FL_SHIFT_EOL (+4)
218 #endif
219
220 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
221 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
222
223 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
224 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
225 #define SF_IS_INF 0x40
226 #define SF_HAS_PAR 0x80
227 #define SF_IN_PAR 0x100
228 #define SF_HAS_EVAL 0x200
229 #define SCF_DO_SUBSTR 0x400
230 #define SCF_DO_STCLASS_AND 0x0800
231 #define SCF_DO_STCLASS_OR 0x1000
232 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
233 #define SCF_WHILEM_VISITED_POS 0x2000
234
235 #define UTF (RExC_utf8 != 0)
236 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
237 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
238
239 #define OOB_UNICODE 12345678
240 #define OOB_NAMEDCLASS -1
241
242 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
243 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
244
245
246 /* length of regex to show in messages that don't mark a position within */
247 #define RegexLengthToShowInErrorMessages 127
248
249 /*
250 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
251 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
252 * op/pragma/warn/regcomp.
253 */
254 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
255 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
256
257 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
258
259 /*
260 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
261 * arg. Show regex, up to a maximum length. If it's too long, chop and add
262 * "...".
263 */
264 #define FAIL(msg) STMT_START { \
265 char *ellipses = ""; \
266 IV len = RExC_end - RExC_precomp; \
267 \
268 if (!SIZE_ONLY) \
269 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
270 if (len > RegexLengthToShowInErrorMessages) { \
271 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
272 len = RegexLengthToShowInErrorMessages - 10; \
273 ellipses = "..."; \
274 } \
275 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
276 msg, (int)len, RExC_precomp, ellipses); \
277 } STMT_END
278
279 /*
280 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
281 * args. Show regex, up to a maximum length. If it's too long, chop and add
282 * "...".
283 */
284 #define FAIL2(pat,msg) STMT_START { \
285 char *ellipses = ""; \
286 IV len = RExC_end - RExC_precomp; \
287 \
288 if (!SIZE_ONLY) \
289 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
290 if (len > RegexLengthToShowInErrorMessages) { \
291 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
292 len = RegexLengthToShowInErrorMessages - 10; \
293 ellipses = "..."; \
294 } \
295 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
296 msg, (int)len, RExC_precomp, ellipses); \
297 } STMT_END
298
299
300 /*
301 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
302 */
303 #define Simple_vFAIL(m) STMT_START { \
304 IV offset = RExC_parse - RExC_precomp; \
305 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
306 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
307 } STMT_END
308
309 /*
310 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
311 */
312 #define vFAIL(m) STMT_START { \
313 if (!SIZE_ONLY) \
314 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
315 Simple_vFAIL(m); \
316 } STMT_END
317
318 /*
319 * Like Simple_vFAIL(), but accepts two arguments.
320 */
321 #define Simple_vFAIL2(m,a1) STMT_START { \
322 IV offset = RExC_parse - RExC_precomp; \
323 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
324 (int)offset, RExC_precomp, RExC_precomp + offset); \
325 } STMT_END
326
327 /*
328 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
329 */
330 #define vFAIL2(m,a1) STMT_START { \
331 if (!SIZE_ONLY) \
332 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
333 Simple_vFAIL2(m, a1); \
334 } STMT_END
335
336
337 /*
338 * Like Simple_vFAIL(), but accepts three arguments.
339 */
340 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
341 IV offset = RExC_parse - RExC_precomp; \
342 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
343 (int)offset, RExC_precomp, RExC_precomp + offset); \
344 } STMT_END
345
346 /*
347 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
348 */
349 #define vFAIL3(m,a1,a2) STMT_START { \
350 if (!SIZE_ONLY) \
351 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
352 Simple_vFAIL3(m, a1, a2); \
353 } STMT_END
354
355 /*
356 * Like Simple_vFAIL(), but accepts four arguments.
357 */
358 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
359 IV offset = RExC_parse - RExC_precomp; \
360 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
361 (int)offset, RExC_precomp, RExC_precomp + offset); \
362 } STMT_END
363
364 /*
365 * Like Simple_vFAIL(), but accepts five arguments.
366 */
367 #define Simple_vFAIL5(m, a1, a2, a3, a4) STMT_START { \
368 IV offset = RExC_parse - RExC_precomp; \
369 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4, \
370 (int)offset, RExC_precomp, RExC_precomp + offset); \
371 } STMT_END
372
373
374 #define vWARN(loc,m) STMT_START { \
375 IV offset = loc - RExC_precomp; \
376 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
377 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
378 } STMT_END
379
380 #define vWARNdep(loc,m) STMT_START { \
381 IV offset = loc - RExC_precomp; \
382 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
383 "%s" REPORT_LOCATION, \
384 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
385 } STMT_END
386
387
388 #define vWARN2(loc, m, a1) STMT_START { \
389 IV offset = loc - RExC_precomp; \
390 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
391 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
392 } STMT_END
393
394 #define vWARN3(loc, m, a1, a2) STMT_START { \
395 IV offset = loc - RExC_precomp; \
396 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
397 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
398 } STMT_END
399
400 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
401 IV offset = loc - RExC_precomp; \
402 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
403 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
404 } STMT_END
405
406 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
407 IV offset = loc - RExC_precomp; \
408 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
409 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
410 } STMT_END
411
412
413 /* Allow for side effects in s */
414 #define REGC(c,s) STMT_START { \
415 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
416 } STMT_END
417
418 /* Macros for recording node offsets. 20001227 mjd@plover.com
419 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
420 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
421 * Element 0 holds the number n.
422 */
423
424 #define MJD_OFFSET_DEBUG(x)
425 /* #define MJD_OFFSET_DEBUG(x) Perl_warn_nocontext x */
426
427
428 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
429 if (! SIZE_ONLY) { \
430 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
431 __LINE__, (node), (byte))); \
432 if((node) < 0) { \
433 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
434 } else { \
435 RExC_offsets[2*(node)-1] = (byte); \
436 } \
437 } \
438 } STMT_END
439
440 #define Set_Node_Offset(node,byte) \
441 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
442 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
443
444 #define Set_Node_Length_To_R(node,len) STMT_START { \
445 if (! SIZE_ONLY) { \
446 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
447 __LINE__, (node), (len))); \
448 if((node) < 0) { \
449 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
450 } else { \
451 RExC_offsets[2*(node)] = (len); \
452 } \
453 } \
454 } STMT_END
455
456 #define Set_Node_Length(node,len) \
457 Set_Node_Length_To_R((node)-RExC_emit_start, len)
458 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
459 #define Set_Node_Cur_Length(node) \
460 Set_Node_Length(node, RExC_parse - parse_start)
461
462 /* Get offsets and lengths */
463 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
464 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
465
466 static void clear_re(pTHX_ void *r);
467
468 /* Mark that we cannot extend a found fixed substring at this point.
469 Updata the longest found anchored substring and the longest found
470 floating substrings if needed. */
471
472 STATIC void
S_scan_commit(pTHX_ RExC_state_t * pRExC_state,scan_data_t * data)473 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
474 {
475 STRLEN l = CHR_SVLEN(data->last_found);
476 STRLEN old_l = CHR_SVLEN(*data->longest);
477
478 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
479 SvSetMagicSV(*data->longest, data->last_found);
480 if (*data->longest == data->longest_fixed) {
481 data->offset_fixed = l ? data->last_start_min : data->pos_min;
482 if (data->flags & SF_BEFORE_EOL)
483 data->flags
484 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
485 else
486 data->flags &= ~SF_FIX_BEFORE_EOL;
487 }
488 else {
489 data->offset_float_min = l ? data->last_start_min : data->pos_min;
490 data->offset_float_max = (l
491 ? data->last_start_max
492 : data->pos_min + data->pos_delta);
493 if ((U32)data->offset_float_max > (U32)I32_MAX)
494 data->offset_float_max = I32_MAX;
495 if (data->flags & SF_BEFORE_EOL)
496 data->flags
497 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
498 else
499 data->flags &= ~SF_FL_BEFORE_EOL;
500 }
501 }
502 SvCUR_set(data->last_found, 0);
503 {
504 SV * sv = data->last_found;
505 MAGIC *mg =
506 SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
507 if (mg && mg->mg_len > 0)
508 mg->mg_len = 0;
509 }
510 data->last_end = -1;
511 data->flags &= ~SF_BEFORE_EOL;
512 }
513
514 /* Can match anything (initialization) */
515 STATIC void
S_cl_anything(pTHX_ RExC_state_t * pRExC_state,struct regnode_charclass_class * cl)516 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
517 {
518 ANYOF_CLASS_ZERO(cl);
519 ANYOF_BITMAP_SETALL(cl);
520 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
521 if (LOC)
522 cl->flags |= ANYOF_LOCALE;
523 }
524
525 /* Can match anything (initialization) */
526 STATIC int
S_cl_is_anything(pTHX_ struct regnode_charclass_class * cl)527 S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
528 {
529 int value;
530
531 for (value = 0; value <= ANYOF_MAX; value += 2)
532 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
533 return 1;
534 if (!(cl->flags & ANYOF_UNICODE_ALL))
535 return 0;
536 if (!ANYOF_BITMAP_TESTALLSET(cl))
537 return 0;
538 return 1;
539 }
540
541 /* Can match anything (initialization) */
542 STATIC void
S_cl_init(pTHX_ RExC_state_t * pRExC_state,struct regnode_charclass_class * cl)543 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
544 {
545 Zero(cl, 1, struct regnode_charclass_class);
546 cl->type = ANYOF;
547 cl_anything(pRExC_state, cl);
548 }
549
550 STATIC void
S_cl_init_zero(pTHX_ RExC_state_t * pRExC_state,struct regnode_charclass_class * cl)551 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
552 {
553 Zero(cl, 1, struct regnode_charclass_class);
554 cl->type = ANYOF;
555 cl_anything(pRExC_state, cl);
556 if (LOC)
557 cl->flags |= ANYOF_LOCALE;
558 }
559
560 /* 'And' a given class with another one. Can create false positives */
561 /* We assume that cl is not inverted */
562 STATIC void
S_cl_and(pTHX_ struct regnode_charclass_class * cl,struct regnode_charclass_class * and_with)563 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
564 struct regnode_charclass_class *and_with)
565 {
566 if (!(and_with->flags & ANYOF_CLASS)
567 && !(cl->flags & ANYOF_CLASS)
568 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
569 && !(and_with->flags & ANYOF_FOLD)
570 && !(cl->flags & ANYOF_FOLD)) {
571 int i;
572
573 if (and_with->flags & ANYOF_INVERT)
574 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
575 cl->bitmap[i] &= ~and_with->bitmap[i];
576 else
577 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
578 cl->bitmap[i] &= and_with->bitmap[i];
579 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
580 if (!(and_with->flags & ANYOF_EOS))
581 cl->flags &= ~ANYOF_EOS;
582
583 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
584 !(and_with->flags & ANYOF_INVERT)) {
585 cl->flags &= ~ANYOF_UNICODE_ALL;
586 cl->flags |= ANYOF_UNICODE;
587 ARG_SET(cl, ARG(and_with));
588 }
589 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
590 !(and_with->flags & ANYOF_INVERT))
591 cl->flags &= ~ANYOF_UNICODE_ALL;
592 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
593 !(and_with->flags & ANYOF_INVERT))
594 cl->flags &= ~ANYOF_UNICODE;
595 }
596
597 /* 'OR' a given class with another one. Can create false positives */
598 /* We assume that cl is not inverted */
599 STATIC void
S_cl_or(pTHX_ RExC_state_t * pRExC_state,struct regnode_charclass_class * cl,struct regnode_charclass_class * or_with)600 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
601 {
602 if (or_with->flags & ANYOF_INVERT) {
603 /* We do not use
604 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
605 * <= (B1 | !B2) | (CL1 | !CL2)
606 * which is wasteful if CL2 is small, but we ignore CL2:
607 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
608 * XXXX Can we handle case-fold? Unclear:
609 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
610 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
611 */
612 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
613 && !(or_with->flags & ANYOF_FOLD)
614 && !(cl->flags & ANYOF_FOLD) ) {
615 int i;
616
617 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
618 cl->bitmap[i] |= ~or_with->bitmap[i];
619 } /* XXXX: logic is complicated otherwise */
620 else {
621 cl_anything(pRExC_state, cl);
622 }
623 } else {
624 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
625 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
626 && (!(or_with->flags & ANYOF_FOLD)
627 || (cl->flags & ANYOF_FOLD)) ) {
628 int i;
629
630 /* OR char bitmap and class bitmap separately */
631 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
632 cl->bitmap[i] |= or_with->bitmap[i];
633 if (or_with->flags & ANYOF_CLASS) {
634 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
635 cl->classflags[i] |= or_with->classflags[i];
636 cl->flags |= ANYOF_CLASS;
637 }
638 }
639 else { /* XXXX: logic is complicated, leave it along for a moment. */
640 cl_anything(pRExC_state, cl);
641 }
642 }
643 if (or_with->flags & ANYOF_EOS)
644 cl->flags |= ANYOF_EOS;
645
646 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
647 ARG(cl) != ARG(or_with)) {
648 cl->flags |= ANYOF_UNICODE_ALL;
649 cl->flags &= ~ANYOF_UNICODE;
650 }
651 if (or_with->flags & ANYOF_UNICODE_ALL) {
652 cl->flags |= ANYOF_UNICODE_ALL;
653 cl->flags &= ~ANYOF_UNICODE;
654 }
655 }
656
657 /*
658 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
659 * These need to be revisited when a newer toolchain becomes available.
660 */
661 #if defined(__sparc64__) && defined(__GNUC__)
662 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
663 # undef SPARC64_GCC_WORKAROUND
664 # define SPARC64_GCC_WORKAROUND 1
665 # endif
666 #endif
667
668 /* REx optimizer. Converts nodes into quickier variants "in place".
669 Finds fixed substrings. */
670
671 /* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
672 to the position after last scanned or to NULL. */
673
674 STATIC I32
S_study_chunk(pTHX_ RExC_state_t * pRExC_state,regnode ** scanp,I32 * deltap,regnode * last,scan_data_t * data,U32 flags)675 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
676 /* scanp: Start here (read-write). */
677 /* deltap: Write maxlen-minlen here. */
678 /* last: Stop before this one. */
679 {
680 I32 min = 0, pars = 0, code;
681 regnode *scan = *scanp, *next;
682 I32 delta = 0;
683 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
684 int is_inf_internal = 0; /* The studied chunk is infinite */
685 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
686 scan_data_t data_fake;
687 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
688
689 while (scan && OP(scan) != END && scan < last) {
690 /* Peephole optimizer: */
691
692 if (PL_regkind[(U8)OP(scan)] == EXACT) {
693 /* Merge several consecutive EXACTish nodes into one. */
694 regnode *n = regnext(scan);
695 U32 stringok = 1;
696 #ifdef DEBUGGING
697 regnode *stop = scan;
698 #endif
699
700 next = scan + NODE_SZ_STR(scan);
701 /* Skip NOTHING, merge EXACT*. */
702 while (n &&
703 ( PL_regkind[(U8)OP(n)] == NOTHING ||
704 (stringok && (OP(n) == OP(scan))))
705 && NEXT_OFF(n)
706 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
707 if (OP(n) == TAIL || n > next)
708 stringok = 0;
709 if (PL_regkind[(U8)OP(n)] == NOTHING) {
710 NEXT_OFF(scan) += NEXT_OFF(n);
711 next = n + NODE_STEP_REGNODE;
712 #ifdef DEBUGGING
713 if (stringok)
714 stop = n;
715 #endif
716 n = regnext(n);
717 }
718 else if (stringok) {
719 int oldl = STR_LEN(scan);
720 regnode *nnext = regnext(n);
721
722 if (oldl + STR_LEN(n) > U8_MAX)
723 break;
724 NEXT_OFF(scan) += NEXT_OFF(n);
725 STR_LEN(scan) += STR_LEN(n);
726 next = n + NODE_SZ_STR(n);
727 /* Now we can overwrite *n : */
728 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
729 #ifdef DEBUGGING
730 stop = next - 1;
731 #endif
732 n = nnext;
733 }
734 }
735
736 if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
737 /*
738 Two problematic code points in Unicode casefolding of EXACT nodes:
739
740 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
741 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
742
743 which casefold to
744
745 Unicode UTF-8
746
747 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
748 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
749
750 This means that in case-insensitive matching (or "loose matching",
751 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
752 length of the above casefolded versions) can match a target string
753 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
754 This would rather mess up the minimum length computation.
755
756 What we'll do is to look for the tail four bytes, and then peek
757 at the preceding two bytes to see whether we need to decrease
758 the minimum length by four (six minus two).
759
760 Thanks to the design of UTF-8, there cannot be false matches:
761 A sequence of valid UTF-8 bytes cannot be a subsequence of
762 another valid sequence of UTF-8 bytes.
763
764 */
765 char *s0 = STRING(scan), *s, *t;
766 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
767 char *t0 = "\xcc\x88\xcc\x81";
768 char *t1 = t0 + 3;
769
770 for (s = s0 + 2;
771 s < s2 && (t = ninstr(s, s1, t0, t1));
772 s = t + 4) {
773 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
774 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
775 min -= 4;
776 }
777 }
778
779 #ifdef DEBUGGING
780 /* Allow dumping */
781 n = scan + NODE_SZ_STR(scan);
782 while (n <= stop) {
783 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
784 OP(n) = OPTIMIZED;
785 NEXT_OFF(n) = 0;
786 }
787 n++;
788 }
789 #endif
790 }
791 /* Follow the next-chain of the current node and optimize
792 away all the NOTHINGs from it. */
793 if (OP(scan) != CURLYX) {
794 int max = (reg_off_by_arg[OP(scan)]
795 ? I32_MAX
796 /* I32 may be smaller than U16 on CRAYs! */
797 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
798 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
799 int noff;
800 regnode *n = scan;
801
802 /* Skip NOTHING and LONGJMP. */
803 while ((n = regnext(n))
804 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
805 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
806 && off + noff < max)
807 off += noff;
808 if (reg_off_by_arg[OP(scan)])
809 ARG(scan) = off;
810 else
811 NEXT_OFF(scan) = off;
812 }
813 /* The principal pseudo-switch. Cannot be a switch, since we
814 look into several different things. */
815 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
816 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
817 next = regnext(scan);
818 code = OP(scan);
819
820 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
821 I32 max1 = 0, min1 = I32_MAX, num = 0;
822 struct regnode_charclass_class accum;
823
824 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
825 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
826 if (flags & SCF_DO_STCLASS)
827 cl_init_zero(pRExC_state, &accum);
828 while (OP(scan) == code) {
829 I32 deltanext, minnext, f = 0, fake;
830 struct regnode_charclass_class this_class;
831
832 num++;
833 data_fake.flags = 0;
834 if (data) {
835 data_fake.whilem_c = data->whilem_c;
836 data_fake.last_closep = data->last_closep;
837 }
838 else
839 data_fake.last_closep = &fake;
840 next = regnext(scan);
841 scan = NEXTOPER(scan);
842 if (code != BRANCH)
843 scan = NEXTOPER(scan);
844 if (flags & SCF_DO_STCLASS) {
845 cl_init(pRExC_state, &this_class);
846 data_fake.start_class = &this_class;
847 f = SCF_DO_STCLASS_AND;
848 }
849 if (flags & SCF_WHILEM_VISITED_POS)
850 f |= SCF_WHILEM_VISITED_POS;
851 /* we suppose the run is continuous, last=next...*/
852 minnext = study_chunk(pRExC_state, &scan, &deltanext,
853 next, &data_fake, f);
854 if (min1 > minnext)
855 min1 = minnext;
856 if (max1 < minnext + deltanext)
857 max1 = minnext + deltanext;
858 if (deltanext == I32_MAX)
859 is_inf = is_inf_internal = 1;
860 scan = next;
861 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
862 pars++;
863 if (data && (data_fake.flags & SF_HAS_EVAL))
864 data->flags |= SF_HAS_EVAL;
865 if (data)
866 data->whilem_c = data_fake.whilem_c;
867 if (flags & SCF_DO_STCLASS)
868 cl_or(pRExC_state, &accum, &this_class);
869 if (code == SUSPEND)
870 break;
871 }
872 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
873 min1 = 0;
874 if (flags & SCF_DO_SUBSTR) {
875 data->pos_min += min1;
876 data->pos_delta += max1 - min1;
877 if (max1 != min1 || is_inf)
878 data->longest = &(data->longest_float);
879 }
880 min += min1;
881 delta += max1 - min1;
882 if (flags & SCF_DO_STCLASS_OR) {
883 cl_or(pRExC_state, data->start_class, &accum);
884 if (min1) {
885 cl_and(data->start_class, &and_with);
886 flags &= ~SCF_DO_STCLASS;
887 }
888 }
889 else if (flags & SCF_DO_STCLASS_AND) {
890 if (min1) {
891 cl_and(data->start_class, &accum);
892 flags &= ~SCF_DO_STCLASS;
893 }
894 else {
895 /* Switch to OR mode: cache the old value of
896 * data->start_class */
897 StructCopy(data->start_class, &and_with,
898 struct regnode_charclass_class);
899 flags &= ~SCF_DO_STCLASS_AND;
900 StructCopy(&accum, data->start_class,
901 struct regnode_charclass_class);
902 flags |= SCF_DO_STCLASS_OR;
903 data->start_class->flags |= ANYOF_EOS;
904 }
905 }
906 }
907 else if (code == BRANCHJ) /* single branch is optimized. */
908 scan = NEXTOPER(NEXTOPER(scan));
909 else /* single branch is optimized. */
910 scan = NEXTOPER(scan);
911 continue;
912 }
913 else if (OP(scan) == EXACT) {
914 I32 l = STR_LEN(scan);
915 UV uc = *((U8*)STRING(scan));
916 if (UTF) {
917 U8 *s = (U8*)STRING(scan);
918 l = utf8_length(s, s + l);
919 uc = utf8_to_uvchr(s, NULL);
920 }
921 min += l;
922 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
923 /* The code below prefers earlier match for fixed
924 offset, later match for variable offset. */
925 if (data->last_end == -1) { /* Update the start info. */
926 data->last_start_min = data->pos_min;
927 data->last_start_max = is_inf
928 ? I32_MAX : data->pos_min + data->pos_delta;
929 }
930 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
931 {
932 SV * sv = data->last_found;
933 MAGIC *mg = SvUTF8(sv) && SvMAGICAL(sv) ?
934 mg_find(sv, PERL_MAGIC_utf8) : NULL;
935 if (mg && mg->mg_len >= 0)
936 mg->mg_len += utf8_length((U8*)STRING(scan),
937 (U8*)STRING(scan)+STR_LEN(scan));
938 }
939 if (UTF)
940 SvUTF8_on(data->last_found);
941 data->last_end = data->pos_min + l;
942 data->pos_min += l; /* As in the first entry. */
943 data->flags &= ~SF_BEFORE_EOL;
944 }
945 if (flags & SCF_DO_STCLASS_AND) {
946 /* Check whether it is compatible with what we know already! */
947 int compat = 1;
948
949 if (uc >= 0x100 ||
950 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
951 && !ANYOF_BITMAP_TEST(data->start_class, uc)
952 && (!(data->start_class->flags & ANYOF_FOLD)
953 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
954 )
955 compat = 0;
956 ANYOF_CLASS_ZERO(data->start_class);
957 ANYOF_BITMAP_ZERO(data->start_class);
958 if (compat)
959 ANYOF_BITMAP_SET(data->start_class, uc);
960 data->start_class->flags &= ~ANYOF_EOS;
961 if (uc < 0x100)
962 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
963 }
964 else if (flags & SCF_DO_STCLASS_OR) {
965 /* false positive possible if the class is case-folded */
966 if (uc < 0x100)
967 ANYOF_BITMAP_SET(data->start_class, uc);
968 else
969 data->start_class->flags |= ANYOF_UNICODE_ALL;
970 data->start_class->flags &= ~ANYOF_EOS;
971 cl_and(data->start_class, &and_with);
972 }
973 flags &= ~SCF_DO_STCLASS;
974 }
975 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
976 I32 l = STR_LEN(scan);
977 UV uc = *((U8*)STRING(scan));
978
979 /* Search for fixed substrings supports EXACT only. */
980 if (flags & SCF_DO_SUBSTR)
981 scan_commit(pRExC_state, data);
982 if (UTF) {
983 U8 *s = (U8 *)STRING(scan);
984 l = utf8_length(s, s + l);
985 uc = utf8_to_uvchr(s, NULL);
986 }
987 min += l;
988 if (data && (flags & SCF_DO_SUBSTR))
989 data->pos_min += l;
990 if (flags & SCF_DO_STCLASS_AND) {
991 /* Check whether it is compatible with what we know already! */
992 int compat = 1;
993
994 if (uc >= 0x100 ||
995 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
996 && !ANYOF_BITMAP_TEST(data->start_class, uc)
997 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
998 compat = 0;
999 ANYOF_CLASS_ZERO(data->start_class);
1000 ANYOF_BITMAP_ZERO(data->start_class);
1001 if (compat) {
1002 ANYOF_BITMAP_SET(data->start_class, uc);
1003 data->start_class->flags &= ~ANYOF_EOS;
1004 data->start_class->flags |= ANYOF_FOLD;
1005 if (OP(scan) == EXACTFL)
1006 data->start_class->flags |= ANYOF_LOCALE;
1007 }
1008 }
1009 else if (flags & SCF_DO_STCLASS_OR) {
1010 if (data->start_class->flags & ANYOF_FOLD) {
1011 /* false positive possible if the class is case-folded.
1012 Assume that the locale settings are the same... */
1013 if (uc < 0x100)
1014 ANYOF_BITMAP_SET(data->start_class, uc);
1015 data->start_class->flags &= ~ANYOF_EOS;
1016 }
1017 cl_and(data->start_class, &and_with);
1018 }
1019 flags &= ~SCF_DO_STCLASS;
1020 }
1021 else if (strchr((char*)PL_varies,OP(scan))) {
1022 I32 mincount, maxcount, minnext, deltanext, fl = 0;
1023 I32 f = flags, pos_before = 0;
1024 regnode *oscan = scan;
1025 struct regnode_charclass_class this_class;
1026 struct regnode_charclass_class *oclass = NULL;
1027 I32 next_is_eval = 0;
1028
1029 switch (PL_regkind[(U8)OP(scan)]) {
1030 case WHILEM: /* End of (?:...)* . */
1031 scan = NEXTOPER(scan);
1032 goto finish;
1033 case PLUS:
1034 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
1035 next = NEXTOPER(scan);
1036 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
1037 mincount = 1;
1038 maxcount = REG_INFTY;
1039 next = regnext(scan);
1040 scan = NEXTOPER(scan);
1041 goto do_curly;
1042 }
1043 }
1044 if (flags & SCF_DO_SUBSTR)
1045 data->pos_min++;
1046 min++;
1047 /* Fall through. */
1048 case STAR:
1049 if (flags & SCF_DO_STCLASS) {
1050 mincount = 0;
1051 maxcount = REG_INFTY;
1052 next = regnext(scan);
1053 scan = NEXTOPER(scan);
1054 goto do_curly;
1055 }
1056 is_inf = is_inf_internal = 1;
1057 scan = regnext(scan);
1058 if (flags & SCF_DO_SUBSTR) {
1059 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
1060 data->longest = &(data->longest_float);
1061 }
1062 goto optimize_curly_tail;
1063 case CURLY:
1064 mincount = ARG1(scan);
1065 maxcount = ARG2(scan);
1066 next = regnext(scan);
1067 if (OP(scan) == CURLYX) {
1068 I32 lp = (data ? *(data->last_closep) : 0);
1069
1070 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1071 }
1072 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1073 next_is_eval = (OP(scan) == EVAL);
1074 do_curly:
1075 if (flags & SCF_DO_SUBSTR) {
1076 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
1077 pos_before = data->pos_min;
1078 }
1079 if (data) {
1080 fl = data->flags;
1081 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1082 if (is_inf)
1083 data->flags |= SF_IS_INF;
1084 }
1085 if (flags & SCF_DO_STCLASS) {
1086 cl_init(pRExC_state, &this_class);
1087 oclass = data->start_class;
1088 data->start_class = &this_class;
1089 f |= SCF_DO_STCLASS_AND;
1090 f &= ~SCF_DO_STCLASS_OR;
1091 }
1092 /* These are the cases when once a subexpression
1093 fails at a particular position, it cannot succeed
1094 even after backtracking at the enclosing scope.
1095
1096 XXXX what if minimal match and we are at the
1097 initial run of {n,m}? */
1098 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1099 f &= ~SCF_WHILEM_VISITED_POS;
1100
1101 /* This will finish on WHILEM, setting scan, or on NULL: */
1102 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1103 mincount == 0
1104 ? (f & ~SCF_DO_SUBSTR) : f);
1105
1106 if (flags & SCF_DO_STCLASS)
1107 data->start_class = oclass;
1108 if (mincount == 0 || minnext == 0) {
1109 if (flags & SCF_DO_STCLASS_OR) {
1110 cl_or(pRExC_state, data->start_class, &this_class);
1111 }
1112 else if (flags & SCF_DO_STCLASS_AND) {
1113 /* Switch to OR mode: cache the old value of
1114 * data->start_class */
1115 StructCopy(data->start_class, &and_with,
1116 struct regnode_charclass_class);
1117 flags &= ~SCF_DO_STCLASS_AND;
1118 StructCopy(&this_class, data->start_class,
1119 struct regnode_charclass_class);
1120 flags |= SCF_DO_STCLASS_OR;
1121 data->start_class->flags |= ANYOF_EOS;
1122 }
1123 } else { /* Non-zero len */
1124 if (flags & SCF_DO_STCLASS_OR) {
1125 cl_or(pRExC_state, data->start_class, &this_class);
1126 cl_and(data->start_class, &and_with);
1127 }
1128 else if (flags & SCF_DO_STCLASS_AND)
1129 cl_and(data->start_class, &this_class);
1130 flags &= ~SCF_DO_STCLASS;
1131 }
1132 if (!scan) /* It was not CURLYX, but CURLY. */
1133 scan = next;
1134 if (ckWARN(WARN_REGEXP)
1135 /* ? quantifier ok, except for (?{ ... }) */
1136 && (next_is_eval || !(mincount == 0 && maxcount == 1))
1137 && (minnext == 0) && (deltanext == 0)
1138 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
1139 && maxcount <= REG_INFTY/3) /* Complement check for big count */
1140 {
1141 vWARN(RExC_parse,
1142 "Quantifier unexpected on zero-length expression");
1143 }
1144
1145 min += minnext * mincount;
1146 is_inf_internal |= ((maxcount == REG_INFTY
1147 && (minnext + deltanext) > 0)
1148 || deltanext == I32_MAX);
1149 is_inf |= is_inf_internal;
1150 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1151
1152 /* Try powerful optimization CURLYX => CURLYN. */
1153 if ( OP(oscan) == CURLYX && data
1154 && data->flags & SF_IN_PAR
1155 && !(data->flags & SF_HAS_EVAL)
1156 && !deltanext && minnext == 1 ) {
1157 /* Try to optimize to CURLYN. */
1158 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1159 regnode *nxt1 = nxt;
1160 #ifdef DEBUGGING
1161 regnode *nxt2;
1162 #endif
1163
1164 /* Skip open. */
1165 nxt = regnext(nxt);
1166 if (!strchr((char*)PL_simple,OP(nxt))
1167 && !(PL_regkind[(U8)OP(nxt)] == EXACT
1168 && STR_LEN(nxt) == 1))
1169 goto nogo;
1170 #ifdef DEBUGGING
1171 nxt2 = nxt;
1172 #endif
1173 nxt = regnext(nxt);
1174 if (OP(nxt) != CLOSE)
1175 goto nogo;
1176 /* Now we know that nxt2 is the only contents: */
1177 oscan->flags = (U8)ARG(nxt);
1178 OP(oscan) = CURLYN;
1179 OP(nxt1) = NOTHING; /* was OPEN. */
1180 #ifdef DEBUGGING
1181 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1182 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1183 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1184 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1185 OP(nxt + 1) = OPTIMIZED; /* was count. */
1186 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
1187 #endif
1188 }
1189 nogo:
1190
1191 /* Try optimization CURLYX => CURLYM. */
1192 if ( OP(oscan) == CURLYX && data
1193 && !(data->flags & SF_HAS_PAR)
1194 && !(data->flags & SF_HAS_EVAL)
1195 && !deltanext ) {
1196 /* XXXX How to optimize if data == 0? */
1197 /* Optimize to a simpler form. */
1198 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1199 regnode *nxt2;
1200
1201 OP(oscan) = CURLYM;
1202 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
1203 && (OP(nxt2) != WHILEM))
1204 nxt = nxt2;
1205 OP(nxt2) = SUCCEED; /* Whas WHILEM */
1206 /* Need to optimize away parenths. */
1207 if (data->flags & SF_IN_PAR) {
1208 /* Set the parenth number. */
1209 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1210
1211 if (OP(nxt) != CLOSE)
1212 FAIL("Panic opt close");
1213 oscan->flags = (U8)ARG(nxt);
1214 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1215 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1216 #ifdef DEBUGGING
1217 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1218 OP(nxt + 1) = OPTIMIZED; /* was count. */
1219 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1220 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
1221 #endif
1222 #if 0
1223 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1224 regnode *nnxt = regnext(nxt1);
1225
1226 if (nnxt == nxt) {
1227 if (reg_off_by_arg[OP(nxt1)])
1228 ARG_SET(nxt1, nxt2 - nxt1);
1229 else if (nxt2 - nxt1 < U16_MAX)
1230 NEXT_OFF(nxt1) = nxt2 - nxt1;
1231 else
1232 OP(nxt) = NOTHING; /* Cannot beautify */
1233 }
1234 nxt1 = nnxt;
1235 }
1236 #endif
1237 /* Optimize again: */
1238 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
1239 NULL, 0);
1240 }
1241 else
1242 oscan->flags = 0;
1243 }
1244 else if ((OP(oscan) == CURLYX)
1245 && (flags & SCF_WHILEM_VISITED_POS)
1246 /* See the comment on a similar expression above.
1247 However, this time it not a subexpression
1248 we care about, but the expression itself. */
1249 && (maxcount == REG_INFTY)
1250 && data && ++data->whilem_c < 16) {
1251 /* This stays as CURLYX, we can put the count/of pair. */
1252 /* Find WHILEM (as in regexec.c) */
1253 regnode *nxt = oscan + NEXT_OFF(oscan);
1254
1255 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1256 nxt += ARG(nxt);
1257 PREVOPER(nxt)->flags = (U8)(data->whilem_c
1258 | (RExC_whilem_seen << 4)); /* On WHILEM */
1259 }
1260 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
1261 pars++;
1262 if (flags & SCF_DO_SUBSTR) {
1263 SV *last_str = Nullsv;
1264 int counted = mincount != 0;
1265
1266 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
1267 #if defined(SPARC64_GCC_WORKAROUND)
1268 I32 b = 0;
1269 STRLEN l = 0;
1270 char *s = NULL;
1271 I32 old = 0;
1272
1273 if (pos_before >= data->last_start_min)
1274 b = pos_before;
1275 else
1276 b = data->last_start_min;
1277
1278 l = 0;
1279 s = SvPV(data->last_found, l);
1280 old = b - data->last_start_min;
1281
1282 #else
1283 I32 b = pos_before >= data->last_start_min
1284 ? pos_before : data->last_start_min;
1285 STRLEN l;
1286 char *s = SvPV(data->last_found, l);
1287 I32 old = b - data->last_start_min;
1288 #endif
1289
1290 if (UTF)
1291 old = utf8_hop((U8*)s, old) - (U8*)s;
1292
1293 l -= old;
1294 /* Get the added string: */
1295 last_str = newSVpvn(s + old, l);
1296 if (UTF)
1297 SvUTF8_on(last_str);
1298 if (deltanext == 0 && pos_before == b) {
1299 /* What was added is a constant string */
1300 if (mincount > 1) {
1301 SvGROW(last_str, (mincount * l) + 1);
1302 repeatcpy(SvPVX(last_str) + l,
1303 SvPVX(last_str), l, mincount - 1);
1304 SvCUR(last_str) *= mincount;
1305 /* Add additional parts. */
1306 SvCUR_set(data->last_found,
1307 SvCUR(data->last_found) - l);
1308 sv_catsv(data->last_found, last_str);
1309 {
1310 SV * sv = data->last_found;
1311 MAGIC *mg =
1312 SvUTF8(sv) && SvMAGICAL(sv) ?
1313 mg_find(sv, PERL_MAGIC_utf8) : NULL;
1314 if (mg && mg->mg_len >= 0)
1315 mg->mg_len += CHR_SVLEN(last_str);
1316 }
1317 data->last_end += l * (mincount - 1);
1318 }
1319 } else {
1320 /* start offset must point into the last copy */
1321 data->last_start_min += minnext * (mincount - 1);
1322 data->last_start_max += is_inf ? I32_MAX
1323 : (maxcount - 1) * (minnext + data->pos_delta);
1324 }
1325 }
1326 /* It is counted once already... */
1327 data->pos_min += minnext * (mincount - counted);
1328 data->pos_delta += - counted * deltanext +
1329 (minnext + deltanext) * maxcount - minnext * mincount;
1330 if (mincount != maxcount) {
1331 /* Cannot extend fixed substrings found inside
1332 the group. */
1333 scan_commit(pRExC_state,data);
1334 if (mincount && last_str) {
1335 sv_setsv(data->last_found, last_str);
1336 data->last_end = data->pos_min;
1337 data->last_start_min =
1338 data->pos_min - CHR_SVLEN(last_str);
1339 data->last_start_max = is_inf
1340 ? I32_MAX
1341 : data->pos_min + data->pos_delta
1342 - CHR_SVLEN(last_str);
1343 }
1344 data->longest = &(data->longest_float);
1345 }
1346 SvREFCNT_dec(last_str);
1347 }
1348 if (data && (fl & SF_HAS_EVAL))
1349 data->flags |= SF_HAS_EVAL;
1350 optimize_curly_tail:
1351 if (OP(oscan) != CURLYX) {
1352 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
1353 && NEXT_OFF(next))
1354 NEXT_OFF(oscan) += NEXT_OFF(next);
1355 }
1356 continue;
1357 default: /* REF and CLUMP only? */
1358 if (flags & SCF_DO_SUBSTR) {
1359 scan_commit(pRExC_state,data); /* Cannot expect anything... */
1360 data->longest = &(data->longest_float);
1361 }
1362 is_inf = is_inf_internal = 1;
1363 if (flags & SCF_DO_STCLASS_OR)
1364 cl_anything(pRExC_state, data->start_class);
1365 flags &= ~SCF_DO_STCLASS;
1366 break;
1367 }
1368 }
1369 else if (strchr((char*)PL_simple,OP(scan))) {
1370 int value = 0;
1371
1372 if (flags & SCF_DO_SUBSTR) {
1373 scan_commit(pRExC_state,data);
1374 data->pos_min++;
1375 }
1376 min++;
1377 if (flags & SCF_DO_STCLASS) {
1378 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1379
1380 /* Some of the logic below assumes that switching
1381 locale on will only add false positives. */
1382 switch (PL_regkind[(U8)OP(scan)]) {
1383 case SANY:
1384 default:
1385 do_default:
1386 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1387 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1388 cl_anything(pRExC_state, data->start_class);
1389 break;
1390 case REG_ANY:
1391 if (OP(scan) == SANY)
1392 goto do_default;
1393 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1394 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1395 || (data->start_class->flags & ANYOF_CLASS));
1396 cl_anything(pRExC_state, data->start_class);
1397 }
1398 if (flags & SCF_DO_STCLASS_AND || !value)
1399 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1400 break;
1401 case ANYOF:
1402 if (flags & SCF_DO_STCLASS_AND)
1403 cl_and(data->start_class,
1404 (struct regnode_charclass_class*)scan);
1405 else
1406 cl_or(pRExC_state, data->start_class,
1407 (struct regnode_charclass_class*)scan);
1408 break;
1409 case ALNUM:
1410 if (flags & SCF_DO_STCLASS_AND) {
1411 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1412 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1413 for (value = 0; value < 256; value++)
1414 if (!isALNUM(value))
1415 ANYOF_BITMAP_CLEAR(data->start_class, value);
1416 }
1417 }
1418 else {
1419 if (data->start_class->flags & ANYOF_LOCALE)
1420 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1421 else {
1422 for (value = 0; value < 256; value++)
1423 if (isALNUM(value))
1424 ANYOF_BITMAP_SET(data->start_class, value);
1425 }
1426 }
1427 break;
1428 case ALNUML:
1429 if (flags & SCF_DO_STCLASS_AND) {
1430 if (data->start_class->flags & ANYOF_LOCALE)
1431 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1432 }
1433 else {
1434 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1435 data->start_class->flags |= ANYOF_LOCALE;
1436 }
1437 break;
1438 case NALNUM:
1439 if (flags & SCF_DO_STCLASS_AND) {
1440 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1441 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1442 for (value = 0; value < 256; value++)
1443 if (isALNUM(value))
1444 ANYOF_BITMAP_CLEAR(data->start_class, value);
1445 }
1446 }
1447 else {
1448 if (data->start_class->flags & ANYOF_LOCALE)
1449 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1450 else {
1451 for (value = 0; value < 256; value++)
1452 if (!isALNUM(value))
1453 ANYOF_BITMAP_SET(data->start_class, value);
1454 }
1455 }
1456 break;
1457 case NALNUML:
1458 if (flags & SCF_DO_STCLASS_AND) {
1459 if (data->start_class->flags & ANYOF_LOCALE)
1460 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1461 }
1462 else {
1463 data->start_class->flags |= ANYOF_LOCALE;
1464 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1465 }
1466 break;
1467 case SPACE:
1468 if (flags & SCF_DO_STCLASS_AND) {
1469 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1470 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1471 for (value = 0; value < 256; value++)
1472 if (!isSPACE(value))
1473 ANYOF_BITMAP_CLEAR(data->start_class, value);
1474 }
1475 }
1476 else {
1477 if (data->start_class->flags & ANYOF_LOCALE)
1478 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1479 else {
1480 for (value = 0; value < 256; value++)
1481 if (isSPACE(value))
1482 ANYOF_BITMAP_SET(data->start_class, value);
1483 }
1484 }
1485 break;
1486 case SPACEL:
1487 if (flags & SCF_DO_STCLASS_AND) {
1488 if (data->start_class->flags & ANYOF_LOCALE)
1489 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1490 }
1491 else {
1492 data->start_class->flags |= ANYOF_LOCALE;
1493 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1494 }
1495 break;
1496 case NSPACE:
1497 if (flags & SCF_DO_STCLASS_AND) {
1498 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1499 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1500 for (value = 0; value < 256; value++)
1501 if (isSPACE(value))
1502 ANYOF_BITMAP_CLEAR(data->start_class, value);
1503 }
1504 }
1505 else {
1506 if (data->start_class->flags & ANYOF_LOCALE)
1507 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1508 else {
1509 for (value = 0; value < 256; value++)
1510 if (!isSPACE(value))
1511 ANYOF_BITMAP_SET(data->start_class, value);
1512 }
1513 }
1514 break;
1515 case NSPACEL:
1516 if (flags & SCF_DO_STCLASS_AND) {
1517 if (data->start_class->flags & ANYOF_LOCALE) {
1518 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1519 for (value = 0; value < 256; value++)
1520 if (!isSPACE(value))
1521 ANYOF_BITMAP_CLEAR(data->start_class, value);
1522 }
1523 }
1524 else {
1525 data->start_class->flags |= ANYOF_LOCALE;
1526 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1527 }
1528 break;
1529 case DIGIT:
1530 if (flags & SCF_DO_STCLASS_AND) {
1531 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1532 for (value = 0; value < 256; value++)
1533 if (!isDIGIT(value))
1534 ANYOF_BITMAP_CLEAR(data->start_class, value);
1535 }
1536 else {
1537 if (data->start_class->flags & ANYOF_LOCALE)
1538 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1539 else {
1540 for (value = 0; value < 256; value++)
1541 if (isDIGIT(value))
1542 ANYOF_BITMAP_SET(data->start_class, value);
1543 }
1544 }
1545 break;
1546 case NDIGIT:
1547 if (flags & SCF_DO_STCLASS_AND) {
1548 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1549 for (value = 0; value < 256; value++)
1550 if (isDIGIT(value))
1551 ANYOF_BITMAP_CLEAR(data->start_class, value);
1552 }
1553 else {
1554 if (data->start_class->flags & ANYOF_LOCALE)
1555 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1556 else {
1557 for (value = 0; value < 256; value++)
1558 if (!isDIGIT(value))
1559 ANYOF_BITMAP_SET(data->start_class, value);
1560 }
1561 }
1562 break;
1563 }
1564 if (flags & SCF_DO_STCLASS_OR)
1565 cl_and(data->start_class, &and_with);
1566 flags &= ~SCF_DO_STCLASS;
1567 }
1568 }
1569 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
1570 data->flags |= (OP(scan) == MEOL
1571 ? SF_BEFORE_MEOL
1572 : SF_BEFORE_SEOL);
1573 }
1574 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1575 /* Lookbehind, or need to calculate parens/evals/stclass: */
1576 && (scan->flags || data || (flags & SCF_DO_STCLASS))
1577 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1578 /* Lookahead/lookbehind */
1579 I32 deltanext, minnext, fake = 0;
1580 regnode *nscan;
1581 struct regnode_charclass_class intrnl;
1582 int f = 0;
1583
1584 data_fake.flags = 0;
1585 if (data) {
1586 data_fake.whilem_c = data->whilem_c;
1587 data_fake.last_closep = data->last_closep;
1588 }
1589 else
1590 data_fake.last_closep = &fake;
1591 if ( flags & SCF_DO_STCLASS && !scan->flags
1592 && OP(scan) == IFMATCH ) { /* Lookahead */
1593 cl_init(pRExC_state, &intrnl);
1594 data_fake.start_class = &intrnl;
1595 f |= SCF_DO_STCLASS_AND;
1596 }
1597 if (flags & SCF_WHILEM_VISITED_POS)
1598 f |= SCF_WHILEM_VISITED_POS;
1599 next = regnext(scan);
1600 nscan = NEXTOPER(NEXTOPER(scan));
1601 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
1602 if (scan->flags) {
1603 if (deltanext) {
1604 vFAIL("Variable length lookbehind not implemented");
1605 }
1606 else if (minnext > U8_MAX) {
1607 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
1608 }
1609 scan->flags = (U8)minnext;
1610 }
1611 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1612 pars++;
1613 if (data && (data_fake.flags & SF_HAS_EVAL))
1614 data->flags |= SF_HAS_EVAL;
1615 if (data)
1616 data->whilem_c = data_fake.whilem_c;
1617 if (f & SCF_DO_STCLASS_AND) {
1618 int was = (data->start_class->flags & ANYOF_EOS);
1619
1620 cl_and(data->start_class, &intrnl);
1621 if (was)
1622 data->start_class->flags |= ANYOF_EOS;
1623 }
1624 }
1625 else if (OP(scan) == OPEN) {
1626 pars++;
1627 }
1628 else if (OP(scan) == CLOSE) {
1629 if ((I32)ARG(scan) == is_par) {
1630 next = regnext(scan);
1631
1632 if ( next && (OP(next) != WHILEM) && next < last)
1633 is_par = 0; /* Disable optimization */
1634 }
1635 if (data)
1636 *(data->last_closep) = ARG(scan);
1637 }
1638 else if (OP(scan) == EVAL) {
1639 if (data)
1640 data->flags |= SF_HAS_EVAL;
1641 }
1642 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
1643 if (flags & SCF_DO_SUBSTR) {
1644 scan_commit(pRExC_state,data);
1645 data->longest = &(data->longest_float);
1646 }
1647 is_inf = is_inf_internal = 1;
1648 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1649 cl_anything(pRExC_state, data->start_class);
1650 flags &= ~SCF_DO_STCLASS;
1651 }
1652 /* Else: zero-length, ignore. */
1653 scan = regnext(scan);
1654 }
1655
1656 finish:
1657 *scanp = scan;
1658 *deltap = is_inf_internal ? I32_MAX : delta;
1659 if (flags & SCF_DO_SUBSTR && is_inf)
1660 data->pos_delta = I32_MAX - data->pos_min;
1661 if (is_par > U8_MAX)
1662 is_par = 0;
1663 if (is_par && pars==1 && data) {
1664 data->flags |= SF_IN_PAR;
1665 data->flags &= ~SF_HAS_PAR;
1666 }
1667 else if (pars && data) {
1668 data->flags |= SF_HAS_PAR;
1669 data->flags &= ~SF_IN_PAR;
1670 }
1671 if (flags & SCF_DO_STCLASS_OR)
1672 cl_and(data->start_class, &and_with);
1673 return min;
1674 }
1675
1676 STATIC I32
S_add_data(pTHX_ RExC_state_t * pRExC_state,I32 n,char * s)1677 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
1678 {
1679 if (RExC_rx->data) {
1680 Renewc(RExC_rx->data,
1681 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
1682 char, struct reg_data);
1683 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1684 RExC_rx->data->count += n;
1685 }
1686 else {
1687 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
1688 char, struct reg_data);
1689 New(1208, RExC_rx->data->what, n, U8);
1690 RExC_rx->data->count = n;
1691 }
1692 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1693 return RExC_rx->data->count - n;
1694 }
1695
1696 void
Perl_reginitcolors(pTHX)1697 Perl_reginitcolors(pTHX)
1698 {
1699 int i = 0;
1700 char *s = PerlEnv_getenv("PERL_RE_COLORS");
1701
1702 if (s) {
1703 PL_colors[0] = s = savepv(s);
1704 while (++i < 6) {
1705 s = strchr(s, '\t');
1706 if (s) {
1707 *s = '\0';
1708 PL_colors[i] = ++s;
1709 }
1710 else
1711 PL_colors[i] = s = "";
1712 }
1713 } else {
1714 while (i < 6)
1715 PL_colors[i++] = "";
1716 }
1717 PL_colorset = 1;
1718 }
1719
1720
1721 /*
1722 - pregcomp - compile a regular expression into internal code
1723 *
1724 * We can't allocate space until we know how big the compiled form will be,
1725 * but we can't compile it (and thus know how big it is) until we've got a
1726 * place to put the code. So we cheat: we compile it twice, once with code
1727 * generation turned off and size counting turned on, and once "for real".
1728 * This also means that we don't allocate space until we are sure that the
1729 * thing really will compile successfully, and we never have to move the
1730 * code and thus invalidate pointers into it. (Note that it has to be in
1731 * one piece because free() must be able to free it all.) [NB: not true in perl]
1732 *
1733 * Beware that the optimization-preparation code in here knows about some
1734 * of the structure of the compiled regexp. [I'll say.]
1735 */
1736 regexp *
Perl_pregcomp(pTHX_ char * exp,char * xend,PMOP * pm)1737 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
1738 {
1739 register regexp *r;
1740 regnode *scan;
1741 regnode *first;
1742 I32 flags;
1743 I32 minlen = 0;
1744 I32 sawplus = 0;
1745 I32 sawopen = 0;
1746 scan_data_t data;
1747 RExC_state_t RExC_state;
1748 RExC_state_t *pRExC_state = &RExC_state;
1749
1750 if (exp == NULL)
1751 FAIL("NULL regexp argument");
1752
1753 RExC_utf8 = RExC_orig_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
1754
1755 DEBUG_r({
1756 if (!PL_colorset) reginitcolors();
1757 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1758 PL_colors[4],PL_colors[5],PL_colors[0],
1759 (int)(xend - exp), exp, PL_colors[1]);
1760 });
1761
1762 redo_first_pass:
1763 RExC_precomp = exp;
1764 RExC_flags = pm->op_pmflags;
1765 RExC_sawback = 0;
1766
1767 RExC_seen = 0;
1768 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1769 RExC_seen_evals = 0;
1770 RExC_extralen = 0;
1771
1772 /* First pass: determine size, legality. */
1773 RExC_parse = exp;
1774 RExC_start = exp;
1775 RExC_end = xend;
1776 RExC_naughty = 0;
1777 RExC_npar = 1;
1778 RExC_size = 0L;
1779 RExC_emit = &PL_regdummy;
1780 RExC_whilem_seen = 0;
1781 #if 0 /* REGC() is (currently) a NOP at the first pass.
1782 * Clever compilers notice this and complain. --jhi */
1783 REGC((U8)REG_MAGIC, (char*)RExC_emit);
1784 #endif
1785 if (reg(pRExC_state, 0, &flags) == NULL) {
1786 RExC_precomp = Nullch;
1787 return(NULL);
1788 }
1789
1790 if (RExC_utf8 && !RExC_orig_utf8) {
1791 /* It's possible to write a regexp in ascii that represents unicode
1792 codepoints outside of the byte range, such as via \x{100}. If we
1793 detect such a sequence we have to convert the entire pattern to utf8
1794 and then recompile, as our sizing calculation will have been based
1795 on 1 byte == 1 character, but we will need to use utf8 to encode
1796 at least some part of the pattern, and therefore must convert the whole
1797 thing.
1798 XXX: somehow figure out how to make this less expensive...
1799 -- dmq */
1800 STRLEN len = xend-exp;
1801 DEBUG_r(PerlIO_printf(Perl_debug_log,
1802 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
1803 exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len);
1804 xend = exp + len;
1805 RExC_orig_utf8 = RExC_utf8;
1806 SAVEFREEPV(exp);
1807 goto redo_first_pass;
1808 }
1809 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
1810
1811 /* Small enough for pointer-storage convention?
1812 If extralen==0, this means that we will not need long jumps. */
1813 if (RExC_size >= 0x10000L && RExC_extralen)
1814 RExC_size += RExC_extralen;
1815 else
1816 RExC_extralen = 0;
1817 if (RExC_whilem_seen > 15)
1818 RExC_whilem_seen = 15;
1819
1820 /* Allocate space and initialize. */
1821 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
1822 char, regexp);
1823 if (r == NULL)
1824 FAIL("Regexp out of space");
1825
1826 #ifdef DEBUGGING
1827 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
1828 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
1829 #endif
1830 r->refcnt = 1;
1831 r->prelen = xend - exp;
1832 r->precomp = savepvn(RExC_precomp, r->prelen);
1833 r->subbeg = NULL;
1834 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
1835 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
1836
1837 r->substrs = 0; /* Useful during FAIL. */
1838 r->startp = 0; /* Useful during FAIL. */
1839 r->endp = 0; /* Useful during FAIL. */
1840
1841 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1842 if (r->offsets) {
1843 r->offsets[0] = RExC_size;
1844 }
1845 DEBUG_r(PerlIO_printf(Perl_debug_log,
1846 "%s %"UVuf" bytes for offset annotations.\n",
1847 r->offsets ? "Got" : "Couldn't get",
1848 (UV)((2*RExC_size+1) * sizeof(U32))));
1849
1850 RExC_rx = r;
1851
1852 /* Second pass: emit code. */
1853 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
1854 RExC_parse = exp;
1855 RExC_end = xend;
1856 RExC_naughty = 0;
1857 RExC_npar = 1;
1858 RExC_emit_start = r->program;
1859 RExC_emit = r->program;
1860 /* Store the count of eval-groups for security checks: */
1861 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1862 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
1863 r->data = 0;
1864 if (reg(pRExC_state, 0, &flags) == NULL)
1865 return(NULL);
1866
1867 /* Dig out information for optimizations. */
1868 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
1869 pm->op_pmflags = RExC_flags;
1870 if (UTF)
1871 r->reganch |= ROPT_UTF8; /* Unicode in it? */
1872 r->regstclass = NULL;
1873 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
1874 r->reganch |= ROPT_NAUGHTY;
1875 scan = r->program + 1; /* First BRANCH. */
1876
1877 /* XXXX To minimize changes to RE engine we always allocate
1878 3-units-long substrs field. */
1879 Newz(1004, r->substrs, 1, struct reg_substr_data);
1880
1881 StructCopy(&zero_scan_data, &data, scan_data_t);
1882 /* XXXX Should not we check for something else? Usually it is OPEN1... */
1883 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
1884 I32 fake;
1885 STRLEN longest_float_length, longest_fixed_length;
1886 struct regnode_charclass_class ch_class;
1887 int stclass_flag;
1888 I32 last_close = 0;
1889
1890 first = scan;
1891 /* Skip introductions and multiplicators >= 1. */
1892 while ((OP(first) == OPEN && (sawopen = 1)) ||
1893 /* An OR of *one* alternative - should not happen now. */
1894 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1895 (OP(first) == PLUS) ||
1896 (OP(first) == MINMOD) ||
1897 /* An {n,m} with n>0 */
1898 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
1899 if (OP(first) == PLUS)
1900 sawplus = 1;
1901 else
1902 first += regarglen[(U8)OP(first)];
1903 first = NEXTOPER(first);
1904 }
1905
1906 /* Starting-point info. */
1907 again:
1908 if (PL_regkind[(U8)OP(first)] == EXACT) {
1909 if (OP(first) == EXACT)
1910 ; /* Empty, get anchored substr later. */
1911 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
1912 r->regstclass = first;
1913 }
1914 else if (strchr((char*)PL_simple,OP(first)))
1915 r->regstclass = first;
1916 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1917 PL_regkind[(U8)OP(first)] == NBOUND)
1918 r->regstclass = first;
1919 else if (PL_regkind[(U8)OP(first)] == BOL) {
1920 r->reganch |= (OP(first) == MBOL
1921 ? ROPT_ANCH_MBOL
1922 : (OP(first) == SBOL
1923 ? ROPT_ANCH_SBOL
1924 : ROPT_ANCH_BOL));
1925 first = NEXTOPER(first);
1926 goto again;
1927 }
1928 else if (OP(first) == GPOS) {
1929 r->reganch |= ROPT_ANCH_GPOS;
1930 first = NEXTOPER(first);
1931 goto again;
1932 }
1933 else if (!sawopen && (OP(first) == STAR &&
1934 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
1935 !(r->reganch & ROPT_ANCH) )
1936 {
1937 /* turn .* into ^.* with an implied $*=1 */
1938 int type = OP(NEXTOPER(first));
1939
1940 if (type == REG_ANY)
1941 type = ROPT_ANCH_MBOL;
1942 else
1943 type = ROPT_ANCH_SBOL;
1944
1945 r->reganch |= type | ROPT_IMPLICIT;
1946 first = NEXTOPER(first);
1947 goto again;
1948 }
1949 if (sawplus && (!sawopen || !RExC_sawback)
1950 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
1951 /* x+ must match at the 1st pos of run of x's */
1952 r->reganch |= ROPT_SKIP;
1953
1954 /* Scan is after the zeroth branch, first is atomic matcher. */
1955 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
1956 (IV)(first - scan + 1)));
1957 /*
1958 * If there's something expensive in the r.e., find the
1959 * longest literal string that must appear and make it the
1960 * regmust. Resolve ties in favor of later strings, since
1961 * the regstart check works with the beginning of the r.e.
1962 * and avoiding duplication strengthens checking. Not a
1963 * strong reason, but sufficient in the absence of others.
1964 * [Now we resolve ties in favor of the earlier string if
1965 * it happens that c_offset_min has been invalidated, since the
1966 * earlier string may buy us something the later one won't.]
1967 */
1968 minlen = 0;
1969
1970 data.longest_fixed = newSVpvn("",0);
1971 data.longest_float = newSVpvn("",0);
1972 data.last_found = newSVpvn("",0);
1973 data.longest = &(data.longest_fixed);
1974 first = scan;
1975 if (!r->regstclass) {
1976 cl_init(pRExC_state, &ch_class);
1977 data.start_class = &ch_class;
1978 stclass_flag = SCF_DO_STCLASS_AND;
1979 } else /* XXXX Check for BOUND? */
1980 stclass_flag = 0;
1981 data.last_closep = &last_close;
1982
1983 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
1984 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
1985 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
1986 && data.last_start_min == 0 && data.last_end > 0
1987 && !RExC_seen_zerolen
1988 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
1989 r->reganch |= ROPT_CHECK_ALL;
1990 scan_commit(pRExC_state, &data);
1991 SvREFCNT_dec(data.last_found);
1992
1993 longest_float_length = CHR_SVLEN(data.longest_float);
1994 if (longest_float_length
1995 || (data.flags & SF_FL_BEFORE_EOL
1996 && (!(data.flags & SF_FL_BEFORE_MEOL)
1997 || (RExC_flags & PMf_MULTILINE)))) {
1998 int t;
1999
2000 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
2001 && data.offset_fixed == data.offset_float_min
2002 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
2003 goto remove_float; /* As in (a)+. */
2004
2005 if (SvUTF8(data.longest_float)) {
2006 r->float_utf8 = data.longest_float;
2007 r->float_substr = Nullsv;
2008 } else {
2009 r->float_substr = data.longest_float;
2010 r->float_utf8 = Nullsv;
2011 }
2012 r->float_min_offset = data.offset_float_min;
2013 r->float_max_offset = data.offset_float_max;
2014 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
2015 && (!(data.flags & SF_FL_BEFORE_MEOL)
2016 || (RExC_flags & PMf_MULTILINE)));
2017 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
2018 }
2019 else {
2020 remove_float:
2021 r->float_substr = r->float_utf8 = Nullsv;
2022 SvREFCNT_dec(data.longest_float);
2023 longest_float_length = 0;
2024 }
2025
2026 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
2027 if (longest_fixed_length
2028 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
2029 && (!(data.flags & SF_FIX_BEFORE_MEOL)
2030 || (RExC_flags & PMf_MULTILINE)))) {
2031 int t;
2032
2033 if (SvUTF8(data.longest_fixed)) {
2034 r->anchored_utf8 = data.longest_fixed;
2035 r->anchored_substr = Nullsv;
2036 } else {
2037 r->anchored_substr = data.longest_fixed;
2038 r->anchored_utf8 = Nullsv;
2039 }
2040 r->anchored_offset = data.offset_fixed;
2041 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
2042 && (!(data.flags & SF_FIX_BEFORE_MEOL)
2043 || (RExC_flags & PMf_MULTILINE)));
2044 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
2045 }
2046 else {
2047 r->anchored_substr = r->anchored_utf8 = Nullsv;
2048 SvREFCNT_dec(data.longest_fixed);
2049 longest_fixed_length = 0;
2050 }
2051 if (r->regstclass
2052 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
2053 r->regstclass = NULL;
2054 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2055 && stclass_flag
2056 && !(data.start_class->flags & ANYOF_EOS)
2057 && !cl_is_anything(data.start_class))
2058 {
2059 I32 n = add_data(pRExC_state, 1, "f");
2060
2061 New(1006, RExC_rx->data->data[n], 1,
2062 struct regnode_charclass_class);
2063 StructCopy(data.start_class,
2064 (struct regnode_charclass_class*)RExC_rx->data->data[n],
2065 struct regnode_charclass_class);
2066 r->regstclass = (regnode*)RExC_rx->data->data[n];
2067 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
2068 PL_regdata = r->data; /* for regprop() */
2069 DEBUG_r({ SV *sv = sv_newmortal();
2070 regprop(sv, (regnode*)data.start_class);
2071 PerlIO_printf(Perl_debug_log,
2072 "synthetic stclass `%s'.\n",
2073 SvPVX(sv));});
2074 }
2075
2076 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
2077 if (longest_fixed_length > longest_float_length) {
2078 r->check_substr = r->anchored_substr;
2079 r->check_utf8 = r->anchored_utf8;
2080 r->check_offset_min = r->check_offset_max = r->anchored_offset;
2081 if (r->reganch & ROPT_ANCH_SINGLE)
2082 r->reganch |= ROPT_NOSCAN;
2083 }
2084 else {
2085 r->check_substr = r->float_substr;
2086 r->check_utf8 = r->float_utf8;
2087 r->check_offset_min = data.offset_float_min;
2088 r->check_offset_max = data.offset_float_max;
2089 }
2090 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2091 This should be changed ASAP! */
2092 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
2093 r->reganch |= RE_USE_INTUIT;
2094 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
2095 r->reganch |= RE_INTUIT_TAIL;
2096 }
2097 }
2098 else {
2099 /* Several toplevels. Best we can is to set minlen. */
2100 I32 fake;
2101 struct regnode_charclass_class ch_class;
2102 I32 last_close = 0;
2103
2104 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2105 scan = r->program + 1;
2106 cl_init(pRExC_state, &ch_class);
2107 data.start_class = &ch_class;
2108 data.last_closep = &last_close;
2109 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
2110 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2111 = r->float_substr = r->float_utf8 = Nullsv;
2112 if (!(data.start_class->flags & ANYOF_EOS)
2113 && !cl_is_anything(data.start_class))
2114 {
2115 I32 n = add_data(pRExC_state, 1, "f");
2116
2117 New(1006, RExC_rx->data->data[n], 1,
2118 struct regnode_charclass_class);
2119 StructCopy(data.start_class,
2120 (struct regnode_charclass_class*)RExC_rx->data->data[n],
2121 struct regnode_charclass_class);
2122 r->regstclass = (regnode*)RExC_rx->data->data[n];
2123 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
2124 DEBUG_r({ SV* sv = sv_newmortal();
2125 regprop(sv, (regnode*)data.start_class);
2126 PerlIO_printf(Perl_debug_log,
2127 "synthetic stclass `%s'.\n",
2128 SvPVX(sv));});
2129 }
2130 }
2131
2132 r->minlen = minlen;
2133 if (RExC_seen & REG_SEEN_GPOS)
2134 r->reganch |= ROPT_GPOS_SEEN;
2135 if (RExC_seen & REG_SEEN_LOOKBEHIND)
2136 r->reganch |= ROPT_LOOKBEHIND_SEEN;
2137 if (RExC_seen & REG_SEEN_EVAL)
2138 r->reganch |= ROPT_EVAL_SEEN;
2139 if (RExC_seen & REG_SEEN_CANY)
2140 r->reganch |= ROPT_CANY_SEEN;
2141 Newz(1002, r->startp, RExC_npar, I32);
2142 Newz(1002, r->endp, RExC_npar, I32);
2143 PL_regdata = r->data; /* for regprop() */
2144 DEBUG_r(regdump(r));
2145 return(r);
2146 }
2147
2148 /*
2149 - reg - regular expression, i.e. main body or parenthesized thing
2150 *
2151 * Caller must absorb opening parenthesis.
2152 *
2153 * Combining parenthesis handling with the base level of regular expression
2154 * is a trifle forced, but the need to tie the tails of the branches to what
2155 * follows makes it hard to avoid.
2156 */
2157 STATIC regnode *
S_reg(pTHX_ RExC_state_t * pRExC_state,I32 paren,I32 * flagp)2158 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
2159 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
2160 {
2161 register regnode *ret; /* Will be the head of the group. */
2162 register regnode *br;
2163 register regnode *lastbr;
2164 register regnode *ender = 0;
2165 register I32 parno = 0;
2166 I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
2167
2168 /* for (?g), (?gc), and (?o) warnings; warning
2169 about (?c) will warn about (?g) -- japhy */
2170
2171 I32 wastedflags = 0x00,
2172 wasted_o = 0x01,
2173 wasted_g = 0x02,
2174 wasted_gc = 0x02 | 0x04,
2175 wasted_c = 0x04;
2176
2177 char * parse_start = RExC_parse; /* MJD */
2178 char *oregcomp_parse = RExC_parse;
2179 char c;
2180
2181 *flagp = 0; /* Tentatively. */
2182
2183
2184 /* Make an OPEN node, if parenthesized. */
2185 if (paren) {
2186 if (*RExC_parse == '?') { /* (?...) */
2187 U32 posflags = 0, negflags = 0;
2188 U32 *flagsp = &posflags;
2189 int logical = 0;
2190 char *seqstart = RExC_parse;
2191
2192 RExC_parse++;
2193 paren = *RExC_parse++;
2194 ret = NULL; /* For look-ahead/behind. */
2195 switch (paren) {
2196 case '<': /* (?<...) */
2197 RExC_seen |= REG_SEEN_LOOKBEHIND;
2198 if (*RExC_parse == '!')
2199 paren = ',';
2200 if (*RExC_parse != '=' && *RExC_parse != '!')
2201 goto unknown;
2202 RExC_parse++;
2203 case '=': /* (?=...) */
2204 case '!': /* (?!...) */
2205 RExC_seen_zerolen++;
2206 case ':': /* (?:...) */
2207 case '>': /* (?>...) */
2208 break;
2209 case '$': /* (?$...) */
2210 case '@': /* (?@...) */
2211 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
2212 break;
2213 case '#': /* (?#...) */
2214 while (*RExC_parse && *RExC_parse != ')')
2215 RExC_parse++;
2216 if (*RExC_parse != ')')
2217 FAIL("Sequence (?#... not terminated");
2218 nextchar(pRExC_state);
2219 *flagp = TRYAGAIN;
2220 return NULL;
2221 case 'p': /* (?p...) */
2222 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2223 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
2224 /* FALL THROUGH*/
2225 case '?': /* (??...) */
2226 logical = 1;
2227 if (*RExC_parse != '{')
2228 goto unknown;
2229 paren = *RExC_parse++;
2230 /* FALL THROUGH */
2231 case '{': /* (?{...}) */
2232 {
2233 I32 count = 1, n = 0;
2234 char c;
2235 char *s = RExC_parse;
2236 SV *sv;
2237 OP_4tree *sop, *rop;
2238
2239 RExC_seen_zerolen++;
2240 RExC_seen |= REG_SEEN_EVAL;
2241 while (count && (c = *RExC_parse)) {
2242 if (c == '\\' && RExC_parse[1])
2243 RExC_parse++;
2244 else if (c == '{')
2245 count++;
2246 else if (c == '}')
2247 count--;
2248 RExC_parse++;
2249 }
2250 if (*RExC_parse != ')')
2251 {
2252 RExC_parse = s;
2253 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2254 }
2255 if (!SIZE_ONLY) {
2256 PAD *pad;
2257
2258 if (RExC_parse - 1 - s)
2259 sv = newSVpvn(s, RExC_parse - 1 - s);
2260 else
2261 sv = newSVpvn("", 0);
2262
2263 ENTER;
2264 Perl_save_re_context(aTHX);
2265 rop = sv_compile_2op(sv, &sop, "re", &pad);
2266 sop->op_private |= OPpREFCOUNTED;
2267 /* re_dup will OpREFCNT_inc */
2268 OpREFCNT_set(sop, 1);
2269 LEAVE;
2270
2271 n = add_data(pRExC_state, 3, "nop");
2272 RExC_rx->data->data[n] = (void*)rop;
2273 RExC_rx->data->data[n+1] = (void*)sop;
2274 RExC_rx->data->data[n+2] = (void*)pad;
2275 SvREFCNT_dec(sv);
2276 }
2277 else { /* First pass */
2278 if (PL_reginterp_cnt < ++RExC_seen_evals
2279 && IN_PERL_RUNTIME)
2280 /* No compiled RE interpolated, has runtime
2281 components ===> unsafe. */
2282 FAIL("Eval-group not allowed at runtime, use re 'eval'");
2283 if (PL_tainting && PL_tainted)
2284 FAIL("Eval-group in insecure regular expression");
2285 }
2286
2287 nextchar(pRExC_state);
2288 if (logical) {
2289 ret = reg_node(pRExC_state, LOGICAL);
2290 if (!SIZE_ONLY)
2291 ret->flags = 2;
2292 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
2293 /* deal with the length of this later - MJD */
2294 return ret;
2295 }
2296 ret = reganode(pRExC_state, EVAL, n);
2297 Set_Node_Length(ret, RExC_parse - parse_start + 1);
2298 Set_Node_Offset(ret, parse_start);
2299 return ret;
2300 }
2301 case '(': /* (?(?{...})...) and (?(?=...)...) */
2302 {
2303 if (RExC_parse[0] == '?') { /* (?(?...)) */
2304 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2305 || RExC_parse[1] == '<'
2306 || RExC_parse[1] == '{') { /* Lookahead or eval. */
2307 I32 flag;
2308
2309 ret = reg_node(pRExC_state, LOGICAL);
2310 if (!SIZE_ONLY)
2311 ret->flags = 1;
2312 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
2313 goto insert_if;
2314 }
2315 }
2316 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
2317 /* (?(1)...) */
2318 parno = atoi(RExC_parse++);
2319
2320 while (isDIGIT(*RExC_parse))
2321 RExC_parse++;
2322 ret = reganode(pRExC_state, GROUPP, parno);
2323
2324 if ((c = *nextchar(pRExC_state)) != ')')
2325 vFAIL("Switch condition not recognized");
2326 insert_if:
2327 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2328 br = regbranch(pRExC_state, &flags, 1);
2329 if (br == NULL)
2330 br = reganode(pRExC_state, LONGJMP, 0);
2331 else
2332 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2333 c = *nextchar(pRExC_state);
2334 if (flags&HASWIDTH)
2335 *flagp |= HASWIDTH;
2336 if (c == '|') {
2337 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2338 regbranch(pRExC_state, &flags, 1);
2339 regtail(pRExC_state, ret, lastbr);
2340 if (flags&HASWIDTH)
2341 *flagp |= HASWIDTH;
2342 c = *nextchar(pRExC_state);
2343 }
2344 else
2345 lastbr = NULL;
2346 if (c != ')')
2347 vFAIL("Switch (?(condition)... contains too many branches");
2348 ender = reg_node(pRExC_state, TAIL);
2349 regtail(pRExC_state, br, ender);
2350 if (lastbr) {
2351 regtail(pRExC_state, lastbr, ender);
2352 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
2353 }
2354 else
2355 regtail(pRExC_state, ret, ender);
2356 return ret;
2357 }
2358 else {
2359 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
2360 }
2361 }
2362 case 0:
2363 RExC_parse--; /* for vFAIL to print correctly */
2364 vFAIL("Sequence (? incomplete");
2365 break;
2366 default:
2367 --RExC_parse;
2368 parse_flags: /* (?i) */
2369 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2370 /* (?g), (?gc) and (?o) are useless here
2371 and must be globally applied -- japhy */
2372
2373 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2374 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2375 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2376 if (! (wastedflags & wflagbit) ) {
2377 wastedflags |= wflagbit;
2378 vWARN5(
2379 RExC_parse + 1,
2380 "Useless (%s%c) - %suse /%c modifier",
2381 flagsp == &negflags ? "?-" : "?",
2382 *RExC_parse,
2383 flagsp == &negflags ? "don't " : "",
2384 *RExC_parse
2385 );
2386 }
2387 }
2388 }
2389 else if (*RExC_parse == 'c') {
2390 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2391 if (! (wastedflags & wasted_c) ) {
2392 wastedflags |= wasted_gc;
2393 vWARN3(
2394 RExC_parse + 1,
2395 "Useless (%sc) - %suse /gc modifier",
2396 flagsp == &negflags ? "?-" : "?",
2397 flagsp == &negflags ? "don't " : ""
2398 );
2399 }
2400 }
2401 }
2402 else { pmflag(flagsp, *RExC_parse); }
2403
2404 ++RExC_parse;
2405 }
2406 if (*RExC_parse == '-') {
2407 flagsp = &negflags;
2408 wastedflags = 0; /* reset so (?g-c) warns twice */
2409 ++RExC_parse;
2410 goto parse_flags;
2411 }
2412 RExC_flags |= posflags;
2413 RExC_flags &= ~negflags;
2414 if (*RExC_parse == ':') {
2415 RExC_parse++;
2416 paren = ':';
2417 break;
2418 }
2419 unknown:
2420 if (*RExC_parse != ')') {
2421 RExC_parse++;
2422 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
2423 }
2424 nextchar(pRExC_state);
2425 *flagp = TRYAGAIN;
2426 return NULL;
2427 }
2428 }
2429 else { /* (...) */
2430 parno = RExC_npar;
2431 RExC_npar++;
2432 ret = reganode(pRExC_state, OPEN, parno);
2433 Set_Node_Length(ret, 1); /* MJD */
2434 Set_Node_Offset(ret, RExC_parse); /* MJD */
2435 open = 1;
2436 }
2437 }
2438 else /* ! paren */
2439 ret = NULL;
2440
2441 /* Pick up the branches, linking them together. */
2442 parse_start = RExC_parse; /* MJD */
2443 br = regbranch(pRExC_state, &flags, 1);
2444 /* branch_len = (paren != 0); */
2445
2446 if (br == NULL)
2447 return(NULL);
2448 if (*RExC_parse == '|') {
2449 if (!SIZE_ONLY && RExC_extralen) {
2450 reginsert(pRExC_state, BRANCHJ, br);
2451 }
2452 else { /* MJD */
2453 reginsert(pRExC_state, BRANCH, br);
2454 Set_Node_Length(br, paren != 0);
2455 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2456 }
2457 have_branch = 1;
2458 if (SIZE_ONLY)
2459 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
2460 }
2461 else if (paren == ':') {
2462 *flagp |= flags&SIMPLE;
2463 }
2464 if (open) { /* Starts with OPEN. */
2465 regtail(pRExC_state, ret, br); /* OPEN -> first. */
2466 }
2467 else if (paren != '?') /* Not Conditional */
2468 ret = br;
2469 *flagp |= flags & (SPSTART | HASWIDTH);
2470 lastbr = br;
2471 while (*RExC_parse == '|') {
2472 if (!SIZE_ONLY && RExC_extralen) {
2473 ender = reganode(pRExC_state, LONGJMP,0);
2474 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
2475 }
2476 if (SIZE_ONLY)
2477 RExC_extralen += 2; /* Account for LONGJMP. */
2478 nextchar(pRExC_state);
2479 br = regbranch(pRExC_state, &flags, 0);
2480
2481 if (br == NULL)
2482 return(NULL);
2483 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
2484 lastbr = br;
2485 if (flags&HASWIDTH)
2486 *flagp |= HASWIDTH;
2487 *flagp |= flags&SPSTART;
2488 }
2489
2490 if (have_branch || paren != ':') {
2491 /* Make a closing node, and hook it on the end. */
2492 switch (paren) {
2493 case ':':
2494 ender = reg_node(pRExC_state, TAIL);
2495 break;
2496 case 1:
2497 ender = reganode(pRExC_state, CLOSE, parno);
2498 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2499 Set_Node_Length(ender,1); /* MJD */
2500 break;
2501 case '<':
2502 case ',':
2503 case '=':
2504 case '!':
2505 *flagp &= ~HASWIDTH;
2506 /* FALL THROUGH */
2507 case '>':
2508 ender = reg_node(pRExC_state, SUCCEED);
2509 break;
2510 case 0:
2511 ender = reg_node(pRExC_state, END);
2512 break;
2513 }
2514 regtail(pRExC_state, lastbr, ender);
2515
2516 if (have_branch) {
2517 /* Hook the tails of the branches to the closing node. */
2518 for (br = ret; br != NULL; br = regnext(br)) {
2519 regoptail(pRExC_state, br, ender);
2520 }
2521 }
2522 }
2523
2524 {
2525 char *p;
2526 static char parens[] = "=!<,>";
2527
2528 if (paren && (p = strchr(parens, paren))) {
2529 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2530 int flag = (p - parens) > 1;
2531
2532 if (paren == '>')
2533 node = SUSPEND, flag = 0;
2534 reginsert(pRExC_state, node,ret);
2535 Set_Node_Cur_Length(ret);
2536 Set_Node_Offset(ret, parse_start + 1);
2537 ret->flags = flag;
2538 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
2539 }
2540 }
2541
2542 /* Check for proper termination. */
2543 if (paren) {
2544 RExC_flags = oregflags;
2545 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2546 RExC_parse = oregcomp_parse;
2547 vFAIL("Unmatched (");
2548 }
2549 }
2550 else if (!paren && RExC_parse < RExC_end) {
2551 if (*RExC_parse == ')') {
2552 RExC_parse++;
2553 vFAIL("Unmatched )");
2554 }
2555 else
2556 FAIL("Junk on end of regexp"); /* "Can't happen". */
2557 /* NOTREACHED */
2558 }
2559
2560 return(ret);
2561 }
2562
2563 /*
2564 - regbranch - one alternative of an | operator
2565 *
2566 * Implements the concatenation operator.
2567 */
2568 STATIC regnode *
S_regbranch(pTHX_ RExC_state_t * pRExC_state,I32 * flagp,I32 first)2569 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
2570 {
2571 register regnode *ret;
2572 register regnode *chain = NULL;
2573 register regnode *latest;
2574 I32 flags = 0, c = 0;
2575
2576 if (first)
2577 ret = NULL;
2578 else {
2579 if (!SIZE_ONLY && RExC_extralen)
2580 ret = reganode(pRExC_state, BRANCHJ,0);
2581 else {
2582 ret = reg_node(pRExC_state, BRANCH);
2583 Set_Node_Length(ret, 1);
2584 }
2585 }
2586
2587 if (!first && SIZE_ONLY)
2588 RExC_extralen += 1; /* BRANCHJ */
2589
2590 *flagp = WORST; /* Tentatively. */
2591
2592 RExC_parse--;
2593 nextchar(pRExC_state);
2594 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
2595 flags &= ~TRYAGAIN;
2596 latest = regpiece(pRExC_state, &flags);
2597 if (latest == NULL) {
2598 if (flags & TRYAGAIN)
2599 continue;
2600 return(NULL);
2601 }
2602 else if (ret == NULL)
2603 ret = latest;
2604 *flagp |= flags&HASWIDTH;
2605 if (chain == NULL) /* First piece. */
2606 *flagp |= flags&SPSTART;
2607 else {
2608 RExC_naughty++;
2609 regtail(pRExC_state, chain, latest);
2610 }
2611 chain = latest;
2612 c++;
2613 }
2614 if (chain == NULL) { /* Loop ran zero times. */
2615 chain = reg_node(pRExC_state, NOTHING);
2616 if (ret == NULL)
2617 ret = chain;
2618 }
2619 if (c == 1) {
2620 *flagp |= flags&SIMPLE;
2621 }
2622
2623 return(ret);
2624 }
2625
2626 /*
2627 - regpiece - something followed by possible [*+?]
2628 *
2629 * Note that the branching code sequences used for ? and the general cases
2630 * of * and + are somewhat optimized: they use the same NOTHING node as
2631 * both the endmarker for their branch list and the body of the last branch.
2632 * It might seem that this node could be dispensed with entirely, but the
2633 * endmarker role is not redundant.
2634 */
2635 STATIC regnode *
S_regpiece(pTHX_ RExC_state_t * pRExC_state,I32 * flagp)2636 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2637 {
2638 register regnode *ret;
2639 register char op;
2640 register char *next;
2641 I32 flags;
2642 char *origparse = RExC_parse;
2643 char *maxpos;
2644 I32 min;
2645 I32 max = REG_INFTY;
2646 char *parse_start;
2647
2648 ret = regatom(pRExC_state, &flags);
2649 if (ret == NULL) {
2650 if (flags & TRYAGAIN)
2651 *flagp |= TRYAGAIN;
2652 return(NULL);
2653 }
2654
2655 op = *RExC_parse;
2656
2657 if (op == '{' && regcurly(RExC_parse)) {
2658 parse_start = RExC_parse; /* MJD */
2659 next = RExC_parse + 1;
2660 maxpos = Nullch;
2661 while (isDIGIT(*next) || *next == ',') {
2662 if (*next == ',') {
2663 if (maxpos)
2664 break;
2665 else
2666 maxpos = next;
2667 }
2668 next++;
2669 }
2670 if (*next == '}') { /* got one */
2671 if (!maxpos)
2672 maxpos = next;
2673 RExC_parse++;
2674 min = atoi(RExC_parse);
2675 if (*maxpos == ',')
2676 maxpos++;
2677 else
2678 maxpos = RExC_parse;
2679 max = atoi(maxpos);
2680 if (!max && *maxpos != '0')
2681 max = REG_INFTY; /* meaning "infinity" */
2682 else if (max >= REG_INFTY)
2683 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
2684 RExC_parse = next;
2685 nextchar(pRExC_state);
2686
2687 do_curly:
2688 if ((flags&SIMPLE)) {
2689 RExC_naughty += 2 + RExC_naughty / 2;
2690 reginsert(pRExC_state, CURLY, ret);
2691 Set_Node_Offset(ret, parse_start+1); /* MJD */
2692 Set_Node_Cur_Length(ret);
2693 }
2694 else {
2695 regnode *w = reg_node(pRExC_state, WHILEM);
2696
2697 w->flags = 0;
2698 regtail(pRExC_state, ret, w);
2699 if (!SIZE_ONLY && RExC_extralen) {
2700 reginsert(pRExC_state, LONGJMP,ret);
2701 reginsert(pRExC_state, NOTHING,ret);
2702 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2703 }
2704 reginsert(pRExC_state, CURLYX,ret);
2705 /* MJD hk */
2706 Set_Node_Offset(ret, parse_start+1);
2707 Set_Node_Length(ret,
2708 op == '{' ? (RExC_parse - parse_start) : 1);
2709
2710 if (!SIZE_ONLY && RExC_extralen)
2711 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
2712 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
2713 if (SIZE_ONLY)
2714 RExC_whilem_seen++, RExC_extralen += 3;
2715 RExC_naughty += 4 + RExC_naughty; /* compound interest */
2716 }
2717 ret->flags = 0;
2718
2719 if (min > 0)
2720 *flagp = WORST;
2721 if (max > 0)
2722 *flagp |= HASWIDTH;
2723 if (max && max < min)
2724 vFAIL("Can't do {n,m} with n > m");
2725 if (!SIZE_ONLY) {
2726 ARG1_SET(ret, (U16)min);
2727 ARG2_SET(ret, (U16)max);
2728 }
2729
2730 goto nest_check;
2731 }
2732 }
2733
2734 if (!ISMULT1(op)) {
2735 *flagp = flags;
2736 return(ret);
2737 }
2738
2739 #if 0 /* Now runtime fix should be reliable. */
2740
2741 /* if this is reinstated, don't forget to put this back into perldiag:
2742
2743 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2744
2745 (F) The part of the regexp subject to either the * or + quantifier
2746 could match an empty string. The {#} shows in the regular
2747 expression about where the problem was discovered.
2748
2749 */
2750
2751 if (!(flags&HASWIDTH) && op != '?')
2752 vFAIL("Regexp *+ operand could be empty");
2753 #endif
2754
2755 parse_start = RExC_parse;
2756 nextchar(pRExC_state);
2757
2758 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
2759
2760 if (op == '*' && (flags&SIMPLE)) {
2761 reginsert(pRExC_state, STAR, ret);
2762 ret->flags = 0;
2763 RExC_naughty += 4;
2764 }
2765 else if (op == '*') {
2766 min = 0;
2767 goto do_curly;
2768 }
2769 else if (op == '+' && (flags&SIMPLE)) {
2770 reginsert(pRExC_state, PLUS, ret);
2771 ret->flags = 0;
2772 RExC_naughty += 3;
2773 }
2774 else if (op == '+') {
2775 min = 1;
2776 goto do_curly;
2777 }
2778 else if (op == '?') {
2779 min = 0; max = 1;
2780 goto do_curly;
2781 }
2782 nest_check:
2783 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
2784 vWARN3(RExC_parse,
2785 "%.*s matches null string many times",
2786 RExC_parse - origparse,
2787 origparse);
2788 }
2789
2790 if (*RExC_parse == '?') {
2791 nextchar(pRExC_state);
2792 reginsert(pRExC_state, MINMOD, ret);
2793 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
2794 }
2795 if (ISMULT2(RExC_parse)) {
2796 RExC_parse++;
2797 vFAIL("Nested quantifiers");
2798 }
2799
2800 return(ret);
2801 }
2802
2803 /*
2804 - regatom - the lowest level
2805 *
2806 * Optimization: gobbles an entire sequence of ordinary characters so that
2807 * it can turn them into a single node, which is smaller to store and
2808 * faster to run. Backslashed characters are exceptions, each becoming a
2809 * separate node; the code is simpler that way and it's not worth fixing.
2810 *
2811 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
2812 STATIC regnode *
S_regatom(pTHX_ RExC_state_t * pRExC_state,I32 * flagp)2813 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2814 {
2815 register regnode *ret = 0;
2816 I32 flags;
2817 char *parse_start = RExC_parse;
2818
2819 *flagp = WORST; /* Tentatively. */
2820
2821 tryagain:
2822 switch (*RExC_parse) {
2823 case '^':
2824 RExC_seen_zerolen++;
2825 nextchar(pRExC_state);
2826 if (RExC_flags & PMf_MULTILINE)
2827 ret = reg_node(pRExC_state, MBOL);
2828 else if (RExC_flags & PMf_SINGLELINE)
2829 ret = reg_node(pRExC_state, SBOL);
2830 else
2831 ret = reg_node(pRExC_state, BOL);
2832 Set_Node_Length(ret, 1); /* MJD */
2833 break;
2834 case '$':
2835 nextchar(pRExC_state);
2836 if (*RExC_parse)
2837 RExC_seen_zerolen++;
2838 if (RExC_flags & PMf_MULTILINE)
2839 ret = reg_node(pRExC_state, MEOL);
2840 else if (RExC_flags & PMf_SINGLELINE)
2841 ret = reg_node(pRExC_state, SEOL);
2842 else
2843 ret = reg_node(pRExC_state, EOL);
2844 Set_Node_Length(ret, 1); /* MJD */
2845 break;
2846 case '.':
2847 nextchar(pRExC_state);
2848 if (RExC_flags & PMf_SINGLELINE)
2849 ret = reg_node(pRExC_state, SANY);
2850 else
2851 ret = reg_node(pRExC_state, REG_ANY);
2852 *flagp |= HASWIDTH|SIMPLE;
2853 RExC_naughty++;
2854 Set_Node_Length(ret, 1); /* MJD */
2855 break;
2856 case '[':
2857 {
2858 char *oregcomp_parse = ++RExC_parse;
2859 ret = regclass(pRExC_state);
2860 if (*RExC_parse != ']') {
2861 RExC_parse = oregcomp_parse;
2862 vFAIL("Unmatched [");
2863 }
2864 nextchar(pRExC_state);
2865 *flagp |= HASWIDTH|SIMPLE;
2866 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
2867 break;
2868 }
2869 case '(':
2870 nextchar(pRExC_state);
2871 ret = reg(pRExC_state, 1, &flags);
2872 if (ret == NULL) {
2873 if (flags & TRYAGAIN) {
2874 if (RExC_parse == RExC_end) {
2875 /* Make parent create an empty node if needed. */
2876 *flagp |= TRYAGAIN;
2877 return(NULL);
2878 }
2879 goto tryagain;
2880 }
2881 return(NULL);
2882 }
2883 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
2884 break;
2885 case '|':
2886 case ')':
2887 if (flags & TRYAGAIN) {
2888 *flagp |= TRYAGAIN;
2889 return NULL;
2890 }
2891 vFAIL("Internal urp");
2892 /* Supposed to be caught earlier. */
2893 break;
2894 case '{':
2895 if (!regcurly(RExC_parse)) {
2896 RExC_parse++;
2897 goto defchar;
2898 }
2899 /* FALL THROUGH */
2900 case '?':
2901 case '+':
2902 case '*':
2903 RExC_parse++;
2904 vFAIL("Quantifier follows nothing");
2905 break;
2906 case '\\':
2907 switch (*++RExC_parse) {
2908 case 'A':
2909 RExC_seen_zerolen++;
2910 ret = reg_node(pRExC_state, SBOL);
2911 *flagp |= SIMPLE;
2912 nextchar(pRExC_state);
2913 Set_Node_Length(ret, 2); /* MJD */
2914 break;
2915 case 'G':
2916 ret = reg_node(pRExC_state, GPOS);
2917 RExC_seen |= REG_SEEN_GPOS;
2918 *flagp |= SIMPLE;
2919 nextchar(pRExC_state);
2920 Set_Node_Length(ret, 2); /* MJD */
2921 break;
2922 case 'Z':
2923 ret = reg_node(pRExC_state, SEOL);
2924 *flagp |= SIMPLE;
2925 RExC_seen_zerolen++; /* Do not optimize RE away */
2926 nextchar(pRExC_state);
2927 break;
2928 case 'z':
2929 ret = reg_node(pRExC_state, EOS);
2930 *flagp |= SIMPLE;
2931 RExC_seen_zerolen++; /* Do not optimize RE away */
2932 nextchar(pRExC_state);
2933 Set_Node_Length(ret, 2); /* MJD */
2934 break;
2935 case 'C':
2936 ret = reg_node(pRExC_state, CANY);
2937 RExC_seen |= REG_SEEN_CANY;
2938 *flagp |= HASWIDTH|SIMPLE;
2939 nextchar(pRExC_state);
2940 Set_Node_Length(ret, 2); /* MJD */
2941 break;
2942 case 'X':
2943 ret = reg_node(pRExC_state, CLUMP);
2944 *flagp |= HASWIDTH;
2945 nextchar(pRExC_state);
2946 Set_Node_Length(ret, 2); /* MJD */
2947 break;
2948 case 'w':
2949 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
2950 *flagp |= HASWIDTH|SIMPLE;
2951 nextchar(pRExC_state);
2952 Set_Node_Length(ret, 2); /* MJD */
2953 break;
2954 case 'W':
2955 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
2956 *flagp |= HASWIDTH|SIMPLE;
2957 nextchar(pRExC_state);
2958 Set_Node_Length(ret, 2); /* MJD */
2959 break;
2960 case 'b':
2961 RExC_seen_zerolen++;
2962 RExC_seen |= REG_SEEN_LOOKBEHIND;
2963 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
2964 *flagp |= SIMPLE;
2965 nextchar(pRExC_state);
2966 Set_Node_Length(ret, 2); /* MJD */
2967 break;
2968 case 'B':
2969 RExC_seen_zerolen++;
2970 RExC_seen |= REG_SEEN_LOOKBEHIND;
2971 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
2972 *flagp |= SIMPLE;
2973 nextchar(pRExC_state);
2974 Set_Node_Length(ret, 2); /* MJD */
2975 break;
2976 case 's':
2977 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
2978 *flagp |= HASWIDTH|SIMPLE;
2979 nextchar(pRExC_state);
2980 Set_Node_Length(ret, 2); /* MJD */
2981 break;
2982 case 'S':
2983 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
2984 *flagp |= HASWIDTH|SIMPLE;
2985 nextchar(pRExC_state);
2986 Set_Node_Length(ret, 2); /* MJD */
2987 break;
2988 case 'd':
2989 ret = reg_node(pRExC_state, DIGIT);
2990 *flagp |= HASWIDTH|SIMPLE;
2991 nextchar(pRExC_state);
2992 Set_Node_Length(ret, 2); /* MJD */
2993 break;
2994 case 'D':
2995 ret = reg_node(pRExC_state, NDIGIT);
2996 *flagp |= HASWIDTH|SIMPLE;
2997 nextchar(pRExC_state);
2998 Set_Node_Length(ret, 2); /* MJD */
2999 break;
3000 case 'p':
3001 case 'P':
3002 {
3003 char* oldregxend = RExC_end;
3004 char* parse_start = RExC_parse - 2;
3005
3006 if (RExC_parse[1] == '{') {
3007 /* a lovely hack--pretend we saw [\pX] instead */
3008 RExC_end = strchr(RExC_parse, '}');
3009 if (!RExC_end) {
3010 U8 c = (U8)*RExC_parse;
3011 RExC_parse += 2;
3012 RExC_end = oldregxend;
3013 vFAIL2("Missing right brace on \\%c{}", c);
3014 }
3015 RExC_end++;
3016 }
3017 else {
3018 RExC_end = RExC_parse + 2;
3019 if (RExC_end > oldregxend)
3020 RExC_end = oldregxend;
3021 }
3022 RExC_parse--;
3023
3024 ret = regclass(pRExC_state);
3025
3026 RExC_end = oldregxend;
3027 RExC_parse--;
3028
3029 Set_Node_Offset(ret, parse_start + 2);
3030 Set_Node_Cur_Length(ret);
3031 nextchar(pRExC_state);
3032 *flagp |= HASWIDTH|SIMPLE;
3033 }
3034 break;
3035 case 'n':
3036 case 'r':
3037 case 't':
3038 case 'f':
3039 case 'e':
3040 case 'a':
3041 case 'x':
3042 case 'c':
3043 case '0':
3044 goto defchar;
3045 case '1': case '2': case '3': case '4':
3046 case '5': case '6': case '7': case '8': case '9':
3047 {
3048 I32 num = atoi(RExC_parse);
3049
3050 if (num > 9 && num >= RExC_npar)
3051 goto defchar;
3052 else {
3053 char * parse_start = RExC_parse - 1; /* MJD */
3054 while (isDIGIT(*RExC_parse))
3055 RExC_parse++;
3056
3057 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
3058 vFAIL("Reference to nonexistent group");
3059 RExC_sawback = 1;
3060 ret = reganode(pRExC_state,
3061 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
3062 num);
3063 *flagp |= HASWIDTH;
3064
3065 /* override incorrect value set in reganode MJD */
3066 Set_Node_Offset(ret, parse_start+1);
3067 Set_Node_Cur_Length(ret); /* MJD */
3068 RExC_parse--;
3069 nextchar(pRExC_state);
3070 }
3071 }
3072 break;
3073 case '\0':
3074 if (RExC_parse >= RExC_end)
3075 FAIL("Trailing \\");
3076 /* FALL THROUGH */
3077 default:
3078 /* Do not generate `unrecognized' warnings here, we fall
3079 back into the quick-grab loop below */
3080 parse_start--;
3081 goto defchar;
3082 }
3083 break;
3084
3085 case '#':
3086 if (RExC_flags & PMf_EXTENDED) {
3087 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3088 if (RExC_parse < RExC_end)
3089 goto tryagain;
3090 }
3091 /* FALL THROUGH */
3092
3093 default: {
3094 register STRLEN len;
3095 register UV ender;
3096 register char *p;
3097 char *oldp, *s;
3098 STRLEN numlen;
3099 STRLEN foldlen;
3100 U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
3101
3102 parse_start = RExC_parse - 1;
3103
3104 RExC_parse++;
3105
3106 defchar:
3107 ender = 0;
3108 ret = reg_node(pRExC_state,
3109 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
3110 s = STRING(ret);
3111 for (len = 0, p = RExC_parse - 1;
3112 len < 127 && p < RExC_end;
3113 len++)
3114 {
3115 oldp = p;
3116
3117 if (RExC_flags & PMf_EXTENDED)
3118 p = regwhite(p, RExC_end);
3119 switch (*p) {
3120 case '^':
3121 case '$':
3122 case '.':
3123 case '[':
3124 case '(':
3125 case ')':
3126 case '|':
3127 goto loopdone;
3128 case '\\':
3129 switch (*++p) {
3130 case 'A':
3131 case 'C':
3132 case 'X':
3133 case 'G':
3134 case 'Z':
3135 case 'z':
3136 case 'w':
3137 case 'W':
3138 case 'b':
3139 case 'B':
3140 case 's':
3141 case 'S':
3142 case 'd':
3143 case 'D':
3144 case 'p':
3145 case 'P':
3146 --p;
3147 goto loopdone;
3148 case 'n':
3149 ender = '\n';
3150 p++;
3151 break;
3152 case 'r':
3153 ender = '\r';
3154 p++;
3155 break;
3156 case 't':
3157 ender = '\t';
3158 p++;
3159 break;
3160 case 'f':
3161 ender = '\f';
3162 p++;
3163 break;
3164 case 'e':
3165 ender = ASCII_TO_NATIVE('\033');
3166 p++;
3167 break;
3168 case 'a':
3169 ender = ASCII_TO_NATIVE('\007');
3170 p++;
3171 break;
3172 case 'x':
3173 if (*++p == '{') {
3174 char* e = strchr(p, '}');
3175
3176 if (!e) {
3177 RExC_parse = p + 1;
3178 vFAIL("Missing right brace on \\x{}");
3179 }
3180 else {
3181 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3182 | PERL_SCAN_DISALLOW_PREFIX;
3183 numlen = e - p - 1;
3184 ender = grok_hex(p + 1, &numlen, &flags, NULL);
3185 if (ender > 0xff)
3186 RExC_utf8 = 1;
3187 p = e + 1;
3188 }
3189 }
3190 else {
3191 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3192 numlen = 2;
3193 ender = grok_hex(p, &numlen, &flags, NULL);
3194 p += numlen;
3195 }
3196 break;
3197 case 'c':
3198 p++;
3199 ender = UCHARAT(p++);
3200 ender = toCTRL(ender);
3201 break;
3202 case '0': case '1': case '2': case '3':case '4':
3203 case '5': case '6': case '7': case '8':case '9':
3204 if (*p == '0' ||
3205 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
3206 I32 flags = 0;
3207 numlen = 3;
3208 ender = grok_oct(p, &numlen, &flags, NULL);
3209 p += numlen;
3210 }
3211 else {
3212 --p;
3213 goto loopdone;
3214 }
3215 break;
3216 case '\0':
3217 if (p >= RExC_end)
3218 FAIL("Trailing \\");
3219 /* FALL THROUGH */
3220 default:
3221 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
3222 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
3223 goto normal_default;
3224 }
3225 break;
3226 default:
3227 normal_default:
3228 if (UTF8_IS_START(*p) && UTF) {
3229 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
3230 &numlen, 0);
3231 p += numlen;
3232 }
3233 else
3234 ender = *p++;
3235 break;
3236 }
3237 if (RExC_flags & PMf_EXTENDED)
3238 p = regwhite(p, RExC_end);
3239 if (UTF && FOLD) {
3240 /* Prime the casefolded buffer. */
3241 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
3242 }
3243 if (ISMULT2(p)) { /* Back off on ?+*. */
3244 if (len)
3245 p = oldp;
3246 else if (UTF) {
3247 STRLEN unilen;
3248
3249 if (FOLD) {
3250 /* Emit all the Unicode characters. */
3251 for (foldbuf = tmpbuf;
3252 foldlen;
3253 foldlen -= numlen) {
3254 ender = utf8_to_uvchr(foldbuf, &numlen);
3255 if (numlen > 0) {
3256 reguni(pRExC_state, ender, s, &unilen);
3257 s += unilen;
3258 len += unilen;
3259 /* In EBCDIC the numlen
3260 * and unilen can differ. */
3261 foldbuf += numlen;
3262 if (numlen >= foldlen)
3263 break;
3264 }
3265 else
3266 break; /* "Can't happen." */
3267 }
3268 }
3269 else {
3270 reguni(pRExC_state, ender, s, &unilen);
3271 if (unilen > 0) {
3272 s += unilen;
3273 len += unilen;
3274 }
3275 }
3276 }
3277 else {
3278 len++;
3279 REGC((char)ender, s++);
3280 }
3281 break;
3282 }
3283 if (UTF) {
3284 STRLEN unilen;
3285
3286 if (FOLD) {
3287 /* Emit all the Unicode characters. */
3288 for (foldbuf = tmpbuf;
3289 foldlen;
3290 foldlen -= numlen) {
3291 ender = utf8_to_uvchr(foldbuf, &numlen);
3292 if (numlen > 0) {
3293 reguni(pRExC_state, ender, s, &unilen);
3294 len += unilen;
3295 s += unilen;
3296 /* In EBCDIC the numlen
3297 * and unilen can differ. */
3298 foldbuf += numlen;
3299 if (numlen >= foldlen)
3300 break;
3301 }
3302 else
3303 break;
3304 }
3305 }
3306 else {
3307 reguni(pRExC_state, ender, s, &unilen);
3308 if (unilen > 0) {
3309 s += unilen;
3310 len += unilen;
3311 }
3312 }
3313 len--;
3314 }
3315 else
3316 REGC((char)ender, s++);
3317 }
3318 loopdone:
3319 RExC_parse = p - 1;
3320 Set_Node_Cur_Length(ret); /* MJD */
3321 nextchar(pRExC_state);
3322 {
3323 /* len is STRLEN which is unsigned, need to copy to signed */
3324 IV iv = len;
3325 if (iv < 0)
3326 vFAIL("Internal disaster");
3327 }
3328 if (len > 0)
3329 *flagp |= HASWIDTH;
3330 if (len == 1 && UNI_IS_INVARIANT(ender))
3331 *flagp |= SIMPLE;
3332 if (!SIZE_ONLY)
3333 STR_LEN(ret) = len;
3334 if (SIZE_ONLY)
3335 RExC_size += STR_SZ(len);
3336 else
3337 RExC_emit += STR_SZ(len);
3338 }
3339 break;
3340 }
3341
3342 /* If the encoding pragma is in effect recode the text of
3343 * any EXACT-kind nodes. */
3344 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
3345 STRLEN oldlen = STR_LEN(ret);
3346 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3347
3348 if (RExC_utf8)
3349 SvUTF8_on(sv);
3350 if (sv_utf8_downgrade(sv, TRUE)) {
3351 char *s = sv_recode_to_utf8(sv, PL_encoding);
3352 STRLEN newlen = SvCUR(sv);
3353
3354 if (SvUTF8(sv))
3355 RExC_utf8 = 1;
3356 if (!SIZE_ONLY) {
3357 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3358 (int)oldlen, STRING(ret),
3359 (int)newlen, s));
3360 Copy(s, STRING(ret), newlen, char);
3361 STR_LEN(ret) += newlen - oldlen;
3362 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3363 } else
3364 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3365 }
3366 }
3367
3368 return(ret);
3369 }
3370
3371 STATIC char *
S_regwhite(pTHX_ char * p,char * e)3372 S_regwhite(pTHX_ char *p, char *e)
3373 {
3374 while (p < e) {
3375 if (isSPACE(*p))
3376 ++p;
3377 else if (*p == '#') {
3378 do {
3379 p++;
3380 } while (p < e && *p != '\n');
3381 }
3382 else
3383 break;
3384 }
3385 return p;
3386 }
3387
3388 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3389 Character classes ([:foo:]) can also be negated ([:^foo:]).
3390 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3391 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3392 but trigger failures because they are currently unimplemented. */
3393
3394 #define POSIXCC_DONE(c) ((c) == ':')
3395 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3396 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3397
3398 STATIC I32
S_regpposixcc(pTHX_ RExC_state_t * pRExC_state,I32 value)3399 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
3400 {
3401 char *posixcc = 0;
3402 I32 namedclass = OOB_NAMEDCLASS;
3403
3404 if (value == '[' && RExC_parse + 1 < RExC_end &&
3405 /* I smell either [: or [= or [. -- POSIX has been here, right? */
3406 POSIXCC(UCHARAT(RExC_parse))) {
3407 char c = UCHARAT(RExC_parse);
3408 char* s = RExC_parse++;
3409
3410 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
3411 RExC_parse++;
3412 if (RExC_parse == RExC_end)
3413 /* Grandfather lone [:, [=, [. */
3414 RExC_parse = s;
3415 else {
3416 char* t = RExC_parse++; /* skip over the c */
3417
3418 if (UCHARAT(RExC_parse) == ']') {
3419 RExC_parse++; /* skip over the ending ] */
3420 posixcc = s + 1;
3421 if (*s == ':') {
3422 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3423 I32 skip = 5; /* the most common skip */
3424
3425 switch (*posixcc) {
3426 case 'a':
3427 if (strnEQ(posixcc, "alnum", 5))
3428 namedclass =
3429 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3430 else if (strnEQ(posixcc, "alpha", 5))
3431 namedclass =
3432 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3433 else if (strnEQ(posixcc, "ascii", 5))
3434 namedclass =
3435 complement ? ANYOF_NASCII : ANYOF_ASCII;
3436 break;
3437 case 'b':
3438 if (strnEQ(posixcc, "blank", 5))
3439 namedclass =
3440 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3441 break;
3442 case 'c':
3443 if (strnEQ(posixcc, "cntrl", 5))
3444 namedclass =
3445 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3446 break;
3447 case 'd':
3448 if (strnEQ(posixcc, "digit", 5))
3449 namedclass =
3450 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3451 break;
3452 case 'g':
3453 if (strnEQ(posixcc, "graph", 5))
3454 namedclass =
3455 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3456 break;
3457 case 'l':
3458 if (strnEQ(posixcc, "lower", 5))
3459 namedclass =
3460 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3461 break;
3462 case 'p':
3463 if (strnEQ(posixcc, "print", 5))
3464 namedclass =
3465 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3466 else if (strnEQ(posixcc, "punct", 5))
3467 namedclass =
3468 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3469 break;
3470 case 's':
3471 if (strnEQ(posixcc, "space", 5))
3472 namedclass =
3473 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
3474 break;
3475 case 'u':
3476 if (strnEQ(posixcc, "upper", 5))
3477 namedclass =
3478 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3479 break;
3480 case 'w': /* this is not POSIX, this is the Perl \w */
3481 if (strnEQ(posixcc, "word", 4)) {
3482 namedclass =
3483 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3484 skip = 4;
3485 }
3486 break;
3487 case 'x':
3488 if (strnEQ(posixcc, "xdigit", 6)) {
3489 namedclass =
3490 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3491 skip = 6;
3492 }
3493 break;
3494 }
3495 if (namedclass == OOB_NAMEDCLASS ||
3496 posixcc[skip] != ':' ||
3497 posixcc[skip+1] != ']')
3498 {
3499 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3500 t - s - 1, s + 1);
3501 }
3502 } else if (!SIZE_ONLY) {
3503 /* [[=foo=]] and [[.foo.]] are still future. */
3504
3505 /* adjust RExC_parse so the warning shows after
3506 the class closes */
3507 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
3508 RExC_parse++;
3509 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3510 }
3511 } else {
3512 /* Maternal grandfather:
3513 * "[:" ending in ":" but not in ":]" */
3514 RExC_parse = s;
3515 }
3516 }
3517 }
3518
3519 return namedclass;
3520 }
3521
3522 STATIC void
S_checkposixcc(pTHX_ RExC_state_t * pRExC_state)3523 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
3524 {
3525 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
3526 char *s = RExC_parse;
3527 char c = *s++;
3528
3529 while(*s && isALNUM(*s))
3530 s++;
3531 if (*s && c == *s && s[1] == ']') {
3532 if (ckWARN(WARN_REGEXP))
3533 vWARN3(s+2,
3534 "POSIX syntax [%c %c] belongs inside character classes",
3535 c, c);
3536
3537 /* [[=foo=]] and [[.foo.]] are still future. */
3538 if (POSIXCC_NOTYET(c)) {
3539 /* adjust RExC_parse so the error shows after
3540 the class closes */
3541 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3542 ;
3543 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3544 }
3545 }
3546 }
3547 }
3548
3549 STATIC regnode *
S_regclass(pTHX_ RExC_state_t * pRExC_state)3550 S_regclass(pTHX_ RExC_state_t *pRExC_state)
3551 {
3552 register UV value;
3553 register UV nextvalue;
3554 register IV prevvalue = OOB_UNICODE;
3555 register IV range = 0;
3556 register regnode *ret;
3557 STRLEN numlen;
3558 IV namedclass;
3559 char *rangebegin = 0;
3560 bool need_class = 0;
3561 SV *listsv = Nullsv;
3562 register char *e;
3563 UV n;
3564 bool optimize_invert = TRUE;
3565 AV* unicode_alternate = 0;
3566 #ifdef EBCDIC
3567 UV literal_endpoint = 0;
3568 #endif
3569
3570 ret = reganode(pRExC_state, ANYOF, 0);
3571
3572 if (!SIZE_ONLY)
3573 ANYOF_FLAGS(ret) = 0;
3574
3575 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
3576 RExC_naughty++;
3577 RExC_parse++;
3578 if (!SIZE_ONLY)
3579 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3580 }
3581
3582 if (SIZE_ONLY)
3583 RExC_size += ANYOF_SKIP;
3584 else {
3585 RExC_emit += ANYOF_SKIP;
3586 if (FOLD)
3587 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3588 if (LOC)
3589 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3590 ANYOF_BITMAP_ZERO(ret);
3591 listsv = newSVpvn("# comment\n", 10);
3592 }
3593
3594 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3595
3596 if (!SIZE_ONLY && POSIXCC(nextvalue))
3597 checkposixcc(pRExC_state);
3598
3599 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3600 if (UCHARAT(RExC_parse) == ']')
3601 goto charclassloop;
3602
3603 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
3604
3605 charclassloop:
3606
3607 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3608
3609 if (!range)
3610 rangebegin = RExC_parse;
3611 if (UTF) {
3612 value = utf8n_to_uvchr((U8*)RExC_parse,
3613 RExC_end - RExC_parse,
3614 &numlen, 0);
3615 RExC_parse += numlen;
3616 }
3617 else
3618 value = UCHARAT(RExC_parse++);
3619 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3620 if (value == '[' && POSIXCC(nextvalue))
3621 namedclass = regpposixcc(pRExC_state, value);
3622 else if (value == '\\') {
3623 if (UTF) {
3624 value = utf8n_to_uvchr((U8*)RExC_parse,
3625 RExC_end - RExC_parse,
3626 &numlen, 0);
3627 RExC_parse += numlen;
3628 }
3629 else
3630 value = UCHARAT(RExC_parse++);
3631 /* Some compilers cannot handle switching on 64-bit integer
3632 * values, therefore value cannot be an UV. Yes, this will
3633 * be a problem later if we want switch on Unicode.
3634 * A similar issue a little bit later when switching on
3635 * namedclass. --jhi */
3636 switch ((I32)value) {
3637 case 'w': namedclass = ANYOF_ALNUM; break;
3638 case 'W': namedclass = ANYOF_NALNUM; break;
3639 case 's': namedclass = ANYOF_SPACE; break;
3640 case 'S': namedclass = ANYOF_NSPACE; break;
3641 case 'd': namedclass = ANYOF_DIGIT; break;
3642 case 'D': namedclass = ANYOF_NDIGIT; break;
3643 case 'p':
3644 case 'P':
3645 if (RExC_parse >= RExC_end)
3646 vFAIL2("Empty \\%c{}", (U8)value);
3647 if (*RExC_parse == '{') {
3648 U8 c = (U8)value;
3649 e = strchr(RExC_parse++, '}');
3650 if (!e)
3651 vFAIL2("Missing right brace on \\%c{}", c);
3652 while (isSPACE(UCHARAT(RExC_parse)))
3653 RExC_parse++;
3654 if (e == RExC_parse)
3655 vFAIL2("Empty \\%c{}", c);
3656 n = e - RExC_parse;
3657 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3658 n--;
3659 }
3660 else {
3661 e = RExC_parse;
3662 n = 1;
3663 }
3664 if (!SIZE_ONLY) {
3665 if (UCHARAT(RExC_parse) == '^') {
3666 RExC_parse++;
3667 n--;
3668 value = value == 'p' ? 'P' : 'p'; /* toggle */
3669 while (isSPACE(UCHARAT(RExC_parse))) {
3670 RExC_parse++;
3671 n--;
3672 }
3673 }
3674 if (value == 'p')
3675 Perl_sv_catpvf(aTHX_ listsv,
3676 "+utf8::%.*s\n", (int)n, RExC_parse);
3677 else
3678 Perl_sv_catpvf(aTHX_ listsv,
3679 "!utf8::%.*s\n", (int)n, RExC_parse);
3680 }
3681 RExC_parse = e + 1;
3682 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3683 continue;
3684 case 'n': value = '\n'; break;
3685 case 'r': value = '\r'; break;
3686 case 't': value = '\t'; break;
3687 case 'f': value = '\f'; break;
3688 case 'b': value = '\b'; break;
3689 case 'e': value = ASCII_TO_NATIVE('\033');break;
3690 case 'a': value = ASCII_TO_NATIVE('\007');break;
3691 case 'x':
3692 if (*RExC_parse == '{') {
3693 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3694 | PERL_SCAN_DISALLOW_PREFIX;
3695 e = strchr(RExC_parse++, '}');
3696 if (!e)
3697 vFAIL("Missing right brace on \\x{}");
3698
3699 numlen = e - RExC_parse;
3700 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3701 RExC_parse = e + 1;
3702 }
3703 else {
3704 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3705 numlen = 2;
3706 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3707 RExC_parse += numlen;
3708 }
3709 break;
3710 case 'c':
3711 value = UCHARAT(RExC_parse++);
3712 value = toCTRL(value);
3713 break;
3714 case '0': case '1': case '2': case '3': case '4':
3715 case '5': case '6': case '7': case '8': case '9':
3716 {
3717 I32 flags = 0;
3718 numlen = 3;
3719 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
3720 RExC_parse += numlen;
3721 break;
3722 }
3723 default:
3724 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
3725 vWARN2(RExC_parse,
3726 "Unrecognized escape \\%c in character class passed through",
3727 (int)value);
3728 break;
3729 }
3730 } /* end of \blah */
3731 #ifdef EBCDIC
3732 else
3733 literal_endpoint++;
3734 #endif
3735
3736 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3737
3738 if (!SIZE_ONLY && !need_class)
3739 ANYOF_CLASS_ZERO(ret);
3740
3741 need_class = 1;
3742
3743 /* a bad range like a-\d, a-[:digit:] ? */
3744 if (range) {
3745 if (!SIZE_ONLY) {
3746 if (ckWARN(WARN_REGEXP))
3747 vWARN4(RExC_parse,
3748 "False [] range \"%*.*s\"",
3749 RExC_parse - rangebegin,
3750 RExC_parse - rangebegin,
3751 rangebegin);
3752 if (prevvalue < 256) {
3753 ANYOF_BITMAP_SET(ret, prevvalue);
3754 ANYOF_BITMAP_SET(ret, '-');
3755 }
3756 else {
3757 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3758 Perl_sv_catpvf(aTHX_ listsv,
3759 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
3760 }
3761 }
3762
3763 range = 0; /* this was not a true range */
3764 }
3765
3766 if (!SIZE_ONLY) {
3767 if (namedclass > OOB_NAMEDCLASS)
3768 optimize_invert = FALSE;
3769 /* Possible truncation here but in some 64-bit environments
3770 * the compiler gets heartburn about switch on 64-bit values.
3771 * A similar issue a little earlier when switching on value.
3772 * --jhi */
3773 switch ((I32)namedclass) {
3774 case ANYOF_ALNUM:
3775 if (LOC)
3776 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
3777 else {
3778 for (value = 0; value < 256; value++)
3779 if (isALNUM(value))
3780 ANYOF_BITMAP_SET(ret, value);
3781 }
3782 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
3783 break;
3784 case ANYOF_NALNUM:
3785 if (LOC)
3786 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
3787 else {
3788 for (value = 0; value < 256; value++)
3789 if (!isALNUM(value))
3790 ANYOF_BITMAP_SET(ret, value);
3791 }
3792 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
3793 break;
3794 case ANYOF_ALNUMC:
3795 if (LOC)
3796 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
3797 else {
3798 for (value = 0; value < 256; value++)
3799 if (isALNUMC(value))
3800 ANYOF_BITMAP_SET(ret, value);
3801 }
3802 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
3803 break;
3804 case ANYOF_NALNUMC:
3805 if (LOC)
3806 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
3807 else {
3808 for (value = 0; value < 256; value++)
3809 if (!isALNUMC(value))
3810 ANYOF_BITMAP_SET(ret, value);
3811 }
3812 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
3813 break;
3814 case ANYOF_ALPHA:
3815 if (LOC)
3816 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
3817 else {
3818 for (value = 0; value < 256; value++)
3819 if (isALPHA(value))
3820 ANYOF_BITMAP_SET(ret, value);
3821 }
3822 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
3823 break;
3824 case ANYOF_NALPHA:
3825 if (LOC)
3826 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
3827 else {
3828 for (value = 0; value < 256; value++)
3829 if (!isALPHA(value))
3830 ANYOF_BITMAP_SET(ret, value);
3831 }
3832 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
3833 break;
3834 case ANYOF_ASCII:
3835 if (LOC)
3836 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
3837 else {
3838 #ifndef EBCDIC
3839 for (value = 0; value < 128; value++)
3840 ANYOF_BITMAP_SET(ret, value);
3841 #else /* EBCDIC */
3842 for (value = 0; value < 256; value++) {
3843 if (isASCII(value))
3844 ANYOF_BITMAP_SET(ret, value);
3845 }
3846 #endif /* EBCDIC */
3847 }
3848 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
3849 break;
3850 case ANYOF_NASCII:
3851 if (LOC)
3852 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
3853 else {
3854 #ifndef EBCDIC
3855 for (value = 128; value < 256; value++)
3856 ANYOF_BITMAP_SET(ret, value);
3857 #else /* EBCDIC */
3858 for (value = 0; value < 256; value++) {
3859 if (!isASCII(value))
3860 ANYOF_BITMAP_SET(ret, value);
3861 }
3862 #endif /* EBCDIC */
3863 }
3864 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
3865 break;
3866 case ANYOF_BLANK:
3867 if (LOC)
3868 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3869 else {
3870 for (value = 0; value < 256; value++)
3871 if (isBLANK(value))
3872 ANYOF_BITMAP_SET(ret, value);
3873 }
3874 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
3875 break;
3876 case ANYOF_NBLANK:
3877 if (LOC)
3878 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3879 else {
3880 for (value = 0; value < 256; value++)
3881 if (!isBLANK(value))
3882 ANYOF_BITMAP_SET(ret, value);
3883 }
3884 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
3885 break;
3886 case ANYOF_CNTRL:
3887 if (LOC)
3888 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
3889 else {
3890 for (value = 0; value < 256; value++)
3891 if (isCNTRL(value))
3892 ANYOF_BITMAP_SET(ret, value);
3893 }
3894 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
3895 break;
3896 case ANYOF_NCNTRL:
3897 if (LOC)
3898 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
3899 else {
3900 for (value = 0; value < 256; value++)
3901 if (!isCNTRL(value))
3902 ANYOF_BITMAP_SET(ret, value);
3903 }
3904 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3905 break;
3906 case ANYOF_DIGIT:
3907 if (LOC)
3908 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3909 else {
3910 /* consecutive digits assumed */
3911 for (value = '0'; value <= '9'; value++)
3912 ANYOF_BITMAP_SET(ret, value);
3913 }
3914 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3915 break;
3916 case ANYOF_NDIGIT:
3917 if (LOC)
3918 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3919 else {
3920 /* consecutive digits assumed */
3921 for (value = 0; value < '0'; value++)
3922 ANYOF_BITMAP_SET(ret, value);
3923 for (value = '9' + 1; value < 256; value++)
3924 ANYOF_BITMAP_SET(ret, value);
3925 }
3926 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
3927 break;
3928 case ANYOF_GRAPH:
3929 if (LOC)
3930 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
3931 else {
3932 for (value = 0; value < 256; value++)
3933 if (isGRAPH(value))
3934 ANYOF_BITMAP_SET(ret, value);
3935 }
3936 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
3937 break;
3938 case ANYOF_NGRAPH:
3939 if (LOC)
3940 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
3941 else {
3942 for (value = 0; value < 256; value++)
3943 if (!isGRAPH(value))
3944 ANYOF_BITMAP_SET(ret, value);
3945 }
3946 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
3947 break;
3948 case ANYOF_LOWER:
3949 if (LOC)
3950 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
3951 else {
3952 for (value = 0; value < 256; value++)
3953 if (isLOWER(value))
3954 ANYOF_BITMAP_SET(ret, value);
3955 }
3956 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
3957 break;
3958 case ANYOF_NLOWER:
3959 if (LOC)
3960 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
3961 else {
3962 for (value = 0; value < 256; value++)
3963 if (!isLOWER(value))
3964 ANYOF_BITMAP_SET(ret, value);
3965 }
3966 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
3967 break;
3968 case ANYOF_PRINT:
3969 if (LOC)
3970 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
3971 else {
3972 for (value = 0; value < 256; value++)
3973 if (isPRINT(value))
3974 ANYOF_BITMAP_SET(ret, value);
3975 }
3976 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
3977 break;
3978 case ANYOF_NPRINT:
3979 if (LOC)
3980 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
3981 else {
3982 for (value = 0; value < 256; value++)
3983 if (!isPRINT(value))
3984 ANYOF_BITMAP_SET(ret, value);
3985 }
3986 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
3987 break;
3988 case ANYOF_PSXSPC:
3989 if (LOC)
3990 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3991 else {
3992 for (value = 0; value < 256; value++)
3993 if (isPSXSPC(value))
3994 ANYOF_BITMAP_SET(ret, value);
3995 }
3996 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
3997 break;
3998 case ANYOF_NPSXSPC:
3999 if (LOC)
4000 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
4001 else {
4002 for (value = 0; value < 256; value++)
4003 if (!isPSXSPC(value))
4004 ANYOF_BITMAP_SET(ret, value);
4005 }
4006 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
4007 break;
4008 case ANYOF_PUNCT:
4009 if (LOC)
4010 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
4011 else {
4012 for (value = 0; value < 256; value++)
4013 if (isPUNCT(value))
4014 ANYOF_BITMAP_SET(ret, value);
4015 }
4016 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
4017 break;
4018 case ANYOF_NPUNCT:
4019 if (LOC)
4020 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
4021 else {
4022 for (value = 0; value < 256; value++)
4023 if (!isPUNCT(value))
4024 ANYOF_BITMAP_SET(ret, value);
4025 }
4026 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
4027 break;
4028 case ANYOF_SPACE:
4029 if (LOC)
4030 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
4031 else {
4032 for (value = 0; value < 256; value++)
4033 if (isSPACE(value))
4034 ANYOF_BITMAP_SET(ret, value);
4035 }
4036 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
4037 break;
4038 case ANYOF_NSPACE:
4039 if (LOC)
4040 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
4041 else {
4042 for (value = 0; value < 256; value++)
4043 if (!isSPACE(value))
4044 ANYOF_BITMAP_SET(ret, value);
4045 }
4046 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
4047 break;
4048 case ANYOF_UPPER:
4049 if (LOC)
4050 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
4051 else {
4052 for (value = 0; value < 256; value++)
4053 if (isUPPER(value))
4054 ANYOF_BITMAP_SET(ret, value);
4055 }
4056 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
4057 break;
4058 case ANYOF_NUPPER:
4059 if (LOC)
4060 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
4061 else {
4062 for (value = 0; value < 256; value++)
4063 if (!isUPPER(value))
4064 ANYOF_BITMAP_SET(ret, value);
4065 }
4066 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
4067 break;
4068 case ANYOF_XDIGIT:
4069 if (LOC)
4070 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
4071 else {
4072 for (value = 0; value < 256; value++)
4073 if (isXDIGIT(value))
4074 ANYOF_BITMAP_SET(ret, value);
4075 }
4076 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
4077 break;
4078 case ANYOF_NXDIGIT:
4079 if (LOC)
4080 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
4081 else {
4082 for (value = 0; value < 256; value++)
4083 if (!isXDIGIT(value))
4084 ANYOF_BITMAP_SET(ret, value);
4085 }
4086 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
4087 break;
4088 default:
4089 vFAIL("Invalid [::] class");
4090 break;
4091 }
4092 if (LOC)
4093 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
4094 continue;
4095 }
4096 } /* end of namedclass \blah */
4097
4098 if (range) {
4099 if (prevvalue > (IV)value) /* b-a */ {
4100 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
4101 RExC_parse - rangebegin,
4102 RExC_parse - rangebegin,
4103 rangebegin);
4104 range = 0; /* not a valid range */
4105 }
4106 }
4107 else {
4108 prevvalue = value; /* save the beginning of the range */
4109 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4110 RExC_parse[1] != ']') {
4111 RExC_parse++;
4112
4113 /* a bad range like \w-, [:word:]- ? */
4114 if (namedclass > OOB_NAMEDCLASS) {
4115 if (ckWARN(WARN_REGEXP))
4116 vWARN4(RExC_parse,
4117 "False [] range \"%*.*s\"",
4118 RExC_parse - rangebegin,
4119 RExC_parse - rangebegin,
4120 rangebegin);
4121 if (!SIZE_ONLY)
4122 ANYOF_BITMAP_SET(ret, '-');
4123 } else
4124 range = 1; /* yeah, it's a range! */
4125 continue; /* but do it the next time */
4126 }
4127 }
4128
4129 /* now is the next time */
4130 if (!SIZE_ONLY) {
4131 IV i;
4132
4133 if (prevvalue < 256) {
4134 IV ceilvalue = value < 256 ? value : 255;
4135
4136 #ifdef EBCDIC
4137 /* In EBCDIC [\x89-\x91] should include
4138 * the \x8e but [i-j] should not. */
4139 if (literal_endpoint == 2 &&
4140 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4141 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
4142 {
4143 if (isLOWER(prevvalue)) {
4144 for (i = prevvalue; i <= ceilvalue; i++)
4145 if (isLOWER(i))
4146 ANYOF_BITMAP_SET(ret, i);
4147 } else {
4148 for (i = prevvalue; i <= ceilvalue; i++)
4149 if (isUPPER(i))
4150 ANYOF_BITMAP_SET(ret, i);
4151 }
4152 }
4153 else
4154 #endif
4155 for (i = prevvalue; i <= ceilvalue; i++)
4156 ANYOF_BITMAP_SET(ret, i);
4157 }
4158 if (value > 255 || UTF) {
4159 UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
4160 UV natvalue = NATIVE_TO_UNI(value);
4161
4162 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4163 if (prevnatvalue < natvalue) { /* what about > ? */
4164 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
4165 prevnatvalue, natvalue);
4166 }
4167 else if (prevnatvalue == natvalue) {
4168 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
4169 if (FOLD) {
4170 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
4171 STRLEN foldlen;
4172 UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
4173
4174 /* If folding and foldable and a single
4175 * character, insert also the folded version
4176 * to the charclass. */
4177 if (f != value) {
4178 if (foldlen == (STRLEN)UNISKIP(f))
4179 Perl_sv_catpvf(aTHX_ listsv,
4180 "%04"UVxf"\n", f);
4181 else {
4182 /* Any multicharacter foldings
4183 * require the following transform:
4184 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4185 * where E folds into "pq" and F folds
4186 * into "rst", all other characters
4187 * fold to single characters. We save
4188 * away these multicharacter foldings,
4189 * to be later saved as part of the
4190 * additional "s" data. */
4191 SV *sv;
4192
4193 if (!unicode_alternate)
4194 unicode_alternate = newAV();
4195 sv = newSVpvn((char*)foldbuf, foldlen);
4196 SvUTF8_on(sv);
4197 av_push(unicode_alternate, sv);
4198 }
4199 }
4200
4201 /* If folding and the value is one of the Greek
4202 * sigmas insert a few more sigmas to make the
4203 * folding rules of the sigmas to work right.
4204 * Note that not all the possible combinations
4205 * are handled here: some of them are handled
4206 * by the standard folding rules, and some of
4207 * them (literal or EXACTF cases) are handled
4208 * during runtime in regexec.c:S_find_byclass(). */
4209 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4210 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4211 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
4212 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4213 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4214 }
4215 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4216 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4217 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4218 }
4219 }
4220 }
4221 #ifdef EBCDIC
4222 literal_endpoint = 0;
4223 #endif
4224 }
4225
4226 range = 0; /* this range (if it was one) is done now */
4227 }
4228
4229 if (need_class) {
4230 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
4231 if (SIZE_ONLY)
4232 RExC_size += ANYOF_CLASS_ADD_SKIP;
4233 else
4234 RExC_emit += ANYOF_CLASS_ADD_SKIP;
4235 }
4236
4237 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
4238 if (!SIZE_ONLY &&
4239 /* If the only flag is folding (plus possibly inversion). */
4240 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4241 ) {
4242 for (value = 0; value < 256; ++value) {
4243 if (ANYOF_BITMAP_TEST(ret, value)) {
4244 UV fold = PL_fold[value];
4245
4246 if (fold != value)
4247 ANYOF_BITMAP_SET(ret, fold);
4248 }
4249 }
4250 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
4251 }
4252
4253 /* optimize inverted simple patterns (e.g. [^a-z]) */
4254 if (!SIZE_ONLY && optimize_invert &&
4255 /* If the only flag is inversion. */
4256 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
4257 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
4258 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
4259 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
4260 }
4261
4262 if (!SIZE_ONLY) {
4263 AV *av = newAV();
4264 SV *rv;
4265
4266 /* The 0th element stores the character class description
4267 * in its textual form: used later (regexec.c:Perl_regclass_swash())
4268 * to initialize the appropriate swash (which gets stored in
4269 * the 1st element), and also useful for dumping the regnode.
4270 * The 2nd element stores the multicharacter foldings,
4271 * used later (regexec.c:S_reginclass()). */
4272 av_store(av, 0, listsv);
4273 av_store(av, 1, NULL);
4274 av_store(av, 2, (SV*)unicode_alternate);
4275 rv = newRV_noinc((SV*)av);
4276 n = add_data(pRExC_state, 1, "s");
4277 RExC_rx->data->data[n] = (void*)rv;
4278 ARG_SET(ret, n);
4279 }
4280
4281 return ret;
4282 }
4283
4284 STATIC char*
S_nextchar(pTHX_ RExC_state_t * pRExC_state)4285 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
4286 {
4287 char* retval = RExC_parse++;
4288
4289 for (;;) {
4290 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4291 RExC_parse[2] == '#') {
4292 while (*RExC_parse != ')') {
4293 if (RExC_parse == RExC_end)
4294 FAIL("Sequence (?#... not terminated");
4295 RExC_parse++;
4296 }
4297 RExC_parse++;
4298 continue;
4299 }
4300 if (RExC_flags & PMf_EXTENDED) {
4301 if (isSPACE(*RExC_parse)) {
4302 RExC_parse++;
4303 continue;
4304 }
4305 else if (*RExC_parse == '#') {
4306 while (RExC_parse < RExC_end)
4307 if (*RExC_parse++ == '\n') break;
4308 continue;
4309 }
4310 }
4311 return retval;
4312 }
4313 }
4314
4315 /*
4316 - reg_node - emit a node
4317 */
4318 STATIC regnode * /* Location. */
S_reg_node(pTHX_ RExC_state_t * pRExC_state,U8 op)4319 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
4320 {
4321 register regnode *ret;
4322 register regnode *ptr;
4323
4324 ret = RExC_emit;
4325 if (SIZE_ONLY) {
4326 SIZE_ALIGN(RExC_size);
4327 RExC_size += 1;
4328 return(ret);
4329 }
4330
4331 NODE_ALIGN_FILL(ret);
4332 ptr = ret;
4333 FILL_ADVANCE_NODE(ptr, op);
4334 if (RExC_offsets) { /* MJD */
4335 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
4336 "reg_node", __LINE__,
4337 reg_name[op],
4338 RExC_emit - RExC_emit_start > RExC_offsets[0]
4339 ? "Overwriting end of array!\n" : "OK",
4340 RExC_emit - RExC_emit_start,
4341 RExC_parse - RExC_start,
4342 RExC_offsets[0]));
4343 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4344 }
4345
4346 RExC_emit = ptr;
4347
4348 return(ret);
4349 }
4350
4351 /*
4352 - reganode - emit a node with an argument
4353 */
4354 STATIC regnode * /* Location. */
S_reganode(pTHX_ RExC_state_t * pRExC_state,U8 op,U32 arg)4355 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
4356 {
4357 register regnode *ret;
4358 register regnode *ptr;
4359
4360 ret = RExC_emit;
4361 if (SIZE_ONLY) {
4362 SIZE_ALIGN(RExC_size);
4363 RExC_size += 2;
4364 return(ret);
4365 }
4366
4367 NODE_ALIGN_FILL(ret);
4368 ptr = ret;
4369 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
4370 if (RExC_offsets) { /* MJD */
4371 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
4372 "reganode",
4373 __LINE__,
4374 reg_name[op],
4375 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
4376 "Overwriting end of array!\n" : "OK",
4377 RExC_emit - RExC_emit_start,
4378 RExC_parse - RExC_start,
4379 RExC_offsets[0]));
4380 Set_Cur_Node_Offset;
4381 }
4382
4383 RExC_emit = ptr;
4384
4385 return(ret);
4386 }
4387
4388 /*
4389 - reguni - emit (if appropriate) a Unicode character
4390 */
4391 STATIC void
S_reguni(pTHX_ RExC_state_t * pRExC_state,UV uv,char * s,STRLEN * lenp)4392 S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
4393 {
4394 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
4395 }
4396
4397 /*
4398 - reginsert - insert an operator in front of already-emitted operand
4399 *
4400 * Means relocating the operand.
4401 */
4402 STATIC void
S_reginsert(pTHX_ RExC_state_t * pRExC_state,U8 op,regnode * opnd)4403 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
4404 {
4405 register regnode *src;
4406 register regnode *dst;
4407 register regnode *place;
4408 register int offset = regarglen[(U8)op];
4409
4410 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
4411
4412 if (SIZE_ONLY) {
4413 RExC_size += NODE_STEP_REGNODE + offset;
4414 return;
4415 }
4416
4417 src = RExC_emit;
4418 RExC_emit += NODE_STEP_REGNODE + offset;
4419 dst = RExC_emit;
4420 while (src > opnd) {
4421 StructCopy(--src, --dst, regnode);
4422 if (RExC_offsets) { /* MJD 20010112 */
4423 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
4424 "reg_insert",
4425 __LINE__,
4426 reg_name[op],
4427 dst - RExC_emit_start > RExC_offsets[0]
4428 ? "Overwriting end of array!\n" : "OK",
4429 src - RExC_emit_start,
4430 dst - RExC_emit_start,
4431 RExC_offsets[0]));
4432 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4433 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4434 }
4435 }
4436
4437
4438 place = opnd; /* Op node, where operand used to be. */
4439 if (RExC_offsets) { /* MJD */
4440 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
4441 "reginsert",
4442 __LINE__,
4443 reg_name[op],
4444 place - RExC_emit_start > RExC_offsets[0]
4445 ? "Overwriting end of array!\n" : "OK",
4446 place - RExC_emit_start,
4447 RExC_parse - RExC_start,
4448 RExC_offsets[0]));
4449 Set_Node_Offset(place, RExC_parse);
4450 Set_Node_Length(place, 1);
4451 }
4452 src = NEXTOPER(place);
4453 FILL_ADVANCE_NODE(place, op);
4454 Zero(src, offset, regnode);
4455 }
4456
4457 /*
4458 - regtail - set the next-pointer at the end of a node chain of p to val.
4459 */
4460 STATIC void
S_regtail(pTHX_ RExC_state_t * pRExC_state,regnode * p,regnode * val)4461 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4462 {
4463 register regnode *scan;
4464 register regnode *temp;
4465
4466 if (SIZE_ONLY)
4467 return;
4468
4469 /* Find last node. */
4470 scan = p;
4471 for (;;) {
4472 temp = regnext(scan);
4473 if (temp == NULL)
4474 break;
4475 scan = temp;
4476 }
4477
4478 if (reg_off_by_arg[OP(scan)]) {
4479 ARG_SET(scan, val - scan);
4480 }
4481 else {
4482 NEXT_OFF(scan) = val - scan;
4483 }
4484 }
4485
4486 /*
4487 - regoptail - regtail on operand of first argument; nop if operandless
4488 */
4489 STATIC void
S_regoptail(pTHX_ RExC_state_t * pRExC_state,regnode * p,regnode * val)4490 S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4491 {
4492 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
4493 if (p == NULL || SIZE_ONLY)
4494 return;
4495 if (PL_regkind[(U8)OP(p)] == BRANCH) {
4496 regtail(pRExC_state, NEXTOPER(p), val);
4497 }
4498 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
4499 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
4500 }
4501 else
4502 return;
4503 }
4504
4505 /*
4506 - regcurly - a little FSA that accepts {\d+,?\d*}
4507 */
4508 STATIC I32
S_regcurly(pTHX_ register char * s)4509 S_regcurly(pTHX_ register char *s)
4510 {
4511 if (*s++ != '{')
4512 return FALSE;
4513 if (!isDIGIT(*s))
4514 return FALSE;
4515 while (isDIGIT(*s))
4516 s++;
4517 if (*s == ',')
4518 s++;
4519 while (isDIGIT(*s))
4520 s++;
4521 if (*s != '}')
4522 return FALSE;
4523 return TRUE;
4524 }
4525
4526
4527 #ifdef DEBUGGING
4528
4529 STATIC regnode *
S_dumpuntil(pTHX_ regnode * start,regnode * node,regnode * last,SV * sv,I32 l)4530 S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
4531 {
4532 register U8 op = EXACT; /* Arbitrary non-END op. */
4533 register regnode *next;
4534
4535 while (op != END && (!last || node < last)) {
4536 /* While that wasn't END last time... */
4537
4538 NODE_ALIGN(node);
4539 op = OP(node);
4540 if (op == CLOSE)
4541 l--;
4542 next = regnext(node);
4543 /* Where, what. */
4544 if (OP(node) == OPTIMIZED)
4545 goto after_print;
4546 regprop(sv, node);
4547 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
4548 (int)(2*l + 1), "", SvPVX(sv));
4549 if (next == NULL) /* Next ptr. */
4550 PerlIO_printf(Perl_debug_log, "(0)");
4551 else
4552 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
4553 (void)PerlIO_putc(Perl_debug_log, '\n');
4554 after_print:
4555 if (PL_regkind[(U8)op] == BRANCHJ) {
4556 register regnode *nnode = (OP(next) == LONGJMP
4557 ? regnext(next)
4558 : next);
4559 if (last && nnode > last)
4560 nnode = last;
4561 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
4562 }
4563 else if (PL_regkind[(U8)op] == BRANCH) {
4564 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
4565 }
4566 else if ( op == CURLY) { /* `next' might be very big: optimizer */
4567 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4568 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
4569 }
4570 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
4571 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4572 next, sv, l + 1);
4573 }
4574 else if ( op == PLUS || op == STAR) {
4575 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
4576 }
4577 else if (op == ANYOF) {
4578 /* arglen 1 + class block */
4579 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4580 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4581 node = NEXTOPER(node);
4582 }
4583 else if (PL_regkind[(U8)op] == EXACT) {
4584 /* Literal string, where present. */
4585 node += NODE_SZ_STR(node) - 1;
4586 node = NEXTOPER(node);
4587 }
4588 else {
4589 node = NEXTOPER(node);
4590 node += regarglen[(U8)op];
4591 }
4592 if (op == CURLYX || op == OPEN)
4593 l++;
4594 else if (op == WHILEM)
4595 l--;
4596 }
4597 return node;
4598 }
4599
4600 #endif /* DEBUGGING */
4601
4602 /*
4603 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
4604 */
4605 void
Perl_regdump(pTHX_ regexp * r)4606 Perl_regdump(pTHX_ regexp *r)
4607 {
4608 #ifdef DEBUGGING
4609 SV *sv = sv_newmortal();
4610
4611 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
4612
4613 /* Header fields of interest. */
4614 if (r->anchored_substr)
4615 PerlIO_printf(Perl_debug_log,
4616 "anchored `%s%.*s%s'%s at %"IVdf" ",
4617 PL_colors[0],
4618 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
4619 SvPVX(r->anchored_substr),
4620 PL_colors[1],
4621 SvTAIL(r->anchored_substr) ? "$" : "",
4622 (IV)r->anchored_offset);
4623 else if (r->anchored_utf8)
4624 PerlIO_printf(Perl_debug_log,
4625 "anchored utf8 `%s%.*s%s'%s at %"IVdf" ",
4626 PL_colors[0],
4627 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
4628 SvPVX(r->anchored_utf8),
4629 PL_colors[1],
4630 SvTAIL(r->anchored_utf8) ? "$" : "",
4631 (IV)r->anchored_offset);
4632 if (r->float_substr)
4633 PerlIO_printf(Perl_debug_log,
4634 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4635 PL_colors[0],
4636 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
4637 SvPVX(r->float_substr),
4638 PL_colors[1],
4639 SvTAIL(r->float_substr) ? "$" : "",
4640 (IV)r->float_min_offset, (UV)r->float_max_offset);
4641 else if (r->float_utf8)
4642 PerlIO_printf(Perl_debug_log,
4643 "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4644 PL_colors[0],
4645 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
4646 SvPVX(r->float_utf8),
4647 PL_colors[1],
4648 SvTAIL(r->float_utf8) ? "$" : "",
4649 (IV)r->float_min_offset, (UV)r->float_max_offset);
4650 if (r->check_substr || r->check_utf8)
4651 PerlIO_printf(Perl_debug_log,
4652 r->check_substr == r->float_substr
4653 && r->check_utf8 == r->float_utf8
4654 ? "(checking floating" : "(checking anchored");
4655 if (r->reganch & ROPT_NOSCAN)
4656 PerlIO_printf(Perl_debug_log, " noscan");
4657 if (r->reganch & ROPT_CHECK_ALL)
4658 PerlIO_printf(Perl_debug_log, " isall");
4659 if (r->check_substr || r->check_utf8)
4660 PerlIO_printf(Perl_debug_log, ") ");
4661
4662 if (r->regstclass) {
4663 regprop(sv, r->regstclass);
4664 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4665 }
4666 if (r->reganch & ROPT_ANCH) {
4667 PerlIO_printf(Perl_debug_log, "anchored");
4668 if (r->reganch & ROPT_ANCH_BOL)
4669 PerlIO_printf(Perl_debug_log, "(BOL)");
4670 if (r->reganch & ROPT_ANCH_MBOL)
4671 PerlIO_printf(Perl_debug_log, "(MBOL)");
4672 if (r->reganch & ROPT_ANCH_SBOL)
4673 PerlIO_printf(Perl_debug_log, "(SBOL)");
4674 if (r->reganch & ROPT_ANCH_GPOS)
4675 PerlIO_printf(Perl_debug_log, "(GPOS)");
4676 PerlIO_putc(Perl_debug_log, ' ');
4677 }
4678 if (r->reganch & ROPT_GPOS_SEEN)
4679 PerlIO_printf(Perl_debug_log, "GPOS ");
4680 if (r->reganch & ROPT_SKIP)
4681 PerlIO_printf(Perl_debug_log, "plus ");
4682 if (r->reganch & ROPT_IMPLICIT)
4683 PerlIO_printf(Perl_debug_log, "implicit ");
4684 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
4685 if (r->reganch & ROPT_EVAL_SEEN)
4686 PerlIO_printf(Perl_debug_log, "with eval ");
4687 PerlIO_printf(Perl_debug_log, "\n");
4688 if (r->offsets) {
4689 U32 i;
4690 U32 len = r->offsets[0];
4691 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4692 for (i = 1; i <= len; i++)
4693 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
4694 (UV)r->offsets[i*2-1],
4695 (UV)r->offsets[i*2]);
4696 PerlIO_printf(Perl_debug_log, "\n");
4697 }
4698 #endif /* DEBUGGING */
4699 }
4700
4701 #ifdef DEBUGGING
4702
4703 STATIC void
S_put_byte(pTHX_ SV * sv,int c)4704 S_put_byte(pTHX_ SV *sv, int c)
4705 {
4706 if (isCNTRL(c) || c == 255 || !isPRINT(c))
4707 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4708 else if (c == '-' || c == ']' || c == '\\' || c == '^')
4709 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4710 else
4711 Perl_sv_catpvf(aTHX_ sv, "%c", c);
4712 }
4713
4714 #endif /* DEBUGGING */
4715
4716 /*
4717 - regprop - printable representation of opcode
4718 */
4719 void
Perl_regprop(pTHX_ SV * sv,regnode * o)4720 Perl_regprop(pTHX_ SV *sv, regnode *o)
4721 {
4722 #ifdef DEBUGGING
4723 register int k;
4724
4725 sv_setpvn(sv, "", 0);
4726 if (OP(o) >= reg_num) /* regnode.type is unsigned */
4727 /* It would be nice to FAIL() here, but this may be called from
4728 regexec.c, and it would be hard to supply pRExC_state. */
4729 Perl_croak(aTHX_ "Corrupted regexp opcode");
4730 sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4731
4732 k = PL_regkind[(U8)OP(o)];
4733
4734 if (k == EXACT) {
4735 SV *dsv = sv_2mortal(newSVpvn("", 0));
4736 /* Using is_utf8_string() is a crude hack but it may
4737 * be the best for now since we have no flag "this EXACTish
4738 * node was UTF-8" --jhi */
4739 bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
4740 char *s = do_utf8 ?
4741 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4742 UNI_DISPLAY_REGEX) :
4743 STRING(o);
4744 int len = do_utf8 ?
4745 strlen(s) :
4746 STR_LEN(o);
4747 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4748 PL_colors[0],
4749 len, s,
4750 PL_colors[1]);
4751 }
4752 else if (k == CURLY) {
4753 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
4754 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4755 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
4756 }
4757 else if (k == WHILEM && o->flags) /* Ordinal/of */
4758 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
4759 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
4760 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
4761 else if (k == LOGICAL)
4762 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
4763 else if (k == ANYOF) {
4764 int i, rangestart = -1;
4765 U8 flags = ANYOF_FLAGS(o);
4766 const char * const anyofs[] = { /* Should be synchronized with
4767 * ANYOF_ #xdefines in regcomp.h */
4768 "\\w",
4769 "\\W",
4770 "\\s",
4771 "\\S",
4772 "\\d",
4773 "\\D",
4774 "[:alnum:]",
4775 "[:^alnum:]",
4776 "[:alpha:]",
4777 "[:^alpha:]",
4778 "[:ascii:]",
4779 "[:^ascii:]",
4780 "[:ctrl:]",
4781 "[:^ctrl:]",
4782 "[:graph:]",
4783 "[:^graph:]",
4784 "[:lower:]",
4785 "[:^lower:]",
4786 "[:print:]",
4787 "[:^print:]",
4788 "[:punct:]",
4789 "[:^punct:]",
4790 "[:upper:]",
4791 "[:^upper:]",
4792 "[:xdigit:]",
4793 "[:^xdigit:]",
4794 "[:space:]",
4795 "[:^space:]",
4796 "[:blank:]",
4797 "[:^blank:]"
4798 };
4799
4800 if (flags & ANYOF_LOCALE)
4801 sv_catpv(sv, "{loc}");
4802 if (flags & ANYOF_FOLD)
4803 sv_catpv(sv, "{i}");
4804 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
4805 if (flags & ANYOF_INVERT)
4806 sv_catpv(sv, "^");
4807 for (i = 0; i <= 256; i++) {
4808 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4809 if (rangestart == -1)
4810 rangestart = i;
4811 } else if (rangestart != -1) {
4812 if (i <= rangestart + 3)
4813 for (; rangestart < i; rangestart++)
4814 put_byte(sv, rangestart);
4815 else {
4816 put_byte(sv, rangestart);
4817 sv_catpv(sv, "-");
4818 put_byte(sv, i - 1);
4819 }
4820 rangestart = -1;
4821 }
4822 }
4823
4824 if (o->flags & ANYOF_CLASS)
4825 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4826 if (ANYOF_CLASS_TEST(o,i))
4827 sv_catpv(sv, anyofs[i]);
4828
4829 if (flags & ANYOF_UNICODE)
4830 sv_catpv(sv, "{unicode}");
4831 else if (flags & ANYOF_UNICODE_ALL)
4832 sv_catpv(sv, "{unicode_all}");
4833
4834 {
4835 SV *lv;
4836 SV *sw = regclass_swash(o, FALSE, &lv, 0);
4837
4838 if (lv) {
4839 if (sw) {
4840 U8 s[UTF8_MAXLEN+1];
4841
4842 for (i = 0; i <= 256; i++) { /* just the first 256 */
4843 U8 *e = uvchr_to_utf8(s, i);
4844
4845 if (i < 256 && swash_fetch(sw, s, TRUE)) {
4846 if (rangestart == -1)
4847 rangestart = i;
4848 } else if (rangestart != -1) {
4849 U8 *p;
4850
4851 if (i <= rangestart + 3)
4852 for (; rangestart < i; rangestart++) {
4853 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4854 put_byte(sv, *p);
4855 }
4856 else {
4857 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4858 put_byte(sv, *p);
4859 sv_catpv(sv, "-");
4860 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
4861 put_byte(sv, *p);
4862 }
4863 rangestart = -1;
4864 }
4865 }
4866
4867 sv_catpv(sv, "..."); /* et cetera */
4868 }
4869
4870 {
4871 char *s = savepv(SvPVX(lv));
4872 char *origs = s;
4873
4874 while(*s && *s != '\n') s++;
4875
4876 if (*s == '\n') {
4877 char *t = ++s;
4878
4879 while (*s) {
4880 if (*s == '\n')
4881 *s = ' ';
4882 s++;
4883 }
4884 if (s[-1] == ' ')
4885 s[-1] = 0;
4886
4887 sv_catpv(sv, t);
4888 }
4889
4890 Safefree(origs);
4891 }
4892 }
4893 }
4894
4895 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4896 }
4897 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
4898 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
4899 #endif /* DEBUGGING */
4900 }
4901
4902 SV *
Perl_re_intuit_string(pTHX_ regexp * prog)4903 Perl_re_intuit_string(pTHX_ regexp *prog)
4904 { /* Assume that RE_INTUIT is set */
4905 DEBUG_r(
4906 { STRLEN n_a;
4907 char *s = SvPV(prog->check_substr
4908 ? prog->check_substr : prog->check_utf8, n_a);
4909
4910 if (!PL_colorset) reginitcolors();
4911 PerlIO_printf(Perl_debug_log,
4912 "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n",
4913 PL_colors[4],
4914 prog->check_substr ? "" : "utf8 ",
4915 PL_colors[5],PL_colors[0],
4916 s,
4917 PL_colors[1],
4918 (strlen(s) > 60 ? "..." : ""));
4919 } );
4920
4921 return prog->check_substr ? prog->check_substr : prog->check_utf8;
4922 }
4923
4924 void
Perl_pregfree(pTHX_ struct regexp * r)4925 Perl_pregfree(pTHX_ struct regexp *r)
4926 {
4927 #ifdef DEBUGGING
4928 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
4929 #endif
4930
4931 if (!r || (--r->refcnt > 0))
4932 return;
4933 DEBUG_r({
4934 int len;
4935 char *s;
4936
4937 s = (r->reganch & ROPT_UTF8) ? pv_uni_display(dsv, (U8*)r->precomp,
4938 r->prelen, 60, UNI_DISPLAY_REGEX)
4939 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
4940 len = SvCUR(dsv);
4941 if (!PL_colorset)
4942 reginitcolors();
4943 PerlIO_printf(Perl_debug_log,
4944 "%sFreeing REx:%s `%s%*.*s%s%s'\n",
4945 PL_colors[4],PL_colors[5],PL_colors[0],
4946 len, len, s,
4947 PL_colors[1],
4948 len > 60 ? "..." : "");
4949 });
4950
4951 if (r->precomp)
4952 Safefree(r->precomp);
4953 if (r->offsets) /* 20010421 MJD */
4954 Safefree(r->offsets);
4955 if (RX_MATCH_COPIED(r))
4956 Safefree(r->subbeg);
4957 if (r->substrs) {
4958 if (r->anchored_substr)
4959 SvREFCNT_dec(r->anchored_substr);
4960 if (r->anchored_utf8)
4961 SvREFCNT_dec(r->anchored_utf8);
4962 if (r->float_substr)
4963 SvREFCNT_dec(r->float_substr);
4964 if (r->float_utf8)
4965 SvREFCNT_dec(r->float_utf8);
4966 Safefree(r->substrs);
4967 }
4968 if (r->data) {
4969 int n = r->data->count;
4970 PAD* new_comppad = NULL;
4971 PAD* old_comppad;
4972
4973 while (--n >= 0) {
4974 /* If you add a ->what type here, update the comment in regcomp.h */
4975 switch (r->data->what[n]) {
4976 case 's':
4977 SvREFCNT_dec((SV*)r->data->data[n]);
4978 break;
4979 case 'f':
4980 Safefree(r->data->data[n]);
4981 break;
4982 case 'p':
4983 new_comppad = (AV*)r->data->data[n];
4984 break;
4985 case 'o':
4986 if (new_comppad == NULL)
4987 Perl_croak(aTHX_ "panic: pregfree comppad");
4988 PAD_SAVE_LOCAL(old_comppad,
4989 /* Watch out for global destruction's random ordering. */
4990 (SvTYPE(new_comppad) == SVt_PVAV) ?
4991 new_comppad : Null(PAD *)
4992 );
4993 if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) {
4994 op_free((OP_4tree*)r->data->data[n]);
4995 }
4996
4997 PAD_RESTORE_LOCAL(old_comppad);
4998 SvREFCNT_dec((SV*)new_comppad);
4999 new_comppad = NULL;
5000 break;
5001 case 'n':
5002 break;
5003 default:
5004 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
5005 }
5006 }
5007 Safefree(r->data->what);
5008 Safefree(r->data);
5009 }
5010 Safefree(r->startp);
5011 Safefree(r->endp);
5012 Safefree(r);
5013 }
5014
5015 /*
5016 - regnext - dig the "next" pointer out of a node
5017 *
5018 * [Note, when REGALIGN is defined there are two places in regmatch()
5019 * that bypass this code for speed.]
5020 */
5021 regnode *
Perl_regnext(pTHX_ register regnode * p)5022 Perl_regnext(pTHX_ register regnode *p)
5023 {
5024 register I32 offset;
5025
5026 if (p == &PL_regdummy)
5027 return(NULL);
5028
5029 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
5030 if (offset == 0)
5031 return(NULL);
5032
5033 return(p+offset);
5034 }
5035
5036 STATIC void
S_re_croak2(pTHX_ const char * pat1,const char * pat2,...)5037 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
5038 {
5039 va_list args;
5040 STRLEN l1 = strlen(pat1);
5041 STRLEN l2 = strlen(pat2);
5042 char buf[512];
5043 SV *msv;
5044 char *message;
5045
5046 if (l1 > 510)
5047 l1 = 510;
5048 if (l1 + l2 > 510)
5049 l2 = 510 - l1;
5050 Copy(pat1, buf, l1 , char);
5051 Copy(pat2, buf + l1, l2 , char);
5052 buf[l1 + l2] = '\n';
5053 buf[l1 + l2 + 1] = '\0';
5054 #ifdef I_STDARG
5055 /* ANSI variant takes additional second argument */
5056 va_start(args, pat2);
5057 #else
5058 va_start(args);
5059 #endif
5060 msv = vmess(buf, &args);
5061 va_end(args);
5062 message = SvPV(msv,l1);
5063 if (l1 > 512)
5064 l1 = 512;
5065 Copy(message, buf, l1 , char);
5066 buf[l1-1] = '\0'; /* Overwrite \n */
5067 Perl_croak(aTHX_ "%s", buf);
5068 }
5069
5070 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
5071
5072 void
Perl_save_re_context(pTHX)5073 Perl_save_re_context(pTHX)
5074 {
5075 SAVEI32(PL_reg_flags); /* from regexec.c */
5076 SAVEPPTR(PL_bostr);
5077 SAVEPPTR(PL_reginput); /* String-input pointer. */
5078 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
5079 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
5080 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
5081 SAVEVPTR(PL_regendp); /* Ditto for endp. */
5082 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
5083 SAVEVPTR(PL_reglastcloseparen); /* Similarly for lastcloseparen. */
5084 SAVEPPTR(PL_regtill); /* How far we are required to go. */
5085 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
5086 PL_reg_start_tmp = 0;
5087 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
5088 PL_reg_start_tmpl = 0;
5089 SAVEVPTR(PL_regdata);
5090 SAVEI32(PL_reg_eval_set); /* from regexec.c */
5091 SAVEI32(PL_regnarrate); /* from regexec.c */
5092 SAVEVPTR(PL_regprogram); /* from regexec.c */
5093 SAVEINT(PL_regindent); /* from regexec.c */
5094 SAVEVPTR(PL_regcc); /* from regexec.c */
5095 SAVEVPTR(PL_curcop);
5096 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
5097 SAVEVPTR(PL_reg_re); /* from regexec.c */
5098 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
5099 SAVESPTR(PL_reg_sv); /* from regexec.c */
5100 SAVEBOOL(PL_reg_match_utf8); /* from regexec.c */
5101 SAVEVPTR(PL_reg_magic); /* from regexec.c */
5102 SAVEI32(PL_reg_oldpos); /* from regexec.c */
5103 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
5104 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
5105 SAVEPPTR(PL_reg_oldsaved); /* old saved substr during match */
5106 PL_reg_oldsaved = Nullch;
5107 SAVEI32(PL_reg_oldsavedlen); /* old length of saved substr during match */
5108 PL_reg_oldsavedlen = 0;
5109 SAVEI32(PL_reg_maxiter); /* max wait until caching pos */
5110 PL_reg_maxiter = 0;
5111 SAVEI32(PL_reg_leftiter); /* wait until caching pos */
5112 PL_reg_leftiter = 0;
5113 SAVEGENERICPV(PL_reg_poscache); /* cache of pos of WHILEM */
5114 PL_reg_poscache = Nullch;
5115 SAVEI32(PL_reg_poscache_size); /* size of pos cache of WHILEM */
5116 PL_reg_poscache_size = 0;
5117 SAVEPPTR(PL_regprecomp); /* uncompiled string. */
5118 SAVEI32(PL_regnpar); /* () count. */
5119 SAVEI32(PL_regsize); /* from regexec.c */
5120
5121 {
5122 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
5123 U32 i;
5124 GV *mgv;
5125 REGEXP *rx;
5126 char digits[16];
5127
5128 if (PL_curpm && (rx = PM_GETRE(PL_curpm))) {
5129 for (i = 1; i <= rx->nparens; i++) {
5130 sprintf(digits, "%lu", (long)i);
5131 if ((mgv = gv_fetchpv(digits, FALSE, SVt_PV)))
5132 save_scalar(mgv);
5133 }
5134 }
5135 }
5136
5137 #ifdef DEBUGGING
5138 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
5139 #endif
5140 }
5141
5142 static void
clear_re(pTHX_ void * r)5143 clear_re(pTHX_ void *r)
5144 {
5145 ReREFCNT_dec((regexp *)r);
5146 }
5147
5148