1 /* regcomp.c 2 */ 3 4 /* 5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee 6 * 7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"] 8 */ 9 10 /* This file contains functions for compiling a regular expression. See 11 * also regexec.c which funnily enough, contains functions for executing 12 * a regular expression. 13 * 14 * This file is also copied at build time to ext/re/re_comp.c, where 15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT. 16 * This causes the main functions to be compiled under new names and with 17 * debugging support added, which makes "use re 'debug'" work. 18 */ 19 20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not 21 * confused with the original package (see point 3 below). Thanks, Henry! 22 */ 23 24 /* Additional note: this code is very heavily munged from Henry's version 25 * in places. In some spots I've traded clarity for efficiency, so don't 26 * blame Henry for some of the lack of readability. 27 */ 28 29 /* The names of the functions have been changed from regcomp and 30 * regexec to pregcomp and pregexec in order to avoid conflicts 31 * with the POSIX routines of the same names. 32 */ 33 34 #ifdef PERL_EXT_RE_BUILD 35 #include "re_top.h" 36 #endif 37 38 /* 39 * pregcomp and pregexec -- regsub and regerror are not used in perl 40 * 41 * Copyright (c) 1986 by University of Toronto. 42 * Written by Henry Spencer. Not derived from licensed software. 43 * 44 * Permission is granted to anyone to use this software for any 45 * purpose on any computer system, and to redistribute it freely, 46 * subject to the following restrictions: 47 * 48 * 1. The author is not responsible for the consequences of use of 49 * this software, no matter how awful, even if they arise 50 * from defects in it. 51 * 52 * 2. The origin of this software must not be misrepresented, either 53 * by explicit claim or by omission. 54 * 55 * 3. Altered versions must be plainly marked as such, and must not 56 * be misrepresented as being the original software. 57 * 58 * 59 **** Alterations to Henry's code are... 60 **** 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 63 **** by Larry Wall and others 64 **** 65 **** You may distribute under the terms of either the GNU General Public 66 **** License or the Artistic License, as specified in the README file. 67 68 * 69 * Beware that some of this code is subtly aware of the way operator 70 * precedence is structured in regular expressions. Serious changes in 71 * regular-expression syntax might require a total rethink. 72 */ 73 #include "EXTERN.h" 74 #define PERL_IN_REGCOMP_C 75 #include "perl.h" 76 77 #ifndef PERL_IN_XSUB_RE 78 # include "INTERN.h" 79 #endif 80 81 #define REG_COMP_C 82 #ifdef PERL_IN_XSUB_RE 83 # include "re_comp.h" 84 EXTERN_C const struct regexp_engine my_reg_engine; 85 #else 86 # include "regcomp.h" 87 #endif 88 89 #include "dquote_inline.h" 90 #include "invlist_inline.h" 91 #include "unicode_constants.h" 92 93 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \ 94 _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) 95 #define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \ 96 _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) 97 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) 98 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) 99 100 #ifndef STATIC 101 #define STATIC static 102 #endif 103 104 #ifndef MIN 105 #define MIN(a,b) ((a) < (b) ? (a) : (b)) 106 #endif 107 108 #ifndef MAX 109 #define MAX(a,b) ((a) > (b) ? (a) : (b)) 110 #endif 111 112 /* this is a chain of data about sub patterns we are processing that 113 need to be handled separately/specially in study_chunk. Its so 114 we can simulate recursion without losing state. */ 115 struct scan_frame; 116 typedef struct scan_frame { 117 regnode *last_regnode; /* last node to process in this frame */ 118 regnode *next_regnode; /* next node to process when last is reached */ 119 U32 prev_recursed_depth; 120 I32 stopparen; /* what stopparen do we use */ 121 U32 is_top_frame; /* what flags do we use? */ 122 123 struct scan_frame *this_prev_frame; /* this previous frame */ 124 struct scan_frame *prev_frame; /* previous frame */ 125 struct scan_frame *next_frame; /* next frame */ 126 } scan_frame; 127 128 /* Certain characters are output as a sequence with the first being a 129 * backslash. */ 130 #define isBACKSLASHED_PUNCT(c) \ 131 ((c) == '-' || (c) == ']' || (c) == '\\' || (c) == '^') 132 133 134 struct RExC_state_t { 135 U32 flags; /* RXf_* are we folding, multilining? */ 136 U32 pm_flags; /* PMf_* stuff from the calling PMOP */ 137 char *precomp; /* uncompiled string. */ 138 char *precomp_end; /* pointer to end of uncompiled string. */ 139 REGEXP *rx_sv; /* The SV that is the regexp. */ 140 regexp *rx; /* perl core regexp structure */ 141 regexp_internal *rxi; /* internal data for regexp object 142 pprivate field */ 143 char *start; /* Start of input for compile */ 144 char *end; /* End of input for compile */ 145 char *parse; /* Input-scan pointer. */ 146 char *adjusted_start; /* 'start', adjusted. See code use */ 147 STRLEN precomp_adj; /* an offset beyond precomp. See code use */ 148 SSize_t whilem_seen; /* number of WHILEM in this expr */ 149 regnode *emit_start; /* Start of emitted-code area */ 150 regnode *emit_bound; /* First regnode outside of the 151 allocated space */ 152 regnode *emit; /* Code-emit pointer; if = &emit_dummy, 153 implies compiling, so don't emit */ 154 regnode_ssc emit_dummy; /* placeholder for emit to point to; 155 large enough for the largest 156 non-EXACTish node, so can use it as 157 scratch in pass1 */ 158 I32 naughty; /* How bad is this pattern? */ 159 I32 sawback; /* Did we see \1, ...? */ 160 U32 seen; 161 SSize_t size; /* Code size. */ 162 I32 npar; /* Capture buffer count, (OPEN) plus 163 one. ("par" 0 is the whole 164 pattern)*/ 165 I32 nestroot; /* root parens we are in - used by 166 accept */ 167 I32 extralen; 168 I32 seen_zerolen; 169 regnode **open_parens; /* pointers to open parens */ 170 regnode **close_parens; /* pointers to close parens */ 171 regnode *end_op; /* END node in program */ 172 I32 utf8; /* whether the pattern is utf8 or not */ 173 I32 orig_utf8; /* whether the pattern was originally in utf8 */ 174 /* XXX use this for future optimisation of case 175 * where pattern must be upgraded to utf8. */ 176 I32 uni_semantics; /* If a d charset modifier should use unicode 177 rules, even if the pattern is not in 178 utf8 */ 179 HV *paren_names; /* Paren names */ 180 181 regnode **recurse; /* Recurse regops */ 182 I32 recurse_count; /* Number of recurse regops we have generated */ 183 U8 *study_chunk_recursed; /* bitmap of which subs we have moved 184 through */ 185 U32 study_chunk_recursed_bytes; /* bytes in bitmap */ 186 I32 in_lookbehind; 187 I32 contains_locale; 188 I32 contains_i; 189 I32 override_recoding; 190 #ifdef EBCDIC 191 I32 recode_x_to_native; 192 #endif 193 I32 in_multi_char_class; 194 struct reg_code_block *code_blocks; /* positions of literal (?{}) 195 within pattern */ 196 int num_code_blocks; /* size of code_blocks[] */ 197 int code_index; /* next code_blocks[] slot */ 198 SSize_t maxlen; /* mininum possible number of chars in string to match */ 199 scan_frame *frame_head; 200 scan_frame *frame_last; 201 U32 frame_count; 202 AV *warn_text; 203 #ifdef ADD_TO_REGEXEC 204 char *starttry; /* -Dr: where regtry was called. */ 205 #define RExC_starttry (pRExC_state->starttry) 206 #endif 207 SV *runtime_code_qr; /* qr with the runtime code blocks */ 208 #ifdef DEBUGGING 209 const char *lastparse; 210 I32 lastnum; 211 AV *paren_name_list; /* idx -> name */ 212 U32 study_chunk_recursed_count; 213 SV *mysv1; 214 SV *mysv2; 215 #define RExC_lastparse (pRExC_state->lastparse) 216 #define RExC_lastnum (pRExC_state->lastnum) 217 #define RExC_paren_name_list (pRExC_state->paren_name_list) 218 #define RExC_study_chunk_recursed_count (pRExC_state->study_chunk_recursed_count) 219 #define RExC_mysv (pRExC_state->mysv1) 220 #define RExC_mysv1 (pRExC_state->mysv1) 221 #define RExC_mysv2 (pRExC_state->mysv2) 222 223 #endif 224 bool seen_unfolded_sharp_s; 225 bool strict; 226 bool study_started; 227 }; 228 229 #define RExC_flags (pRExC_state->flags) 230 #define RExC_pm_flags (pRExC_state->pm_flags) 231 #define RExC_precomp (pRExC_state->precomp) 232 #define RExC_precomp_adj (pRExC_state->precomp_adj) 233 #define RExC_adjusted_start (pRExC_state->adjusted_start) 234 #define RExC_precomp_end (pRExC_state->precomp_end) 235 #define RExC_rx_sv (pRExC_state->rx_sv) 236 #define RExC_rx (pRExC_state->rx) 237 #define RExC_rxi (pRExC_state->rxi) 238 #define RExC_start (pRExC_state->start) 239 #define RExC_end (pRExC_state->end) 240 #define RExC_parse (pRExC_state->parse) 241 #define RExC_whilem_seen (pRExC_state->whilem_seen) 242 243 /* Set during the sizing pass when there is a LATIN SMALL LETTER SHARP S in any 244 * EXACTF node, hence was parsed under /di rules. If later in the parse, 245 * something forces the pattern into using /ui rules, the sharp s should be 246 * folded into the sequence 'ss', which takes up more space than previously 247 * calculated. This means that the sizing pass needs to be restarted. (The 248 * node also becomes an EXACTFU_SS.) For all other characters, an EXACTF node 249 * that gets converted to /ui (and EXACTFU) occupies the same amount of space, 250 * so there is no need to resize [perl #125990]. */ 251 #define RExC_seen_unfolded_sharp_s (pRExC_state->seen_unfolded_sharp_s) 252 253 #ifdef RE_TRACK_PATTERN_OFFSETS 254 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the 255 others */ 256 #endif 257 #define RExC_emit (pRExC_state->emit) 258 #define RExC_emit_dummy (pRExC_state->emit_dummy) 259 #define RExC_emit_start (pRExC_state->emit_start) 260 #define RExC_emit_bound (pRExC_state->emit_bound) 261 #define RExC_sawback (pRExC_state->sawback) 262 #define RExC_seen (pRExC_state->seen) 263 #define RExC_size (pRExC_state->size) 264 #define RExC_maxlen (pRExC_state->maxlen) 265 #define RExC_npar (pRExC_state->npar) 266 #define RExC_nestroot (pRExC_state->nestroot) 267 #define RExC_extralen (pRExC_state->extralen) 268 #define RExC_seen_zerolen (pRExC_state->seen_zerolen) 269 #define RExC_utf8 (pRExC_state->utf8) 270 #define RExC_uni_semantics (pRExC_state->uni_semantics) 271 #define RExC_orig_utf8 (pRExC_state->orig_utf8) 272 #define RExC_open_parens (pRExC_state->open_parens) 273 #define RExC_close_parens (pRExC_state->close_parens) 274 #define RExC_end_op (pRExC_state->end_op) 275 #define RExC_paren_names (pRExC_state->paren_names) 276 #define RExC_recurse (pRExC_state->recurse) 277 #define RExC_recurse_count (pRExC_state->recurse_count) 278 #define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed) 279 #define RExC_study_chunk_recursed_bytes \ 280 (pRExC_state->study_chunk_recursed_bytes) 281 #define RExC_in_lookbehind (pRExC_state->in_lookbehind) 282 #define RExC_contains_locale (pRExC_state->contains_locale) 283 #define RExC_contains_i (pRExC_state->contains_i) 284 #define RExC_override_recoding (pRExC_state->override_recoding) 285 #ifdef EBCDIC 286 # define RExC_recode_x_to_native (pRExC_state->recode_x_to_native) 287 #endif 288 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class) 289 #define RExC_frame_head (pRExC_state->frame_head) 290 #define RExC_frame_last (pRExC_state->frame_last) 291 #define RExC_frame_count (pRExC_state->frame_count) 292 #define RExC_strict (pRExC_state->strict) 293 #define RExC_study_started (pRExC_state->study_started) 294 #define RExC_warn_text (pRExC_state->warn_text) 295 296 /* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set 297 * a flag to disable back-off on the fixed/floating substrings - if it's 298 * a high complexity pattern we assume the benefit of avoiding a full match 299 * is worth the cost of checking for the substrings even if they rarely help. 300 */ 301 #define RExC_naughty (pRExC_state->naughty) 302 #define TOO_NAUGHTY (10) 303 #define MARK_NAUGHTY(add) \ 304 if (RExC_naughty < TOO_NAUGHTY) \ 305 RExC_naughty += (add) 306 #define MARK_NAUGHTY_EXP(exp, add) \ 307 if (RExC_naughty < TOO_NAUGHTY) \ 308 RExC_naughty += RExC_naughty / (exp) + (add) 309 310 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?') 311 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \ 312 ((*s) == '{' && regcurly(s))) 313 314 /* 315 * Flags to be passed up and down. 316 */ 317 #define WORST 0 /* Worst case. */ 318 #define HASWIDTH 0x01 /* Known to match non-null strings. */ 319 320 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single 321 * character. (There needs to be a case: in the switch statement in regexec.c 322 * for any node marked SIMPLE.) Note that this is not the same thing as 323 * REGNODE_SIMPLE */ 324 #define SIMPLE 0x02 325 #define SPSTART 0x04 /* Starts with * or + */ 326 #define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */ 327 #define TRYAGAIN 0x10 /* Weeded out a declaration. */ 328 #define RESTART_PASS1 0x20 /* Need to restart sizing pass */ 329 #define NEED_UTF8 0x40 /* In conjunction with RESTART_PASS1, need to 330 calcuate sizes as UTF-8 */ 331 332 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1) 333 334 /* whether trie related optimizations are enabled */ 335 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION 336 #define TRIE_STUDY_OPT 337 #define FULL_TRIE_STUDY 338 #define TRIE_STCLASS 339 #endif 340 341 342 343 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3] 344 #define PBITVAL(paren) (1 << ((paren) & 7)) 345 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren)) 346 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren) 347 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren)) 348 349 #define REQUIRE_UTF8(flagp) STMT_START { \ 350 if (!UTF) { \ 351 assert(PASS1); \ 352 *flagp = RESTART_PASS1|NEED_UTF8; \ 353 return NULL; \ 354 } \ 355 } STMT_END 356 357 /* Change from /d into /u rules, and restart the parse if we've already seen 358 * something whose size would increase as a result, by setting *flagp and 359 * returning 'restart_retval'. RExC_uni_semantics is a flag that indicates 360 * we've change to /u during the parse. */ 361 #define REQUIRE_UNI_RULES(flagp, restart_retval) \ 362 STMT_START { \ 363 if (DEPENDS_SEMANTICS) { \ 364 assert(PASS1); \ 365 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \ 366 RExC_uni_semantics = 1; \ 367 if (RExC_seen_unfolded_sharp_s) { \ 368 *flagp |= RESTART_PASS1; \ 369 return restart_retval; \ 370 } \ 371 } \ 372 } STMT_END 373 374 /* This converts the named class defined in regcomp.h to its equivalent class 375 * number defined in handy.h. */ 376 #define namedclass_to_classnum(class) ((int) ((class) / 2)) 377 #define classnum_to_namedclass(classnum) ((classnum) * 2) 378 379 #define _invlist_union_complement_2nd(a, b, output) \ 380 _invlist_union_maybe_complement_2nd(a, b, TRUE, output) 381 #define _invlist_intersection_complement_2nd(a, b, output) \ 382 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output) 383 384 /* About scan_data_t. 385 386 During optimisation we recurse through the regexp program performing 387 various inplace (keyhole style) optimisations. In addition study_chunk 388 and scan_commit populate this data structure with information about 389 what strings MUST appear in the pattern. We look for the longest 390 string that must appear at a fixed location, and we look for the 391 longest string that may appear at a floating location. So for instance 392 in the pattern: 393 394 /FOO[xX]A.*B[xX]BAR/ 395 396 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating 397 strings (because they follow a .* construct). study_chunk will identify 398 both FOO and BAR as being the longest fixed and floating strings respectively. 399 400 The strings can be composites, for instance 401 402 /(f)(o)(o)/ 403 404 will result in a composite fixed substring 'foo'. 405 406 For each string some basic information is maintained: 407 408 - offset or min_offset 409 This is the position the string must appear at, or not before. 410 It also implicitly (when combined with minlenp) tells us how many 411 characters must match before the string we are searching for. 412 Likewise when combined with minlenp and the length of the string it 413 tells us how many characters must appear after the string we have 414 found. 415 416 - max_offset 417 Only used for floating strings. This is the rightmost point that 418 the string can appear at. If set to SSize_t_MAX it indicates that the 419 string can occur infinitely far to the right. 420 421 - minlenp 422 A pointer to the minimum number of characters of the pattern that the 423 string was found inside. This is important as in the case of positive 424 lookahead or positive lookbehind we can have multiple patterns 425 involved. Consider 426 427 /(?=FOO).*F/ 428 429 The minimum length of the pattern overall is 3, the minimum length 430 of the lookahead part is 3, but the minimum length of the part that 431 will actually match is 1. So 'FOO's minimum length is 3, but the 432 minimum length for the F is 1. This is important as the minimum length 433 is used to determine offsets in front of and behind the string being 434 looked for. Since strings can be composites this is the length of the 435 pattern at the time it was committed with a scan_commit. Note that 436 the length is calculated by study_chunk, so that the minimum lengths 437 are not known until the full pattern has been compiled, thus the 438 pointer to the value. 439 440 - lookbehind 441 442 In the case of lookbehind the string being searched for can be 443 offset past the start point of the final matching string. 444 If this value was just blithely removed from the min_offset it would 445 invalidate some of the calculations for how many chars must match 446 before or after (as they are derived from min_offset and minlen and 447 the length of the string being searched for). 448 When the final pattern is compiled and the data is moved from the 449 scan_data_t structure into the regexp structure the information 450 about lookbehind is factored in, with the information that would 451 have been lost precalculated in the end_shift field for the 452 associated string. 453 454 The fields pos_min and pos_delta are used to store the minimum offset 455 and the delta to the maximum offset at the current point in the pattern. 456 457 */ 458 459 typedef struct scan_data_t { 460 /*I32 len_min; unused */ 461 /*I32 len_delta; unused */ 462 SSize_t pos_min; 463 SSize_t pos_delta; 464 SV *last_found; 465 SSize_t last_end; /* min value, <0 unless valid. */ 466 SSize_t last_start_min; 467 SSize_t last_start_max; 468 SV **longest; /* Either &l_fixed, or &l_float. */ 469 SV *longest_fixed; /* longest fixed string found in pattern */ 470 SSize_t offset_fixed; /* offset where it starts */ 471 SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */ 472 I32 lookbehind_fixed; /* is the position of the string modfied by LB */ 473 SV *longest_float; /* longest floating string found in pattern */ 474 SSize_t offset_float_min; /* earliest point in string it can appear */ 475 SSize_t offset_float_max; /* latest point in string it can appear */ 476 SSize_t *minlen_float; /* pointer to the minlen relevant to the string */ 477 SSize_t lookbehind_float; /* is the pos of the string modified by LB */ 478 I32 flags; 479 I32 whilem_c; 480 SSize_t *last_closep; 481 regnode_ssc *start_class; 482 } scan_data_t; 483 484 /* 485 * Forward declarations for pregcomp()'s friends. 486 */ 487 488 static const scan_data_t zero_scan_data = 489 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0}; 490 491 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL) 492 #define SF_BEFORE_SEOL 0x0001 493 #define SF_BEFORE_MEOL 0x0002 494 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL) 495 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL) 496 497 #define SF_FIX_SHIFT_EOL (+2) 498 #define SF_FL_SHIFT_EOL (+4) 499 500 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL) 501 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL) 502 503 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL) 504 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */ 505 #define SF_IS_INF 0x0040 506 #define SF_HAS_PAR 0x0080 507 #define SF_IN_PAR 0x0100 508 #define SF_HAS_EVAL 0x0200 509 510 511 /* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the 512 * longest substring in the pattern. When it is not set the optimiser keeps 513 * track of position, but does not keep track of the actual strings seen, 514 * 515 * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but 516 * /foo/i will not. 517 * 518 * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble" 519 * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be 520 * turned off because of the alternation (BRANCH). */ 521 #define SCF_DO_SUBSTR 0x0400 522 523 #define SCF_DO_STCLASS_AND 0x0800 524 #define SCF_DO_STCLASS_OR 0x1000 525 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR) 526 #define SCF_WHILEM_VISITED_POS 0x2000 527 528 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */ 529 #define SCF_SEEN_ACCEPT 0x8000 530 #define SCF_TRIE_DOING_RESTUDY 0x10000 531 #define SCF_IN_DEFINE 0x20000 532 533 534 535 536 #define UTF cBOOL(RExC_utf8) 537 538 /* The enums for all these are ordered so things work out correctly */ 539 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET) 540 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \ 541 == REGEX_DEPENDS_CHARSET) 542 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET) 543 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \ 544 >= REGEX_UNICODE_CHARSET) 545 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \ 546 == REGEX_ASCII_RESTRICTED_CHARSET) 547 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \ 548 >= REGEX_ASCII_RESTRICTED_CHARSET) 549 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \ 550 == REGEX_ASCII_MORE_RESTRICTED_CHARSET) 551 552 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD) 553 554 /* For programs that want to be strictly Unicode compatible by dying if any 555 * attempt is made to match a non-Unicode code point against a Unicode 556 * property. */ 557 #define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE)) 558 559 #define OOB_NAMEDCLASS -1 560 561 /* There is no code point that is out-of-bounds, so this is problematic. But 562 * its only current use is to initialize a variable that is always set before 563 * looked at. */ 564 #define OOB_UNICODE 0xDEADBEEF 565 566 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv)) 567 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b) 568 569 570 /* length of regex to show in messages that don't mark a position within */ 571 #define RegexLengthToShowInErrorMessages 127 572 573 /* 574 * If MARKER[12] are adjusted, be sure to adjust the constants at the top 575 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in 576 * op/pragma/warn/regcomp. 577 */ 578 #define MARKER1 "<-- HERE" /* marker as it appears in the description */ 579 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */ 580 581 #define REPORT_LOCATION " in regex; marked by " MARKER1 \ 582 " in m/%"UTF8f MARKER2 "%"UTF8f"/" 583 584 /* The code in this file in places uses one level of recursion with parsing 585 * rebased to an alternate string constructed by us in memory. This can take 586 * the form of something that is completely different from the input, or 587 * something that uses the input as part of the alternate. In the first case, 588 * there should be no possibility of an error, as we are in complete control of 589 * the alternate string. But in the second case we don't control the input 590 * portion, so there may be errors in that. Here's an example: 591 * /[abc\x{DF}def]/ui 592 * is handled specially because \x{df} folds to a sequence of more than one 593 * character, 'ss'. What is done is to create and parse an alternate string, 594 * which looks like this: 595 * /(?:\x{DF}|[abc\x{DF}def])/ui 596 * where it uses the input unchanged in the middle of something it constructs, 597 * which is a branch for the DF outside the character class, and clustering 598 * parens around the whole thing. (It knows enough to skip the DF inside the 599 * class while in this substitute parse.) 'abc' and 'def' may have errors that 600 * need to be reported. The general situation looks like this: 601 * 602 * sI tI xI eI 603 * Input: ---------------------------------------------------- 604 * Constructed: --------------------------------------------------- 605 * sC tC xC eC EC 606 * 607 * The input string sI..eI is the input pattern. The string sC..EC is the 608 * constructed substitute parse string. The portions sC..tC and eC..EC are 609 * constructed by us. The portion tC..eC is an exact duplicate of the input 610 * pattern tI..eI. In the diagram, these are vertically aligned. Suppose that 611 * while parsing, we find an error at xC. We want to display a message showing 612 * the real input string. Thus we need to find the point xI in it which 613 * corresponds to xC. xC >= tC, since the portion of the string sC..tC has 614 * been constructed by us, and so shouldn't have errors. We get: 615 * 616 * xI = sI + (tI - sI) + (xC - tC) 617 * 618 * and, the offset into sI is: 619 * 620 * (xI - sI) = (tI - sI) + (xC - tC) 621 * 622 * When the substitute is constructed, we save (tI -sI) as RExC_precomp_adj, 623 * and we save tC as RExC_adjusted_start. 624 * 625 * During normal processing of the input pattern, everything points to that, 626 * with RExC_precomp_adj set to 0, and RExC_adjusted_start set to sI. 627 */ 628 629 #define tI_sI RExC_precomp_adj 630 #define tC RExC_adjusted_start 631 #define sC RExC_precomp 632 #define xI_offset(xC) ((IV) (tI_sI + (xC - tC))) 633 #define xI(xC) (sC + xI_offset(xC)) 634 #define eC RExC_precomp_end 635 636 #define REPORT_LOCATION_ARGS(xC) \ 637 UTF8fARG(UTF, \ 638 (xI(xC) > eC) /* Don't run off end */ \ 639 ? eC - sC /* Length before the <--HERE */ \ 640 : xI_offset(xC), \ 641 sC), /* The input pattern printed up to the <--HERE */ \ 642 UTF8fARG(UTF, \ 643 (xI(xC) > eC) ? 0 : eC - xI(xC), /* Length after <--HERE */ \ 644 (xI(xC) > eC) ? eC : xI(xC)) /* pattern after <--HERE */ 645 646 /* Used to point after bad bytes for an error message, but avoid skipping 647 * past a nul byte. */ 648 #define SKIP_IF_CHAR(s) (!*(s) ? 0 : UTF ? UTF8SKIP(s) : 1) 649 650 /* 651 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given 652 * arg. Show regex, up to a maximum length. If it's too long, chop and add 653 * "...". 654 */ 655 #define _FAIL(code) STMT_START { \ 656 const char *ellipses = ""; \ 657 IV len = RExC_precomp_end - RExC_precomp; \ 658 \ 659 if (!SIZE_ONLY) \ 660 SAVEFREESV(RExC_rx_sv); \ 661 if (len > RegexLengthToShowInErrorMessages) { \ 662 /* chop 10 shorter than the max, to ensure meaning of "..." */ \ 663 len = RegexLengthToShowInErrorMessages - 10; \ 664 ellipses = "..."; \ 665 } \ 666 code; \ 667 } STMT_END 668 669 #define FAIL(msg) _FAIL( \ 670 Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/", \ 671 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses)) 672 673 #define FAIL2(msg,arg) _FAIL( \ 674 Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/", \ 675 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses)) 676 677 /* 678 * Simple_vFAIL -- like FAIL, but marks the current location in the scan 679 */ 680 #define Simple_vFAIL(m) STMT_START { \ 681 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \ 682 m, REPORT_LOCATION_ARGS(RExC_parse)); \ 683 } STMT_END 684 685 /* 686 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL() 687 */ 688 #define vFAIL(m) STMT_START { \ 689 if (!SIZE_ONLY) \ 690 SAVEFREESV(RExC_rx_sv); \ 691 Simple_vFAIL(m); \ 692 } STMT_END 693 694 /* 695 * Like Simple_vFAIL(), but accepts two arguments. 696 */ 697 #define Simple_vFAIL2(m,a1) STMT_START { \ 698 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \ 699 REPORT_LOCATION_ARGS(RExC_parse)); \ 700 } STMT_END 701 702 /* 703 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2(). 704 */ 705 #define vFAIL2(m,a1) STMT_START { \ 706 if (!SIZE_ONLY) \ 707 SAVEFREESV(RExC_rx_sv); \ 708 Simple_vFAIL2(m, a1); \ 709 } STMT_END 710 711 712 /* 713 * Like Simple_vFAIL(), but accepts three arguments. 714 */ 715 #define Simple_vFAIL3(m, a1, a2) STMT_START { \ 716 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \ 717 REPORT_LOCATION_ARGS(RExC_parse)); \ 718 } STMT_END 719 720 /* 721 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3(). 722 */ 723 #define vFAIL3(m,a1,a2) STMT_START { \ 724 if (!SIZE_ONLY) \ 725 SAVEFREESV(RExC_rx_sv); \ 726 Simple_vFAIL3(m, a1, a2); \ 727 } STMT_END 728 729 /* 730 * Like Simple_vFAIL(), but accepts four arguments. 731 */ 732 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \ 733 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \ 734 REPORT_LOCATION_ARGS(RExC_parse)); \ 735 } STMT_END 736 737 #define vFAIL4(m,a1,a2,a3) STMT_START { \ 738 if (!SIZE_ONLY) \ 739 SAVEFREESV(RExC_rx_sv); \ 740 Simple_vFAIL4(m, a1, a2, a3); \ 741 } STMT_END 742 743 /* A specialized version of vFAIL2 that works with UTF8f */ 744 #define vFAIL2utf8f(m, a1) STMT_START { \ 745 if (!SIZE_ONLY) \ 746 SAVEFREESV(RExC_rx_sv); \ 747 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \ 748 REPORT_LOCATION_ARGS(RExC_parse)); \ 749 } STMT_END 750 751 #define vFAIL3utf8f(m, a1, a2) STMT_START { \ 752 if (!SIZE_ONLY) \ 753 SAVEFREESV(RExC_rx_sv); \ 754 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \ 755 REPORT_LOCATION_ARGS(RExC_parse)); \ 756 } STMT_END 757 758 /* These have asserts in them because of [perl #122671] Many warnings in 759 * regcomp.c can occur twice. If they get output in pass1 and later in that 760 * pass, the pattern has to be converted to UTF-8 and the pass restarted, they 761 * would get output again. So they should be output in pass2, and these 762 * asserts make sure new warnings follow that paradigm. */ 763 764 /* m is not necessarily a "literal string", in this macro */ 765 #define reg_warn_non_literal_string(loc, m) STMT_START { \ 766 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \ 767 "%s" REPORT_LOCATION, \ 768 m, REPORT_LOCATION_ARGS(loc)); \ 769 } STMT_END 770 771 #define ckWARNreg(loc,m) STMT_START { \ 772 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \ 773 m REPORT_LOCATION, \ 774 REPORT_LOCATION_ARGS(loc)); \ 775 } STMT_END 776 777 #define vWARN(loc, m) STMT_START { \ 778 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \ 779 m REPORT_LOCATION, \ 780 REPORT_LOCATION_ARGS(loc)); \ 781 } STMT_END 782 783 #define vWARN_dep(loc, m) STMT_START { \ 784 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), \ 785 m REPORT_LOCATION, \ 786 REPORT_LOCATION_ARGS(loc)); \ 787 } STMT_END 788 789 #define ckWARNdep(loc,m) STMT_START { \ 790 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \ 791 m REPORT_LOCATION, \ 792 REPORT_LOCATION_ARGS(loc)); \ 793 } STMT_END 794 795 #define ckWARNregdep(loc,m) STMT_START { \ 796 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, \ 797 WARN_REGEXP), \ 798 m REPORT_LOCATION, \ 799 REPORT_LOCATION_ARGS(loc)); \ 800 } STMT_END 801 802 #define ckWARN2reg_d(loc,m, a1) STMT_START { \ 803 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \ 804 m REPORT_LOCATION, \ 805 a1, REPORT_LOCATION_ARGS(loc)); \ 806 } STMT_END 807 808 #define ckWARN2reg(loc, m, a1) STMT_START { \ 809 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \ 810 m REPORT_LOCATION, \ 811 a1, REPORT_LOCATION_ARGS(loc)); \ 812 } STMT_END 813 814 #define vWARN3(loc, m, a1, a2) STMT_START { \ 815 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \ 816 m REPORT_LOCATION, \ 817 a1, a2, REPORT_LOCATION_ARGS(loc)); \ 818 } STMT_END 819 820 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \ 821 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \ 822 m REPORT_LOCATION, \ 823 a1, a2, \ 824 REPORT_LOCATION_ARGS(loc)); \ 825 } STMT_END 826 827 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \ 828 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \ 829 m REPORT_LOCATION, \ 830 a1, a2, a3, \ 831 REPORT_LOCATION_ARGS(loc)); \ 832 } STMT_END 833 834 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \ 835 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \ 836 m REPORT_LOCATION, \ 837 a1, a2, a3, \ 838 REPORT_LOCATION_ARGS(loc)); \ 839 } STMT_END 840 841 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \ 842 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \ 843 m REPORT_LOCATION, \ 844 a1, a2, a3, a4, \ 845 REPORT_LOCATION_ARGS(loc)); \ 846 } STMT_END 847 848 /* Macros for recording node offsets. 20001227 mjd@plover.com 849 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in 850 * element 2*n-1 of the array. Element #2n holds the byte length node #n. 851 * Element 0 holds the number n. 852 * Position is 1 indexed. 853 */ 854 #ifndef RE_TRACK_PATTERN_OFFSETS 855 #define Set_Node_Offset_To_R(node,byte) 856 #define Set_Node_Offset(node,byte) 857 #define Set_Cur_Node_Offset 858 #define Set_Node_Length_To_R(node,len) 859 #define Set_Node_Length(node,len) 860 #define Set_Node_Cur_Length(node,start) 861 #define Node_Offset(n) 862 #define Node_Length(n) 863 #define Set_Node_Offset_Length(node,offset,len) 864 #define ProgLen(ri) ri->u.proglen 865 #define SetProgLen(ri,x) ri->u.proglen = x 866 #else 867 #define ProgLen(ri) ri->u.offsets[0] 868 #define SetProgLen(ri,x) ri->u.offsets[0] = x 869 #define Set_Node_Offset_To_R(node,byte) STMT_START { \ 870 if (! SIZE_ONLY) { \ 871 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \ 872 __LINE__, (int)(node), (int)(byte))); \ 873 if((node) < 0) { \ 874 Perl_croak(aTHX_ "value of node is %d in Offset macro", \ 875 (int)(node)); \ 876 } else { \ 877 RExC_offsets[2*(node)-1] = (byte); \ 878 } \ 879 } \ 880 } STMT_END 881 882 #define Set_Node_Offset(node,byte) \ 883 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start) 884 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse) 885 886 #define Set_Node_Length_To_R(node,len) STMT_START { \ 887 if (! SIZE_ONLY) { \ 888 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \ 889 __LINE__, (int)(node), (int)(len))); \ 890 if((node) < 0) { \ 891 Perl_croak(aTHX_ "value of node is %d in Length macro", \ 892 (int)(node)); \ 893 } else { \ 894 RExC_offsets[2*(node)] = (len); \ 895 } \ 896 } \ 897 } STMT_END 898 899 #define Set_Node_Length(node,len) \ 900 Set_Node_Length_To_R((node)-RExC_emit_start, len) 901 #define Set_Node_Cur_Length(node, start) \ 902 Set_Node_Length(node, RExC_parse - start) 903 904 /* Get offsets and lengths */ 905 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1]) 906 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)]) 907 908 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \ 909 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \ 910 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \ 911 } STMT_END 912 #endif 913 914 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS 915 #define EXPERIMENTAL_INPLACESCAN 916 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/ 917 918 #ifdef DEBUGGING 919 int 920 Perl_re_printf(pTHX_ const char *fmt, ...) 921 { 922 va_list ap; 923 int result; 924 PerlIO *f= Perl_debug_log; 925 PERL_ARGS_ASSERT_RE_PRINTF; 926 va_start(ap, fmt); 927 result = PerlIO_vprintf(f, fmt, ap); 928 va_end(ap); 929 return result; 930 } 931 932 int 933 Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...) 934 { 935 va_list ap; 936 int result; 937 PerlIO *f= Perl_debug_log; 938 PERL_ARGS_ASSERT_RE_INDENTF; 939 va_start(ap, depth); 940 PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, ""); 941 result = PerlIO_vprintf(f, fmt, ap); 942 va_end(ap); 943 return result; 944 } 945 #endif /* DEBUGGING */ 946 947 #define DEBUG_RExC_seen() \ 948 DEBUG_OPTIMISE_MORE_r({ \ 949 Perl_re_printf( aTHX_ "RExC_seen: "); \ 950 \ 951 if (RExC_seen & REG_ZERO_LEN_SEEN) \ 952 Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN "); \ 953 \ 954 if (RExC_seen & REG_LOOKBEHIND_SEEN) \ 955 Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN "); \ 956 \ 957 if (RExC_seen & REG_GPOS_SEEN) \ 958 Perl_re_printf( aTHX_ "REG_GPOS_SEEN "); \ 959 \ 960 if (RExC_seen & REG_RECURSE_SEEN) \ 961 Perl_re_printf( aTHX_ "REG_RECURSE_SEEN "); \ 962 \ 963 if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \ 964 Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN "); \ 965 \ 966 if (RExC_seen & REG_VERBARG_SEEN) \ 967 Perl_re_printf( aTHX_ "REG_VERBARG_SEEN "); \ 968 \ 969 if (RExC_seen & REG_CUTGROUP_SEEN) \ 970 Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN "); \ 971 \ 972 if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \ 973 Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN "); \ 974 \ 975 if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \ 976 Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN "); \ 977 \ 978 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \ 979 Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN "); \ 980 \ 981 Perl_re_printf( aTHX_ "\n"); \ 982 }); 983 984 #define DEBUG_SHOW_STUDY_FLAG(flags,flag) \ 985 if ((flags) & flag) Perl_re_printf( aTHX_ "%s ", #flag) 986 987 #define DEBUG_SHOW_STUDY_FLAGS(flags,open_str,close_str) \ 988 if ( ( flags ) ) { \ 989 Perl_re_printf( aTHX_ "%s", open_str); \ 990 DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_SEOL); \ 991 DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_MEOL); \ 992 DEBUG_SHOW_STUDY_FLAG(flags,SF_IS_INF); \ 993 DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_PAR); \ 994 DEBUG_SHOW_STUDY_FLAG(flags,SF_IN_PAR); \ 995 DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_EVAL); \ 996 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_SUBSTR); \ 997 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_AND); \ 998 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_OR); \ 999 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS); \ 1000 DEBUG_SHOW_STUDY_FLAG(flags,SCF_WHILEM_VISITED_POS); \ 1001 DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_RESTUDY); \ 1002 DEBUG_SHOW_STUDY_FLAG(flags,SCF_SEEN_ACCEPT); \ 1003 DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_DOING_RESTUDY); \ 1004 DEBUG_SHOW_STUDY_FLAG(flags,SCF_IN_DEFINE); \ 1005 Perl_re_printf( aTHX_ "%s", close_str); \ 1006 } 1007 1008 1009 #define DEBUG_STUDYDATA(str,data,depth) \ 1010 DEBUG_OPTIMISE_MORE_r(if(data){ \ 1011 Perl_re_indentf( aTHX_ "" str "Pos:%"IVdf"/%"IVdf \ 1012 " Flags: 0x%"UVXf, \ 1013 depth, \ 1014 (IV)((data)->pos_min), \ 1015 (IV)((data)->pos_delta), \ 1016 (UV)((data)->flags) \ 1017 ); \ 1018 DEBUG_SHOW_STUDY_FLAGS((data)->flags," [ ","]"); \ 1019 Perl_re_printf( aTHX_ \ 1020 " Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \ 1021 (IV)((data)->whilem_c), \ 1022 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \ 1023 is_inf ? "INF " : "" \ 1024 ); \ 1025 if ((data)->last_found) \ 1026 Perl_re_printf( aTHX_ \ 1027 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \ 1028 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \ 1029 SvPVX_const((data)->last_found), \ 1030 (IV)((data)->last_end), \ 1031 (IV)((data)->last_start_min), \ 1032 (IV)((data)->last_start_max), \ 1033 ((data)->longest && \ 1034 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \ 1035 SvPVX_const((data)->longest_fixed), \ 1036 (IV)((data)->offset_fixed), \ 1037 ((data)->longest && \ 1038 (data)->longest==&((data)->longest_float)) ? "*" : "", \ 1039 SvPVX_const((data)->longest_float), \ 1040 (IV)((data)->offset_float_min), \ 1041 (IV)((data)->offset_float_max) \ 1042 ); \ 1043 Perl_re_printf( aTHX_ "\n"); \ 1044 }); 1045 1046 1047 /* ========================================================= 1048 * BEGIN edit_distance stuff. 1049 * 1050 * This calculates how many single character changes of any type are needed to 1051 * transform a string into another one. It is taken from version 3.1 of 1052 * 1053 * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS 1054 */ 1055 1056 /* Our unsorted dictionary linked list. */ 1057 /* Note we use UVs, not chars. */ 1058 1059 struct dictionary{ 1060 UV key; 1061 UV value; 1062 struct dictionary* next; 1063 }; 1064 typedef struct dictionary item; 1065 1066 1067 PERL_STATIC_INLINE item* 1068 push(UV key,item* curr) 1069 { 1070 item* head; 1071 Newxz(head, 1, item); 1072 head->key = key; 1073 head->value = 0; 1074 head->next = curr; 1075 return head; 1076 } 1077 1078 1079 PERL_STATIC_INLINE item* 1080 find(item* head, UV key) 1081 { 1082 item* iterator = head; 1083 while (iterator){ 1084 if (iterator->key == key){ 1085 return iterator; 1086 } 1087 iterator = iterator->next; 1088 } 1089 1090 return NULL; 1091 } 1092 1093 PERL_STATIC_INLINE item* 1094 uniquePush(item* head,UV key) 1095 { 1096 item* iterator = head; 1097 1098 while (iterator){ 1099 if (iterator->key == key) { 1100 return head; 1101 } 1102 iterator = iterator->next; 1103 } 1104 1105 return push(key,head); 1106 } 1107 1108 PERL_STATIC_INLINE void 1109 dict_free(item* head) 1110 { 1111 item* iterator = head; 1112 1113 while (iterator) { 1114 item* temp = iterator; 1115 iterator = iterator->next; 1116 Safefree(temp); 1117 } 1118 1119 head = NULL; 1120 } 1121 1122 /* End of Dictionary Stuff */ 1123 1124 /* All calculations/work are done here */ 1125 STATIC int 1126 S_edit_distance(const UV* src, 1127 const UV* tgt, 1128 const STRLEN x, /* length of src[] */ 1129 const STRLEN y, /* length of tgt[] */ 1130 const SSize_t maxDistance 1131 ) 1132 { 1133 item *head = NULL; 1134 UV swapCount,swapScore,targetCharCount,i,j; 1135 UV *scores; 1136 UV score_ceil = x + y; 1137 1138 PERL_ARGS_ASSERT_EDIT_DISTANCE; 1139 1140 /* intialize matrix start values */ 1141 Newxz(scores, ( (x + 2) * (y + 2)), UV); 1142 scores[0] = score_ceil; 1143 scores[1 * (y + 2) + 0] = score_ceil; 1144 scores[0 * (y + 2) + 1] = score_ceil; 1145 scores[1 * (y + 2) + 1] = 0; 1146 head = uniquePush(uniquePush(head,src[0]),tgt[0]); 1147 1148 /* work loops */ 1149 /* i = src index */ 1150 /* j = tgt index */ 1151 for (i=1;i<=x;i++) { 1152 if (i < x) 1153 head = uniquePush(head,src[i]); 1154 scores[(i+1) * (y + 2) + 1] = i; 1155 scores[(i+1) * (y + 2) + 0] = score_ceil; 1156 swapCount = 0; 1157 1158 for (j=1;j<=y;j++) { 1159 if (i == 1) { 1160 if(j < y) 1161 head = uniquePush(head,tgt[j]); 1162 scores[1 * (y + 2) + (j + 1)] = j; 1163 scores[0 * (y + 2) + (j + 1)] = score_ceil; 1164 } 1165 1166 targetCharCount = find(head,tgt[j-1])->value; 1167 swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount; 1168 1169 if (src[i-1] != tgt[j-1]){ 1170 scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1)); 1171 } 1172 else { 1173 swapCount = j; 1174 scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore); 1175 } 1176 } 1177 1178 find(head,src[i-1])->value = i; 1179 } 1180 1181 { 1182 IV score = scores[(x+1) * (y + 2) + (y + 1)]; 1183 dict_free(head); 1184 Safefree(scores); 1185 return (maxDistance != 0 && maxDistance < score)?(-1):score; 1186 } 1187 } 1188 1189 /* END of edit_distance() stuff 1190 * ========================================================= */ 1191 1192 /* is c a control character for which we have a mnemonic? */ 1193 #define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) 1194 1195 STATIC const char * 1196 S_cntrl_to_mnemonic(const U8 c) 1197 { 1198 /* Returns the mnemonic string that represents character 'c', if one 1199 * exists; NULL otherwise. The only ones that exist for the purposes of 1200 * this routine are a few control characters */ 1201 1202 switch (c) { 1203 case '\a': return "\\a"; 1204 case '\b': return "\\b"; 1205 case ESC_NATIVE: return "\\e"; 1206 case '\f': return "\\f"; 1207 case '\n': return "\\n"; 1208 case '\r': return "\\r"; 1209 case '\t': return "\\t"; 1210 } 1211 1212 return NULL; 1213 } 1214 1215 /* Mark that we cannot extend a found fixed substring at this point. 1216 Update the longest found anchored substring and the longest found 1217 floating substrings if needed. */ 1218 1219 STATIC void 1220 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, 1221 SSize_t *minlenp, int is_inf) 1222 { 1223 const STRLEN l = CHR_SVLEN(data->last_found); 1224 const STRLEN old_l = CHR_SVLEN(*data->longest); 1225 GET_RE_DEBUG_FLAGS_DECL; 1226 1227 PERL_ARGS_ASSERT_SCAN_COMMIT; 1228 1229 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) { 1230 SvSetMagicSV(*data->longest, data->last_found); 1231 if (*data->longest == data->longest_fixed) { 1232 data->offset_fixed = l ? data->last_start_min : data->pos_min; 1233 if (data->flags & SF_BEFORE_EOL) 1234 data->flags 1235 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL); 1236 else 1237 data->flags &= ~SF_FIX_BEFORE_EOL; 1238 data->minlen_fixed=minlenp; 1239 data->lookbehind_fixed=0; 1240 } 1241 else { /* *data->longest == data->longest_float */ 1242 data->offset_float_min = l ? data->last_start_min : data->pos_min; 1243 data->offset_float_max = (l 1244 ? data->last_start_max 1245 : (data->pos_delta > SSize_t_MAX - data->pos_min 1246 ? SSize_t_MAX 1247 : data->pos_min + data->pos_delta)); 1248 if (is_inf 1249 || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX) 1250 data->offset_float_max = SSize_t_MAX; 1251 if (data->flags & SF_BEFORE_EOL) 1252 data->flags 1253 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL); 1254 else 1255 data->flags &= ~SF_FL_BEFORE_EOL; 1256 data->minlen_float=minlenp; 1257 data->lookbehind_float=0; 1258 } 1259 } 1260 SvCUR_set(data->last_found, 0); 1261 { 1262 SV * const sv = data->last_found; 1263 if (SvUTF8(sv) && SvMAGICAL(sv)) { 1264 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8); 1265 if (mg) 1266 mg->mg_len = 0; 1267 } 1268 } 1269 data->last_end = -1; 1270 data->flags &= ~SF_BEFORE_EOL; 1271 DEBUG_STUDYDATA("commit: ",data,0); 1272 } 1273 1274 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion 1275 * list that describes which code points it matches */ 1276 1277 STATIC void 1278 S_ssc_anything(pTHX_ regnode_ssc *ssc) 1279 { 1280 /* Set the SSC 'ssc' to match an empty string or any code point */ 1281 1282 PERL_ARGS_ASSERT_SSC_ANYTHING; 1283 1284 assert(is_ANYOF_SYNTHETIC(ssc)); 1285 1286 ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */ 1287 _append_range_to_invlist(ssc->invlist, 0, UV_MAX); 1288 ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */ 1289 } 1290 1291 STATIC int 1292 S_ssc_is_anything(const regnode_ssc *ssc) 1293 { 1294 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code 1295 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys 1296 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted 1297 * in any way, so there's no point in using it */ 1298 1299 UV start, end; 1300 bool ret; 1301 1302 PERL_ARGS_ASSERT_SSC_IS_ANYTHING; 1303 1304 assert(is_ANYOF_SYNTHETIC(ssc)); 1305 1306 if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) { 1307 return FALSE; 1308 } 1309 1310 /* See if the list consists solely of the range 0 - Infinity */ 1311 invlist_iterinit(ssc->invlist); 1312 ret = invlist_iternext(ssc->invlist, &start, &end) 1313 && start == 0 1314 && end == UV_MAX; 1315 1316 invlist_iterfinish(ssc->invlist); 1317 1318 if (ret) { 1319 return TRUE; 1320 } 1321 1322 /* If e.g., both \w and \W are set, matches everything */ 1323 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { 1324 int i; 1325 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) { 1326 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) { 1327 return TRUE; 1328 } 1329 } 1330 } 1331 1332 return FALSE; 1333 } 1334 1335 STATIC void 1336 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc) 1337 { 1338 /* Initializes the SSC 'ssc'. This includes setting it to match an empty 1339 * string, any code point, or any posix class under locale */ 1340 1341 PERL_ARGS_ASSERT_SSC_INIT; 1342 1343 Zero(ssc, 1, regnode_ssc); 1344 set_ANYOF_SYNTHETIC(ssc); 1345 ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP); 1346 ssc_anything(ssc); 1347 1348 /* If any portion of the regex is to operate under locale rules that aren't 1349 * fully known at compile time, initialization includes it. The reason 1350 * this isn't done for all regexes is that the optimizer was written under 1351 * the assumption that locale was all-or-nothing. Given the complexity and 1352 * lack of documentation in the optimizer, and that there are inadequate 1353 * test cases for locale, many parts of it may not work properly, it is 1354 * safest to avoid locale unless necessary. */ 1355 if (RExC_contains_locale) { 1356 ANYOF_POSIXL_SETALL(ssc); 1357 } 1358 else { 1359 ANYOF_POSIXL_ZERO(ssc); 1360 } 1361 } 1362 1363 STATIC int 1364 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state, 1365 const regnode_ssc *ssc) 1366 { 1367 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only 1368 * to the list of code points matched, and locale posix classes; hence does 1369 * not check its flags) */ 1370 1371 UV start, end; 1372 bool ret; 1373 1374 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT; 1375 1376 assert(is_ANYOF_SYNTHETIC(ssc)); 1377 1378 invlist_iterinit(ssc->invlist); 1379 ret = invlist_iternext(ssc->invlist, &start, &end) 1380 && start == 0 1381 && end == UV_MAX; 1382 1383 invlist_iterfinish(ssc->invlist); 1384 1385 if (! ret) { 1386 return FALSE; 1387 } 1388 1389 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) { 1390 return FALSE; 1391 } 1392 1393 return TRUE; 1394 } 1395 1396 STATIC SV* 1397 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, 1398 const regnode_charclass* const node) 1399 { 1400 /* Returns a mortal inversion list defining which code points are matched 1401 * by 'node', which is of type ANYOF. Handles complementing the result if 1402 * appropriate. If some code points aren't knowable at this time, the 1403 * returned list must, and will, contain every code point that is a 1404 * possibility. */ 1405 1406 SV* invlist = NULL; 1407 SV* only_utf8_locale_invlist = NULL; 1408 unsigned int i; 1409 const U32 n = ARG(node); 1410 bool new_node_has_latin1 = FALSE; 1411 1412 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC; 1413 1414 /* Look at the data structure created by S_set_ANYOF_arg() */ 1415 if (n != ANYOF_ONLY_HAS_BITMAP) { 1416 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]); 1417 AV * const av = MUTABLE_AV(SvRV(rv)); 1418 SV **const ary = AvARRAY(av); 1419 assert(RExC_rxi->data->what[n] == 's'); 1420 1421 if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */ 1422 invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1]))); 1423 } 1424 else if (ary[0] && ary[0] != &PL_sv_undef) { 1425 1426 /* Here, no compile-time swash, and there are things that won't be 1427 * known until runtime -- we have to assume it could be anything */ 1428 invlist = sv_2mortal(_new_invlist(1)); 1429 return _add_range_to_invlist(invlist, 0, UV_MAX); 1430 } 1431 else if (ary[3] && ary[3] != &PL_sv_undef) { 1432 1433 /* Here no compile-time swash, and no run-time only data. Use the 1434 * node's inversion list */ 1435 invlist = sv_2mortal(invlist_clone(ary[3])); 1436 } 1437 1438 /* Get the code points valid only under UTF-8 locales */ 1439 if ((ANYOF_FLAGS(node) & ANYOFL_FOLD) 1440 && ary[2] && ary[2] != &PL_sv_undef) 1441 { 1442 only_utf8_locale_invlist = ary[2]; 1443 } 1444 } 1445 1446 if (! invlist) { 1447 invlist = sv_2mortal(_new_invlist(0)); 1448 } 1449 1450 /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS 1451 * code points, and an inversion list for the others, but if there are code 1452 * points that should match only conditionally on the target string being 1453 * UTF-8, those are placed in the inversion list, and not the bitmap. 1454 * Since there are circumstances under which they could match, they are 1455 * included in the SSC. But if the ANYOF node is to be inverted, we have 1456 * to exclude them here, so that when we invert below, the end result 1457 * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We 1458 * have to do this here before we add the unconditionally matched code 1459 * points */ 1460 if (ANYOF_FLAGS(node) & ANYOF_INVERT) { 1461 _invlist_intersection_complement_2nd(invlist, 1462 PL_UpperLatin1, 1463 &invlist); 1464 } 1465 1466 /* Add in the points from the bit map */ 1467 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) { 1468 if (ANYOF_BITMAP_TEST(node, i)) { 1469 unsigned int start = i++; 1470 1471 for (; i < NUM_ANYOF_CODE_POINTS && ANYOF_BITMAP_TEST(node, i); ++i) { 1472 /* empty */ 1473 } 1474 invlist = _add_range_to_invlist(invlist, start, i-1); 1475 new_node_has_latin1 = TRUE; 1476 } 1477 } 1478 1479 /* If this can match all upper Latin1 code points, have to add them 1480 * as well. But don't add them if inverting, as when that gets done below, 1481 * it would exclude all these characters, including the ones it shouldn't 1482 * that were added just above */ 1483 if (! (ANYOF_FLAGS(node) & ANYOF_INVERT) && OP(node) == ANYOFD 1484 && (ANYOF_FLAGS(node) & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)) 1485 { 1486 _invlist_union(invlist, PL_UpperLatin1, &invlist); 1487 } 1488 1489 /* Similarly for these */ 1490 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) { 1491 _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist); 1492 } 1493 1494 if (ANYOF_FLAGS(node) & ANYOF_INVERT) { 1495 _invlist_invert(invlist); 1496 } 1497 else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOFL_FOLD) { 1498 1499 /* Under /li, any 0-255 could fold to any other 0-255, depending on the 1500 * locale. We can skip this if there are no 0-255 at all. */ 1501 _invlist_union(invlist, PL_Latin1, &invlist); 1502 } 1503 1504 /* Similarly add the UTF-8 locale possible matches. These have to be 1505 * deferred until after the non-UTF-8 locale ones are taken care of just 1506 * above, or it leads to wrong results under ANYOF_INVERT */ 1507 if (only_utf8_locale_invlist) { 1508 _invlist_union_maybe_complement_2nd(invlist, 1509 only_utf8_locale_invlist, 1510 ANYOF_FLAGS(node) & ANYOF_INVERT, 1511 &invlist); 1512 } 1513 1514 return invlist; 1515 } 1516 1517 /* These two functions currently do the exact same thing */ 1518 #define ssc_init_zero ssc_init 1519 1520 #define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp)) 1521 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX) 1522 1523 /* 'AND' a given class with another one. Can create false positives. 'ssc' 1524 * should not be inverted. 'and_with->flags & ANYOF_MATCHES_POSIXL' should be 1525 * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */ 1526 1527 STATIC void 1528 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, 1529 const regnode_charclass *and_with) 1530 { 1531 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either 1532 * another SSC or a regular ANYOF class. Can create false positives. */ 1533 1534 SV* anded_cp_list; 1535 U8 anded_flags; 1536 1537 PERL_ARGS_ASSERT_SSC_AND; 1538 1539 assert(is_ANYOF_SYNTHETIC(ssc)); 1540 1541 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract 1542 * the code point inversion list and just the relevant flags */ 1543 if (is_ANYOF_SYNTHETIC(and_with)) { 1544 anded_cp_list = ((regnode_ssc *)and_with)->invlist; 1545 anded_flags = ANYOF_FLAGS(and_with); 1546 1547 /* XXX This is a kludge around what appears to be deficiencies in the 1548 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag, 1549 * there are paths through the optimizer where it doesn't get weeded 1550 * out when it should. And if we don't make some extra provision for 1551 * it like the code just below, it doesn't get added when it should. 1552 * This solution is to add it only when AND'ing, which is here, and 1553 * only when what is being AND'ed is the pristine, original node 1554 * matching anything. Thus it is like adding it to ssc_anything() but 1555 * only when the result is to be AND'ed. Probably the same solution 1556 * could be adopted for the same problem we have with /l matching, 1557 * which is solved differently in S_ssc_init(), and that would lead to 1558 * fewer false positives than that solution has. But if this solution 1559 * creates bugs, the consequences are only that a warning isn't raised 1560 * that should be; while the consequences for having /l bugs is 1561 * incorrect matches */ 1562 if (ssc_is_anything((regnode_ssc *)and_with)) { 1563 anded_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER; 1564 } 1565 } 1566 else { 1567 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with); 1568 if (OP(and_with) == ANYOFD) { 1569 anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS; 1570 } 1571 else { 1572 anded_flags = ANYOF_FLAGS(and_with) 1573 &( ANYOF_COMMON_FLAGS 1574 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER 1575 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP); 1576 if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(and_with))) { 1577 anded_flags &= 1578 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD; 1579 } 1580 } 1581 } 1582 1583 ANYOF_FLAGS(ssc) &= anded_flags; 1584 1585 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes. 1586 * C2 is the list of code points in 'and-with'; P2, its posix classes. 1587 * 'and_with' may be inverted. When not inverted, we have the situation of 1588 * computing: 1589 * (C1 | P1) & (C2 | P2) 1590 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2)) 1591 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2)) 1592 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2)) 1593 * <= ((C1 & C2) | P1 | P2) 1594 * Alternatively, the last few steps could be: 1595 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2)) 1596 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2)) 1597 * <= (C1 | C2 | (P1 & P2)) 1598 * We favor the second approach if either P1 or P2 is non-empty. This is 1599 * because these components are a barrier to doing optimizations, as what 1600 * they match cannot be known until the moment of matching as they are 1601 * dependent on the current locale, 'AND"ing them likely will reduce or 1602 * eliminate them. 1603 * But we can do better if we know that C1,P1 are in their initial state (a 1604 * frequent occurrence), each matching everything: 1605 * (<everything>) & (C2 | P2) = C2 | P2 1606 * Similarly, if C2,P2 are in their initial state (again a frequent 1607 * occurrence), the result is a no-op 1608 * (C1 | P1) & (<everything>) = C1 | P1 1609 * 1610 * Inverted, we have 1611 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2) 1612 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2)) 1613 * <= (C1 & ~C2) | (P1 & ~P2) 1614 * */ 1615 1616 if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT) 1617 && ! is_ANYOF_SYNTHETIC(and_with)) 1618 { 1619 unsigned int i; 1620 1621 ssc_intersection(ssc, 1622 anded_cp_list, 1623 FALSE /* Has already been inverted */ 1624 ); 1625 1626 /* If either P1 or P2 is empty, the intersection will be also; can skip 1627 * the loop */ 1628 if (! (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL)) { 1629 ANYOF_POSIXL_ZERO(ssc); 1630 } 1631 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { 1632 1633 /* Note that the Posix class component P from 'and_with' actually 1634 * looks like: 1635 * P = Pa | Pb | ... | Pn 1636 * where each component is one posix class, such as in [\w\s]. 1637 * Thus 1638 * ~P = ~(Pa | Pb | ... | Pn) 1639 * = ~Pa & ~Pb & ... & ~Pn 1640 * <= ~Pa | ~Pb | ... | ~Pn 1641 * The last is something we can easily calculate, but unfortunately 1642 * is likely to have many false positives. We could do better 1643 * in some (but certainly not all) instances if two classes in 1644 * P have known relationships. For example 1645 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print: 1646 * So 1647 * :lower: & :print: = :lower: 1648 * And similarly for classes that must be disjoint. For example, 1649 * since \s and \w can have no elements in common based on rules in 1650 * the POSIX standard, 1651 * \w & ^\S = nothing 1652 * Unfortunately, some vendor locales do not meet the Posix 1653 * standard, in particular almost everything by Microsoft. 1654 * The loop below just changes e.g., \w into \W and vice versa */ 1655 1656 regnode_charclass_posixl temp; 1657 int add = 1; /* To calculate the index of the complement */ 1658 1659 ANYOF_POSIXL_ZERO(&temp); 1660 for (i = 0; i < ANYOF_MAX; i++) { 1661 assert(i % 2 != 0 1662 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i) 1663 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1)); 1664 1665 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) { 1666 ANYOF_POSIXL_SET(&temp, i + add); 1667 } 1668 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */ 1669 } 1670 ANYOF_POSIXL_AND(&temp, ssc); 1671 1672 } /* else ssc already has no posixes */ 1673 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC 1674 in its initial state */ 1675 else if (! is_ANYOF_SYNTHETIC(and_with) 1676 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with)) 1677 { 1678 /* But if 'ssc' is in its initial state, the result is just 'and_with'; 1679 * copy it over 'ssc' */ 1680 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) { 1681 if (is_ANYOF_SYNTHETIC(and_with)) { 1682 StructCopy(and_with, ssc, regnode_ssc); 1683 } 1684 else { 1685 ssc->invlist = anded_cp_list; 1686 ANYOF_POSIXL_ZERO(ssc); 1687 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) { 1688 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc); 1689 } 1690 } 1691 } 1692 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc) 1693 || (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL)) 1694 { 1695 /* One or the other of P1, P2 is non-empty. */ 1696 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) { 1697 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc); 1698 } 1699 ssc_union(ssc, anded_cp_list, FALSE); 1700 } 1701 else { /* P1 = P2 = empty */ 1702 ssc_intersection(ssc, anded_cp_list, FALSE); 1703 } 1704 } 1705 } 1706 1707 STATIC void 1708 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, 1709 const regnode_charclass *or_with) 1710 { 1711 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either 1712 * another SSC or a regular ANYOF class. Can create false positives if 1713 * 'or_with' is to be inverted. */ 1714 1715 SV* ored_cp_list; 1716 U8 ored_flags; 1717 1718 PERL_ARGS_ASSERT_SSC_OR; 1719 1720 assert(is_ANYOF_SYNTHETIC(ssc)); 1721 1722 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract 1723 * the code point inversion list and just the relevant flags */ 1724 if (is_ANYOF_SYNTHETIC(or_with)) { 1725 ored_cp_list = ((regnode_ssc*) or_with)->invlist; 1726 ored_flags = ANYOF_FLAGS(or_with); 1727 } 1728 else { 1729 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with); 1730 ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS; 1731 if (OP(or_with) != ANYOFD) { 1732 ored_flags 1733 |= ANYOF_FLAGS(or_with) 1734 & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER 1735 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP); 1736 if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(or_with))) { 1737 ored_flags |= 1738 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD; 1739 } 1740 } 1741 } 1742 1743 ANYOF_FLAGS(ssc) |= ored_flags; 1744 1745 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes. 1746 * C2 is the list of code points in 'or-with'; P2, its posix classes. 1747 * 'or_with' may be inverted. When not inverted, we have the simple 1748 * situation of computing: 1749 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2) 1750 * If P1|P2 yields a situation with both a class and its complement are 1751 * set, like having both \w and \W, this matches all code points, and we 1752 * can delete these from the P component of the ssc going forward. XXX We 1753 * might be able to delete all the P components, but I (khw) am not certain 1754 * about this, and it is better to be safe. 1755 * 1756 * Inverted, we have 1757 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2) 1758 * <= (C1 | P1) | ~C2 1759 * <= (C1 | ~C2) | P1 1760 * (which results in actually simpler code than the non-inverted case) 1761 * */ 1762 1763 if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT) 1764 && ! is_ANYOF_SYNTHETIC(or_with)) 1765 { 1766 /* We ignore P2, leaving P1 going forward */ 1767 } /* else Not inverted */ 1768 else if (ANYOF_FLAGS(or_with) & ANYOF_MATCHES_POSIXL) { 1769 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc); 1770 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { 1771 unsigned int i; 1772 for (i = 0; i < ANYOF_MAX; i += 2) { 1773 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1)) 1774 { 1775 ssc_match_all_cp(ssc); 1776 ANYOF_POSIXL_CLEAR(ssc, i); 1777 ANYOF_POSIXL_CLEAR(ssc, i+1); 1778 } 1779 } 1780 } 1781 } 1782 1783 ssc_union(ssc, 1784 ored_cp_list, 1785 FALSE /* Already has been inverted */ 1786 ); 1787 } 1788 1789 PERL_STATIC_INLINE void 1790 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd) 1791 { 1792 PERL_ARGS_ASSERT_SSC_UNION; 1793 1794 assert(is_ANYOF_SYNTHETIC(ssc)); 1795 1796 _invlist_union_maybe_complement_2nd(ssc->invlist, 1797 invlist, 1798 invert2nd, 1799 &ssc->invlist); 1800 } 1801 1802 PERL_STATIC_INLINE void 1803 S_ssc_intersection(pTHX_ regnode_ssc *ssc, 1804 SV* const invlist, 1805 const bool invert2nd) 1806 { 1807 PERL_ARGS_ASSERT_SSC_INTERSECTION; 1808 1809 assert(is_ANYOF_SYNTHETIC(ssc)); 1810 1811 _invlist_intersection_maybe_complement_2nd(ssc->invlist, 1812 invlist, 1813 invert2nd, 1814 &ssc->invlist); 1815 } 1816 1817 PERL_STATIC_INLINE void 1818 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end) 1819 { 1820 PERL_ARGS_ASSERT_SSC_ADD_RANGE; 1821 1822 assert(is_ANYOF_SYNTHETIC(ssc)); 1823 1824 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end); 1825 } 1826 1827 PERL_STATIC_INLINE void 1828 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp) 1829 { 1830 /* AND just the single code point 'cp' into the SSC 'ssc' */ 1831 1832 SV* cp_list = _new_invlist(2); 1833 1834 PERL_ARGS_ASSERT_SSC_CP_AND; 1835 1836 assert(is_ANYOF_SYNTHETIC(ssc)); 1837 1838 cp_list = add_cp_to_invlist(cp_list, cp); 1839 ssc_intersection(ssc, cp_list, 1840 FALSE /* Not inverted */ 1841 ); 1842 SvREFCNT_dec_NN(cp_list); 1843 } 1844 1845 PERL_STATIC_INLINE void 1846 S_ssc_clear_locale(regnode_ssc *ssc) 1847 { 1848 /* Set the SSC 'ssc' to not match any locale things */ 1849 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE; 1850 1851 assert(is_ANYOF_SYNTHETIC(ssc)); 1852 1853 ANYOF_POSIXL_ZERO(ssc); 1854 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS; 1855 } 1856 1857 #define NON_OTHER_COUNT NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C 1858 1859 STATIC bool 1860 S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc) 1861 { 1862 /* The synthetic start class is used to hopefully quickly winnow down 1863 * places where a pattern could start a match in the target string. If it 1864 * doesn't really narrow things down that much, there isn't much point to 1865 * having the overhead of using it. This function uses some very crude 1866 * heuristics to decide if to use the ssc or not. 1867 * 1868 * It returns TRUE if 'ssc' rules out more than half what it considers to 1869 * be the "likely" possible matches, but of course it doesn't know what the 1870 * actual things being matched are going to be; these are only guesses 1871 * 1872 * For /l matches, it assumes that the only likely matches are going to be 1873 * in the 0-255 range, uniformly distributed, so half of that is 127 1874 * For /a and /d matches, it assumes that the likely matches will be just 1875 * the ASCII range, so half of that is 63 1876 * For /u and there isn't anything matching above the Latin1 range, it 1877 * assumes that that is the only range likely to be matched, and uses 1878 * half that as the cut-off: 127. If anything matches above Latin1, 1879 * it assumes that all of Unicode could match (uniformly), except for 1880 * non-Unicode code points and things in the General Category "Other" 1881 * (unassigned, private use, surrogates, controls and formats). This 1882 * is a much large number. */ 1883 1884 U32 count = 0; /* Running total of number of code points matched by 1885 'ssc' */ 1886 UV start, end; /* Start and end points of current range in inversion 1887 list */ 1888 const U32 max_code_points = (LOC) 1889 ? 256 1890 : (( ! UNI_SEMANTICS 1891 || invlist_highest(ssc->invlist) < 256) 1892 ? 128 1893 : NON_OTHER_COUNT); 1894 const U32 max_match = max_code_points / 2; 1895 1896 PERL_ARGS_ASSERT_IS_SSC_WORTH_IT; 1897 1898 invlist_iterinit(ssc->invlist); 1899 while (invlist_iternext(ssc->invlist, &start, &end)) { 1900 if (start >= max_code_points) { 1901 break; 1902 } 1903 end = MIN(end, max_code_points - 1); 1904 count += end - start + 1; 1905 if (count >= max_match) { 1906 invlist_iterfinish(ssc->invlist); 1907 return FALSE; 1908 } 1909 } 1910 1911 return TRUE; 1912 } 1913 1914 1915 STATIC void 1916 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc) 1917 { 1918 /* The inversion list in the SSC is marked mortal; now we need a more 1919 * permanent copy, which is stored the same way that is done in a regular 1920 * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit 1921 * map */ 1922 1923 SV* invlist = invlist_clone(ssc->invlist); 1924 1925 PERL_ARGS_ASSERT_SSC_FINALIZE; 1926 1927 assert(is_ANYOF_SYNTHETIC(ssc)); 1928 1929 /* The code in this file assumes that all but these flags aren't relevant 1930 * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared 1931 * by the time we reach here */ 1932 assert(! (ANYOF_FLAGS(ssc) 1933 & ~( ANYOF_COMMON_FLAGS 1934 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER 1935 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP))); 1936 1937 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist); 1938 1939 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, 1940 NULL, NULL, NULL, FALSE); 1941 1942 /* Make sure is clone-safe */ 1943 ssc->invlist = NULL; 1944 1945 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { 1946 ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL; 1947 } 1948 1949 if (RExC_contains_locale) { 1950 OP(ssc) = ANYOFL; 1951 } 1952 1953 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale); 1954 } 1955 1956 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ] 1957 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid ) 1958 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate ) 1959 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \ 1960 ? (TRIE_LIST_CUR( idx ) - 1) \ 1961 : 0 ) 1962 1963 1964 #ifdef DEBUGGING 1965 /* 1966 dump_trie(trie,widecharmap,revcharmap) 1967 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc) 1968 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc) 1969 1970 These routines dump out a trie in a somewhat readable format. 1971 The _interim_ variants are used for debugging the interim 1972 tables that are used to generate the final compressed 1973 representation which is what dump_trie expects. 1974 1975 Part of the reason for their existence is to provide a form 1976 of documentation as to how the different representations function. 1977 1978 */ 1979 1980 /* 1981 Dumps the final compressed table form of the trie to Perl_debug_log. 1982 Used for debugging make_trie(). 1983 */ 1984 1985 STATIC void 1986 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, 1987 AV *revcharmap, U32 depth) 1988 { 1989 U32 state; 1990 SV *sv=sv_newmortal(); 1991 int colwidth= widecharmap ? 6 : 4; 1992 U16 word; 1993 GET_RE_DEBUG_FLAGS_DECL; 1994 1995 PERL_ARGS_ASSERT_DUMP_TRIE; 1996 1997 Perl_re_indentf( aTHX_ "Char : %-6s%-6s%-4s ", 1998 depth+1, "Match","Base","Ofs" ); 1999 2000 for( state = 0 ; state < trie->uniquecharcount ; state++ ) { 2001 SV ** const tmp = av_fetch( revcharmap, state, 0); 2002 if ( tmp ) { 2003 Perl_re_printf( aTHX_ "%*s", 2004 colwidth, 2005 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, 2006 PL_colors[0], PL_colors[1], 2007 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | 2008 PERL_PV_ESCAPE_FIRSTCHAR 2009 ) 2010 ); 2011 } 2012 } 2013 Perl_re_printf( aTHX_ "\n"); 2014 Perl_re_indentf( aTHX_ "State|-----------------------", depth+1); 2015 2016 for( state = 0 ; state < trie->uniquecharcount ; state++ ) 2017 Perl_re_printf( aTHX_ "%.*s", colwidth, "--------"); 2018 Perl_re_printf( aTHX_ "\n"); 2019 2020 for( state = 1 ; state < trie->statecount ; state++ ) { 2021 const U32 base = trie->states[ state ].trans.base; 2022 2023 Perl_re_indentf( aTHX_ "#%4"UVXf"|", depth+1, (UV)state); 2024 2025 if ( trie->states[ state ].wordnum ) { 2026 Perl_re_printf( aTHX_ " W%4X", trie->states[ state ].wordnum ); 2027 } else { 2028 Perl_re_printf( aTHX_ "%6s", "" ); 2029 } 2030 2031 Perl_re_printf( aTHX_ " @%4"UVXf" ", (UV)base ); 2032 2033 if ( base ) { 2034 U32 ofs = 0; 2035 2036 while( ( base + ofs < trie->uniquecharcount ) || 2037 ( base + ofs - trie->uniquecharcount < trie->lasttrans 2038 && trie->trans[ base + ofs - trie->uniquecharcount ].check 2039 != state)) 2040 ofs++; 2041 2042 Perl_re_printf( aTHX_ "+%2"UVXf"[ ", (UV)ofs); 2043 2044 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) { 2045 if ( ( base + ofs >= trie->uniquecharcount ) 2046 && ( base + ofs - trie->uniquecharcount 2047 < trie->lasttrans ) 2048 && trie->trans[ base + ofs 2049 - trie->uniquecharcount ].check == state ) 2050 { 2051 Perl_re_printf( aTHX_ "%*"UVXf, colwidth, 2052 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next 2053 ); 2054 } else { 2055 Perl_re_printf( aTHX_ "%*s",colwidth," ." ); 2056 } 2057 } 2058 2059 Perl_re_printf( aTHX_ "]"); 2060 2061 } 2062 Perl_re_printf( aTHX_ "\n" ); 2063 } 2064 Perl_re_indentf( aTHX_ "word_info N:(prev,len)=", 2065 depth); 2066 for (word=1; word <= trie->wordcount; word++) { 2067 Perl_re_printf( aTHX_ " %d:(%d,%d)", 2068 (int)word, (int)(trie->wordinfo[word].prev), 2069 (int)(trie->wordinfo[word].len)); 2070 } 2071 Perl_re_printf( aTHX_ "\n" ); 2072 } 2073 /* 2074 Dumps a fully constructed but uncompressed trie in list form. 2075 List tries normally only are used for construction when the number of 2076 possible chars (trie->uniquecharcount) is very high. 2077 Used for debugging make_trie(). 2078 */ 2079 STATIC void 2080 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, 2081 HV *widecharmap, AV *revcharmap, U32 next_alloc, 2082 U32 depth) 2083 { 2084 U32 state; 2085 SV *sv=sv_newmortal(); 2086 int colwidth= widecharmap ? 6 : 4; 2087 GET_RE_DEBUG_FLAGS_DECL; 2088 2089 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST; 2090 2091 /* print out the table precompression. */ 2092 Perl_re_indentf( aTHX_ "State :Word | Transition Data\n", 2093 depth+1 ); 2094 Perl_re_indentf( aTHX_ "%s", 2095 depth+1, "------:-----+-----------------\n" ); 2096 2097 for( state=1 ; state < next_alloc ; state ++ ) { 2098 U16 charid; 2099 2100 Perl_re_indentf( aTHX_ " %4"UVXf" :", 2101 depth+1, (UV)state ); 2102 if ( ! trie->states[ state ].wordnum ) { 2103 Perl_re_printf( aTHX_ "%5s| ",""); 2104 } else { 2105 Perl_re_printf( aTHX_ "W%4x| ", 2106 trie->states[ state ].wordnum 2107 ); 2108 } 2109 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) { 2110 SV ** const tmp = av_fetch( revcharmap, 2111 TRIE_LIST_ITEM(state,charid).forid, 0); 2112 if ( tmp ) { 2113 Perl_re_printf( aTHX_ "%*s:%3X=%4"UVXf" | ", 2114 colwidth, 2115 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 2116 colwidth, 2117 PL_colors[0], PL_colors[1], 2118 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) 2119 | PERL_PV_ESCAPE_FIRSTCHAR 2120 ) , 2121 TRIE_LIST_ITEM(state,charid).forid, 2122 (UV)TRIE_LIST_ITEM(state,charid).newstate 2123 ); 2124 if (!(charid % 10)) 2125 Perl_re_printf( aTHX_ "\n%*s| ", 2126 (int)((depth * 2) + 14), ""); 2127 } 2128 } 2129 Perl_re_printf( aTHX_ "\n"); 2130 } 2131 } 2132 2133 /* 2134 Dumps a fully constructed but uncompressed trie in table form. 2135 This is the normal DFA style state transition table, with a few 2136 twists to facilitate compression later. 2137 Used for debugging make_trie(). 2138 */ 2139 STATIC void 2140 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, 2141 HV *widecharmap, AV *revcharmap, U32 next_alloc, 2142 U32 depth) 2143 { 2144 U32 state; 2145 U16 charid; 2146 SV *sv=sv_newmortal(); 2147 int colwidth= widecharmap ? 6 : 4; 2148 GET_RE_DEBUG_FLAGS_DECL; 2149 2150 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE; 2151 2152 /* 2153 print out the table precompression so that we can do a visual check 2154 that they are identical. 2155 */ 2156 2157 Perl_re_indentf( aTHX_ "Char : ", depth+1 ); 2158 2159 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) { 2160 SV ** const tmp = av_fetch( revcharmap, charid, 0); 2161 if ( tmp ) { 2162 Perl_re_printf( aTHX_ "%*s", 2163 colwidth, 2164 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, 2165 PL_colors[0], PL_colors[1], 2166 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | 2167 PERL_PV_ESCAPE_FIRSTCHAR 2168 ) 2169 ); 2170 } 2171 } 2172 2173 Perl_re_printf( aTHX_ "\n"); 2174 Perl_re_indentf( aTHX_ "State+-", depth+1 ); 2175 2176 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) { 2177 Perl_re_printf( aTHX_ "%.*s", colwidth,"--------"); 2178 } 2179 2180 Perl_re_printf( aTHX_ "\n" ); 2181 2182 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) { 2183 2184 Perl_re_indentf( aTHX_ "%4"UVXf" : ", 2185 depth+1, 2186 (UV)TRIE_NODENUM( state ) ); 2187 2188 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) { 2189 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ); 2190 if (v) 2191 Perl_re_printf( aTHX_ "%*"UVXf, colwidth, v ); 2192 else 2193 Perl_re_printf( aTHX_ "%*s", colwidth, "." ); 2194 } 2195 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) { 2196 Perl_re_printf( aTHX_ " (%4"UVXf")\n", 2197 (UV)trie->trans[ state ].check ); 2198 } else { 2199 Perl_re_printf( aTHX_ " (%4"UVXf") W%4X\n", 2200 (UV)trie->trans[ state ].check, 2201 trie->states[ TRIE_NODENUM( state ) ].wordnum ); 2202 } 2203 } 2204 } 2205 2206 #endif 2207 2208 2209 /* make_trie(startbranch,first,last,tail,word_count,flags,depth) 2210 startbranch: the first branch in the whole branch sequence 2211 first : start branch of sequence of branch-exact nodes. 2212 May be the same as startbranch 2213 last : Thing following the last branch. 2214 May be the same as tail. 2215 tail : item following the branch sequence 2216 count : words in the sequence 2217 flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/ 2218 depth : indent depth 2219 2220 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node. 2221 2222 A trie is an N'ary tree where the branches are determined by digital 2223 decomposition of the key. IE, at the root node you look up the 1st character and 2224 follow that branch repeat until you find the end of the branches. Nodes can be 2225 marked as "accepting" meaning they represent a complete word. Eg: 2226 2227 /he|she|his|hers/ 2228 2229 would convert into the following structure. Numbers represent states, letters 2230 following numbers represent valid transitions on the letter from that state, if 2231 the number is in square brackets it represents an accepting state, otherwise it 2232 will be in parenthesis. 2233 2234 +-h->+-e->[3]-+-r->(8)-+-s->[9] 2235 | | 2236 | (2) 2237 | | 2238 (1) +-i->(6)-+-s->[7] 2239 | 2240 +-s->(3)-+-h->(4)-+-e->[5] 2241 2242 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers) 2243 2244 This shows that when matching against the string 'hers' we will begin at state 1 2245 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting, 2246 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which 2247 is also accepting. Thus we know that we can match both 'he' and 'hers' with a 2248 single traverse. We store a mapping from accepting to state to which word was 2249 matched, and then when we have multiple possibilities we try to complete the 2250 rest of the regex in the order in which they occurred in the alternation. 2251 2252 The only prior NFA like behaviour that would be changed by the TRIE support is 2253 the silent ignoring of duplicate alternations which are of the form: 2254 2255 / (DUPE|DUPE) X? (?{ ... }) Y /x 2256 2257 Thus EVAL blocks following a trie may be called a different number of times with 2258 and without the optimisation. With the optimisations dupes will be silently 2259 ignored. This inconsistent behaviour of EVAL type nodes is well established as 2260 the following demonstrates: 2261 2262 'words'=~/(word|word|word)(?{ print $1 })[xyz]/ 2263 2264 which prints out 'word' three times, but 2265 2266 'words'=~/(word|word|word)(?{ print $1 })S/ 2267 2268 which doesnt print it out at all. This is due to other optimisations kicking in. 2269 2270 Example of what happens on a structural level: 2271 2272 The regexp /(ac|ad|ab)+/ will produce the following debug output: 2273 2274 1: CURLYM[1] {1,32767}(18) 2275 5: BRANCH(8) 2276 6: EXACT <ac>(16) 2277 8: BRANCH(11) 2278 9: EXACT <ad>(16) 2279 11: BRANCH(14) 2280 12: EXACT <ab>(16) 2281 16: SUCCEED(0) 2282 17: NOTHING(18) 2283 18: END(0) 2284 2285 This would be optimizable with startbranch=5, first=5, last=16, tail=16 2286 and should turn into: 2287 2288 1: CURLYM[1] {1,32767}(18) 2289 5: TRIE(16) 2290 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1] 2291 <ac> 2292 <ad> 2293 <ab> 2294 16: SUCCEED(0) 2295 17: NOTHING(18) 2296 18: END(0) 2297 2298 Cases where tail != last would be like /(?foo|bar)baz/: 2299 2300 1: BRANCH(4) 2301 2: EXACT <foo>(8) 2302 4: BRANCH(7) 2303 5: EXACT <bar>(8) 2304 7: TAIL(8) 2305 8: EXACT <baz>(10) 2306 10: END(0) 2307 2308 which would be optimizable with startbranch=1, first=1, last=7, tail=8 2309 and would end up looking like: 2310 2311 1: TRIE(8) 2312 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1] 2313 <foo> 2314 <bar> 2315 7: TAIL(8) 2316 8: EXACT <baz>(10) 2317 10: END(0) 2318 2319 d = uvchr_to_utf8_flags(d, uv, 0); 2320 2321 is the recommended Unicode-aware way of saying 2322 2323 *(d++) = uv; 2324 */ 2325 2326 #define TRIE_STORE_REVCHAR(val) \ 2327 STMT_START { \ 2328 if (UTF) { \ 2329 SV *zlopp = newSV(UTF8_MAXBYTES); \ 2330 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \ 2331 unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \ 2332 SvCUR_set(zlopp, kapow - flrbbbbb); \ 2333 SvPOK_on(zlopp); \ 2334 SvUTF8_on(zlopp); \ 2335 av_push(revcharmap, zlopp); \ 2336 } else { \ 2337 char ooooff = (char)val; \ 2338 av_push(revcharmap, newSVpvn(&ooooff, 1)); \ 2339 } \ 2340 } STMT_END 2341 2342 /* This gets the next character from the input, folding it if not already 2343 * folded. */ 2344 #define TRIE_READ_CHAR STMT_START { \ 2345 wordlen++; \ 2346 if ( UTF ) { \ 2347 /* if it is UTF then it is either already folded, or does not need \ 2348 * folding */ \ 2349 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \ 2350 } \ 2351 else if (folder == PL_fold_latin1) { \ 2352 /* This folder implies Unicode rules, which in the range expressible \ 2353 * by not UTF is the lower case, with the two exceptions, one of \ 2354 * which should have been taken care of before calling this */ \ 2355 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \ 2356 uvc = toLOWER_L1(*uc); \ 2357 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \ 2358 len = 1; \ 2359 } else { \ 2360 /* raw data, will be folded later if needed */ \ 2361 uvc = (U32)*uc; \ 2362 len = 1; \ 2363 } \ 2364 } STMT_END 2365 2366 2367 2368 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \ 2369 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \ 2370 U32 ging = TRIE_LIST_LEN( state ) *= 2; \ 2371 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \ 2372 } \ 2373 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \ 2374 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \ 2375 TRIE_LIST_CUR( state )++; \ 2376 } STMT_END 2377 2378 #define TRIE_LIST_NEW(state) STMT_START { \ 2379 Newxz( trie->states[ state ].trans.list, \ 2380 4, reg_trie_trans_le ); \ 2381 TRIE_LIST_CUR( state ) = 1; \ 2382 TRIE_LIST_LEN( state ) = 4; \ 2383 } STMT_END 2384 2385 #define TRIE_HANDLE_WORD(state) STMT_START { \ 2386 U16 dupe= trie->states[ state ].wordnum; \ 2387 regnode * const noper_next = regnext( noper ); \ 2388 \ 2389 DEBUG_r({ \ 2390 /* store the word for dumping */ \ 2391 SV* tmp; \ 2392 if (OP(noper) != NOTHING) \ 2393 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \ 2394 else \ 2395 tmp = newSVpvn_utf8( "", 0, UTF ); \ 2396 av_push( trie_words, tmp ); \ 2397 }); \ 2398 \ 2399 curword++; \ 2400 trie->wordinfo[curword].prev = 0; \ 2401 trie->wordinfo[curword].len = wordlen; \ 2402 trie->wordinfo[curword].accept = state; \ 2403 \ 2404 if ( noper_next < tail ) { \ 2405 if (!trie->jump) \ 2406 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \ 2407 sizeof(U16) ); \ 2408 trie->jump[curword] = (U16)(noper_next - convert); \ 2409 if (!jumper) \ 2410 jumper = noper_next; \ 2411 if (!nextbranch) \ 2412 nextbranch= regnext(cur); \ 2413 } \ 2414 \ 2415 if ( dupe ) { \ 2416 /* It's a dupe. Pre-insert into the wordinfo[].prev */\ 2417 /* chain, so that when the bits of chain are later */\ 2418 /* linked together, the dups appear in the chain */\ 2419 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \ 2420 trie->wordinfo[dupe].prev = curword; \ 2421 } else { \ 2422 /* we haven't inserted this word yet. */ \ 2423 trie->states[ state ].wordnum = curword; \ 2424 } \ 2425 } STMT_END 2426 2427 2428 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \ 2429 ( ( base + charid >= ucharcount \ 2430 && base + charid < ubound \ 2431 && state == trie->trans[ base - ucharcount + charid ].check \ 2432 && trie->trans[ base - ucharcount + charid ].next ) \ 2433 ? trie->trans[ base - ucharcount + charid ].next \ 2434 : ( state==1 ? special : 0 ) \ 2435 ) 2436 2437 #define MADE_TRIE 1 2438 #define MADE_JUMP_TRIE 2 2439 #define MADE_EXACT_TRIE 4 2440 2441 STATIC I32 2442 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, 2443 regnode *first, regnode *last, regnode *tail, 2444 U32 word_count, U32 flags, U32 depth) 2445 { 2446 /* first pass, loop through and scan words */ 2447 reg_trie_data *trie; 2448 HV *widecharmap = NULL; 2449 AV *revcharmap = newAV(); 2450 regnode *cur; 2451 STRLEN len = 0; 2452 UV uvc = 0; 2453 U16 curword = 0; 2454 U32 next_alloc = 0; 2455 regnode *jumper = NULL; 2456 regnode *nextbranch = NULL; 2457 regnode *convert = NULL; 2458 U32 *prev_states; /* temp array mapping each state to previous one */ 2459 /* we just use folder as a flag in utf8 */ 2460 const U8 * folder = NULL; 2461 2462 #ifdef DEBUGGING 2463 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu")); 2464 AV *trie_words = NULL; 2465 /* along with revcharmap, this only used during construction but both are 2466 * useful during debugging so we store them in the struct when debugging. 2467 */ 2468 #else 2469 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu")); 2470 STRLEN trie_charcount=0; 2471 #endif 2472 SV *re_trie_maxbuff; 2473 GET_RE_DEBUG_FLAGS_DECL; 2474 2475 PERL_ARGS_ASSERT_MAKE_TRIE; 2476 #ifndef DEBUGGING 2477 PERL_UNUSED_ARG(depth); 2478 #endif 2479 2480 switch (flags) { 2481 case EXACT: case EXACTL: break; 2482 case EXACTFA: 2483 case EXACTFU_SS: 2484 case EXACTFU: 2485 case EXACTFLU8: folder = PL_fold_latin1; break; 2486 case EXACTF: folder = PL_fold; break; 2487 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] ); 2488 } 2489 2490 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) ); 2491 trie->refcount = 1; 2492 trie->startstate = 1; 2493 trie->wordcount = word_count; 2494 RExC_rxi->data->data[ data_slot ] = (void*)trie; 2495 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) ); 2496 if (flags == EXACT || flags == EXACTL) 2497 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 ); 2498 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc( 2499 trie->wordcount+1, sizeof(reg_trie_wordinfo)); 2500 2501 DEBUG_r({ 2502 trie_words = newAV(); 2503 }); 2504 2505 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1); 2506 assert(re_trie_maxbuff); 2507 if (!SvIOK(re_trie_maxbuff)) { 2508 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT); 2509 } 2510 DEBUG_TRIE_COMPILE_r({ 2511 Perl_re_indentf( aTHX_ 2512 "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n", 2513 depth+1, 2514 REG_NODE_NUM(startbranch),REG_NODE_NUM(first), 2515 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth); 2516 }); 2517 2518 /* Find the node we are going to overwrite */ 2519 if ( first == startbranch && OP( last ) != BRANCH ) { 2520 /* whole branch chain */ 2521 convert = first; 2522 } else { 2523 /* branch sub-chain */ 2524 convert = NEXTOPER( first ); 2525 } 2526 2527 /* -- First loop and Setup -- 2528 2529 We first traverse the branches and scan each word to determine if it 2530 contains widechars, and how many unique chars there are, this is 2531 important as we have to build a table with at least as many columns as we 2532 have unique chars. 2533 2534 We use an array of integers to represent the character codes 0..255 2535 (trie->charmap) and we use a an HV* to store Unicode characters. We use 2536 the native representation of the character value as the key and IV's for 2537 the coded index. 2538 2539 *TODO* If we keep track of how many times each character is used we can 2540 remap the columns so that the table compression later on is more 2541 efficient in terms of memory by ensuring the most common value is in the 2542 middle and the least common are on the outside. IMO this would be better 2543 than a most to least common mapping as theres a decent chance the most 2544 common letter will share a node with the least common, meaning the node 2545 will not be compressible. With a middle is most common approach the worst 2546 case is when we have the least common nodes twice. 2547 2548 */ 2549 2550 for ( cur = first ; cur < last ; cur = regnext( cur ) ) { 2551 regnode *noper = NEXTOPER( cur ); 2552 const U8 *uc; 2553 const U8 *e; 2554 int foldlen = 0; 2555 U32 wordlen = 0; /* required init */ 2556 STRLEN minchars = 0; 2557 STRLEN maxchars = 0; 2558 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the 2559 bitmap?*/ 2560 2561 if (OP(noper) == NOTHING) { 2562 regnode *noper_next= regnext(noper); 2563 if (noper_next < tail) 2564 noper= noper_next; 2565 } 2566 2567 if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) { 2568 uc= (U8*)STRING(noper); 2569 e= uc + STR_LEN(noper); 2570 } else { 2571 trie->minlen= 0; 2572 continue; 2573 } 2574 2575 2576 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */ 2577 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte 2578 regardless of encoding */ 2579 if (OP( noper ) == EXACTFU_SS) { 2580 /* false positives are ok, so just set this */ 2581 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S); 2582 } 2583 } 2584 for ( ; uc < e ; uc += len ) { /* Look at each char in the current 2585 branch */ 2586 TRIE_CHARCOUNT(trie)++; 2587 TRIE_READ_CHAR; 2588 2589 /* TRIE_READ_CHAR returns the current character, or its fold if /i 2590 * is in effect. Under /i, this character can match itself, or 2591 * anything that folds to it. If not under /i, it can match just 2592 * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN 2593 * all fold to k, and all are single characters. But some folds 2594 * expand to more than one character, so for example LATIN SMALL 2595 * LIGATURE FFI folds to the three character sequence 'ffi'. If 2596 * the string beginning at 'uc' is 'ffi', it could be matched by 2597 * three characters, or just by the one ligature character. (It 2598 * could also be matched by two characters: LATIN SMALL LIGATURE FF 2599 * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI). 2600 * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also 2601 * match.) The trie needs to know the minimum and maximum number 2602 * of characters that could match so that it can use size alone to 2603 * quickly reject many match attempts. The max is simple: it is 2604 * the number of folded characters in this branch (since a fold is 2605 * never shorter than what folds to it. */ 2606 2607 maxchars++; 2608 2609 /* And the min is equal to the max if not under /i (indicated by 2610 * 'folder' being NULL), or there are no multi-character folds. If 2611 * there is a multi-character fold, the min is incremented just 2612 * once, for the character that folds to the sequence. Each 2613 * character in the sequence needs to be added to the list below of 2614 * characters in the trie, but we count only the first towards the 2615 * min number of characters needed. This is done through the 2616 * variable 'foldlen', which is returned by the macros that look 2617 * for these sequences as the number of bytes the sequence 2618 * occupies. Each time through the loop, we decrement 'foldlen' by 2619 * how many bytes the current char occupies. Only when it reaches 2620 * 0 do we increment 'minchars' or look for another multi-character 2621 * sequence. */ 2622 if (folder == NULL) { 2623 minchars++; 2624 } 2625 else if (foldlen > 0) { 2626 foldlen -= (UTF) ? UTF8SKIP(uc) : 1; 2627 } 2628 else { 2629 minchars++; 2630 2631 /* See if *uc is the beginning of a multi-character fold. If 2632 * so, we decrement the length remaining to look at, to account 2633 * for the current character this iteration. (We can use 'uc' 2634 * instead of the fold returned by TRIE_READ_CHAR because for 2635 * non-UTF, the latin1_safe macro is smart enough to account 2636 * for all the unfolded characters, and because for UTF, the 2637 * string will already have been folded earlier in the 2638 * compilation process */ 2639 if (UTF) { 2640 if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) { 2641 foldlen -= UTF8SKIP(uc); 2642 } 2643 } 2644 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) { 2645 foldlen--; 2646 } 2647 } 2648 2649 /* The current character (and any potential folds) should be added 2650 * to the possible matching characters for this position in this 2651 * branch */ 2652 if ( uvc < 256 ) { 2653 if ( folder ) { 2654 U8 folded= folder[ (U8) uvc ]; 2655 if ( !trie->charmap[ folded ] ) { 2656 trie->charmap[ folded ]=( ++trie->uniquecharcount ); 2657 TRIE_STORE_REVCHAR( folded ); 2658 } 2659 } 2660 if ( !trie->charmap[ uvc ] ) { 2661 trie->charmap[ uvc ]=( ++trie->uniquecharcount ); 2662 TRIE_STORE_REVCHAR( uvc ); 2663 } 2664 if ( set_bit ) { 2665 /* store the codepoint in the bitmap, and its folded 2666 * equivalent. */ 2667 TRIE_BITMAP_SET(trie, uvc); 2668 2669 /* store the folded codepoint */ 2670 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]); 2671 2672 if ( !UTF ) { 2673 /* store first byte of utf8 representation of 2674 variant codepoints */ 2675 if (! UVCHR_IS_INVARIANT(uvc)) { 2676 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc)); 2677 } 2678 } 2679 set_bit = 0; /* We've done our bit :-) */ 2680 } 2681 } else { 2682 2683 /* XXX We could come up with the list of code points that fold 2684 * to this using PL_utf8_foldclosures, except not for 2685 * multi-char folds, as there may be multiple combinations 2686 * there that could work, which needs to wait until runtime to 2687 * resolve (The comment about LIGATURE FFI above is such an 2688 * example */ 2689 2690 SV** svpp; 2691 if ( !widecharmap ) 2692 widecharmap = newHV(); 2693 2694 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 ); 2695 2696 if ( !svpp ) 2697 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc ); 2698 2699 if ( !SvTRUE( *svpp ) ) { 2700 sv_setiv( *svpp, ++trie->uniquecharcount ); 2701 TRIE_STORE_REVCHAR(uvc); 2702 } 2703 } 2704 } /* end loop through characters in this branch of the trie */ 2705 2706 /* We take the min and max for this branch and combine to find the min 2707 * and max for all branches processed so far */ 2708 if( cur == first ) { 2709 trie->minlen = minchars; 2710 trie->maxlen = maxchars; 2711 } else if (minchars < trie->minlen) { 2712 trie->minlen = minchars; 2713 } else if (maxchars > trie->maxlen) { 2714 trie->maxlen = maxchars; 2715 } 2716 } /* end first pass */ 2717 DEBUG_TRIE_COMPILE_r( 2718 Perl_re_indentf( aTHX_ 2719 "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n", 2720 depth+1, 2721 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count, 2722 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount, 2723 (int)trie->minlen, (int)trie->maxlen ) 2724 ); 2725 2726 /* 2727 We now know what we are dealing with in terms of unique chars and 2728 string sizes so we can calculate how much memory a naive 2729 representation using a flat table will take. If it's over a reasonable 2730 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory 2731 conservative but potentially much slower representation using an array 2732 of lists. 2733 2734 At the end we convert both representations into the same compressed 2735 form that will be used in regexec.c for matching with. The latter 2736 is a form that cannot be used to construct with but has memory 2737 properties similar to the list form and access properties similar 2738 to the table form making it both suitable for fast searches and 2739 small enough that its feasable to store for the duration of a program. 2740 2741 See the comment in the code where the compressed table is produced 2742 inplace from the flat tabe representation for an explanation of how 2743 the compression works. 2744 2745 */ 2746 2747 2748 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32); 2749 prev_states[1] = 0; 2750 2751 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) 2752 > SvIV(re_trie_maxbuff) ) 2753 { 2754 /* 2755 Second Pass -- Array Of Lists Representation 2756 2757 Each state will be represented by a list of charid:state records 2758 (reg_trie_trans_le) the first such element holds the CUR and LEN 2759 points of the allocated array. (See defines above). 2760 2761 We build the initial structure using the lists, and then convert 2762 it into the compressed table form which allows faster lookups 2763 (but cant be modified once converted). 2764 */ 2765 2766 STRLEN transcount = 1; 2767 2768 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using list compiler\n", 2769 depth+1)); 2770 2771 trie->states = (reg_trie_state *) 2772 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2, 2773 sizeof(reg_trie_state) ); 2774 TRIE_LIST_NEW(1); 2775 next_alloc = 2; 2776 2777 for ( cur = first ; cur < last ; cur = regnext( cur ) ) { 2778 2779 regnode *noper = NEXTOPER( cur ); 2780 U32 state = 1; /* required init */ 2781 U16 charid = 0; /* sanity init */ 2782 U32 wordlen = 0; /* required init */ 2783 2784 if (OP(noper) == NOTHING) { 2785 regnode *noper_next= regnext(noper); 2786 if (noper_next < tail) 2787 noper= noper_next; 2788 } 2789 2790 if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) { 2791 const U8 *uc= (U8*)STRING(noper); 2792 const U8 *e= uc + STR_LEN(noper); 2793 2794 for ( ; uc < e ; uc += len ) { 2795 2796 TRIE_READ_CHAR; 2797 2798 if ( uvc < 256 ) { 2799 charid = trie->charmap[ uvc ]; 2800 } else { 2801 SV** const svpp = hv_fetch( widecharmap, 2802 (char*)&uvc, 2803 sizeof( UV ), 2804 0); 2805 if ( !svpp ) { 2806 charid = 0; 2807 } else { 2808 charid=(U16)SvIV( *svpp ); 2809 } 2810 } 2811 /* charid is now 0 if we dont know the char read, or 2812 * nonzero if we do */ 2813 if ( charid ) { 2814 2815 U16 check; 2816 U32 newstate = 0; 2817 2818 charid--; 2819 if ( !trie->states[ state ].trans.list ) { 2820 TRIE_LIST_NEW( state ); 2821 } 2822 for ( check = 1; 2823 check <= TRIE_LIST_USED( state ); 2824 check++ ) 2825 { 2826 if ( TRIE_LIST_ITEM( state, check ).forid 2827 == charid ) 2828 { 2829 newstate = TRIE_LIST_ITEM( state, check ).newstate; 2830 break; 2831 } 2832 } 2833 if ( ! newstate ) { 2834 newstate = next_alloc++; 2835 prev_states[newstate] = state; 2836 TRIE_LIST_PUSH( state, charid, newstate ); 2837 transcount++; 2838 } 2839 state = newstate; 2840 } else { 2841 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc ); 2842 } 2843 } 2844 } 2845 TRIE_HANDLE_WORD(state); 2846 2847 } /* end second pass */ 2848 2849 /* next alloc is the NEXT state to be allocated */ 2850 trie->statecount = next_alloc; 2851 trie->states = (reg_trie_state *) 2852 PerlMemShared_realloc( trie->states, 2853 next_alloc 2854 * sizeof(reg_trie_state) ); 2855 2856 /* and now dump it out before we compress it */ 2857 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap, 2858 revcharmap, next_alloc, 2859 depth+1) 2860 ); 2861 2862 trie->trans = (reg_trie_trans *) 2863 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) ); 2864 { 2865 U32 state; 2866 U32 tp = 0; 2867 U32 zp = 0; 2868 2869 2870 for( state=1 ; state < next_alloc ; state ++ ) { 2871 U32 base=0; 2872 2873 /* 2874 DEBUG_TRIE_COMPILE_MORE_r( 2875 Perl_re_printf( aTHX_ "tp: %d zp: %d ",tp,zp) 2876 ); 2877 */ 2878 2879 if (trie->states[state].trans.list) { 2880 U16 minid=TRIE_LIST_ITEM( state, 1).forid; 2881 U16 maxid=minid; 2882 U16 idx; 2883 2884 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) { 2885 const U16 forid = TRIE_LIST_ITEM( state, idx).forid; 2886 if ( forid < minid ) { 2887 minid=forid; 2888 } else if ( forid > maxid ) { 2889 maxid=forid; 2890 } 2891 } 2892 if ( transcount < tp + maxid - minid + 1) { 2893 transcount *= 2; 2894 trie->trans = (reg_trie_trans *) 2895 PerlMemShared_realloc( trie->trans, 2896 transcount 2897 * sizeof(reg_trie_trans) ); 2898 Zero( trie->trans + (transcount / 2), 2899 transcount / 2, 2900 reg_trie_trans ); 2901 } 2902 base = trie->uniquecharcount + tp - minid; 2903 if ( maxid == minid ) { 2904 U32 set = 0; 2905 for ( ; zp < tp ; zp++ ) { 2906 if ( ! trie->trans[ zp ].next ) { 2907 base = trie->uniquecharcount + zp - minid; 2908 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 2909 1).newstate; 2910 trie->trans[ zp ].check = state; 2911 set = 1; 2912 break; 2913 } 2914 } 2915 if ( !set ) { 2916 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 2917 1).newstate; 2918 trie->trans[ tp ].check = state; 2919 tp++; 2920 zp = tp; 2921 } 2922 } else { 2923 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) { 2924 const U32 tid = base 2925 - trie->uniquecharcount 2926 + TRIE_LIST_ITEM( state, idx ).forid; 2927 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, 2928 idx ).newstate; 2929 trie->trans[ tid ].check = state; 2930 } 2931 tp += ( maxid - minid + 1 ); 2932 } 2933 Safefree(trie->states[ state ].trans.list); 2934 } 2935 /* 2936 DEBUG_TRIE_COMPILE_MORE_r( 2937 Perl_re_printf( aTHX_ " base: %d\n",base); 2938 ); 2939 */ 2940 trie->states[ state ].trans.base=base; 2941 } 2942 trie->lasttrans = tp + 1; 2943 } 2944 } else { 2945 /* 2946 Second Pass -- Flat Table Representation. 2947 2948 we dont use the 0 slot of either trans[] or states[] so we add 1 to 2949 each. We know that we will need Charcount+1 trans at most to store 2950 the data (one row per char at worst case) So we preallocate both 2951 structures assuming worst case. 2952 2953 We then construct the trie using only the .next slots of the entry 2954 structs. 2955 2956 We use the .check field of the first entry of the node temporarily 2957 to make compression both faster and easier by keeping track of how 2958 many non zero fields are in the node. 2959 2960 Since trans are numbered from 1 any 0 pointer in the table is a FAIL 2961 transition. 2962 2963 There are two terms at use here: state as a TRIE_NODEIDX() which is 2964 a number representing the first entry of the node, and state as a 2965 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) 2966 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) 2967 if there are 2 entrys per node. eg: 2968 2969 A B A B 2970 1. 2 4 1. 3 7 2971 2. 0 3 3. 0 5 2972 3. 0 0 5. 0 0 2973 4. 0 0 7. 0 0 2974 2975 The table is internally in the right hand, idx form. However as we 2976 also have to deal with the states array which is indexed by nodenum 2977 we have to use TRIE_NODENUM() to convert. 2978 2979 */ 2980 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using table compiler\n", 2981 depth+1)); 2982 2983 trie->trans = (reg_trie_trans *) 2984 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 ) 2985 * trie->uniquecharcount + 1, 2986 sizeof(reg_trie_trans) ); 2987 trie->states = (reg_trie_state *) 2988 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2, 2989 sizeof(reg_trie_state) ); 2990 next_alloc = trie->uniquecharcount + 1; 2991 2992 2993 for ( cur = first ; cur < last ; cur = regnext( cur ) ) { 2994 2995 regnode *noper = NEXTOPER( cur ); 2996 2997 U32 state = 1; /* required init */ 2998 2999 U16 charid = 0; /* sanity init */ 3000 U32 accept_state = 0; /* sanity init */ 3001 3002 U32 wordlen = 0; /* required init */ 3003 3004 if (OP(noper) == NOTHING) { 3005 regnode *noper_next= regnext(noper); 3006 if (noper_next < tail) 3007 noper= noper_next; 3008 } 3009 3010 if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) { 3011 const U8 *uc= (U8*)STRING(noper); 3012 const U8 *e= uc + STR_LEN(noper); 3013 3014 for ( ; uc < e ; uc += len ) { 3015 3016 TRIE_READ_CHAR; 3017 3018 if ( uvc < 256 ) { 3019 charid = trie->charmap[ uvc ]; 3020 } else { 3021 SV* const * const svpp = hv_fetch( widecharmap, 3022 (char*)&uvc, 3023 sizeof( UV ), 3024 0); 3025 charid = svpp ? (U16)SvIV(*svpp) : 0; 3026 } 3027 if ( charid ) { 3028 charid--; 3029 if ( !trie->trans[ state + charid ].next ) { 3030 trie->trans[ state + charid ].next = next_alloc; 3031 trie->trans[ state ].check++; 3032 prev_states[TRIE_NODENUM(next_alloc)] 3033 = TRIE_NODENUM(state); 3034 next_alloc += trie->uniquecharcount; 3035 } 3036 state = trie->trans[ state + charid ].next; 3037 } else { 3038 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc ); 3039 } 3040 /* charid is now 0 if we dont know the char read, or 3041 * nonzero if we do */ 3042 } 3043 } 3044 accept_state = TRIE_NODENUM( state ); 3045 TRIE_HANDLE_WORD(accept_state); 3046 3047 } /* end second pass */ 3048 3049 /* and now dump it out before we compress it */ 3050 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap, 3051 revcharmap, 3052 next_alloc, depth+1)); 3053 3054 { 3055 /* 3056 * Inplace compress the table.* 3057 3058 For sparse data sets the table constructed by the trie algorithm will 3059 be mostly 0/FAIL transitions or to put it another way mostly empty. 3060 (Note that leaf nodes will not contain any transitions.) 3061 3062 This algorithm compresses the tables by eliminating most such 3063 transitions, at the cost of a modest bit of extra work during lookup: 3064 3065 - Each states[] entry contains a .base field which indicates the 3066 index in the state[] array wheres its transition data is stored. 3067 3068 - If .base is 0 there are no valid transitions from that node. 3069 3070 - If .base is nonzero then charid is added to it to find an entry in 3071 the trans array. 3072 3073 -If trans[states[state].base+charid].check!=state then the 3074 transition is taken to be a 0/Fail transition. Thus if there are fail 3075 transitions at the front of the node then the .base offset will point 3076 somewhere inside the previous nodes data (or maybe even into a node 3077 even earlier), but the .check field determines if the transition is 3078 valid. 3079 3080 XXX - wrong maybe? 3081 The following process inplace converts the table to the compressed 3082 table: We first do not compress the root node 1,and mark all its 3083 .check pointers as 1 and set its .base pointer as 1 as well. This 3084 allows us to do a DFA construction from the compressed table later, 3085 and ensures that any .base pointers we calculate later are greater 3086 than 0. 3087 3088 - We set 'pos' to indicate the first entry of the second node. 3089 3090 - We then iterate over the columns of the node, finding the first and 3091 last used entry at l and m. We then copy l..m into pos..(pos+m-l), 3092 and set the .check pointers accordingly, and advance pos 3093 appropriately and repreat for the next node. Note that when we copy 3094 the next pointers we have to convert them from the original 3095 NODEIDX form to NODENUM form as the former is not valid post 3096 compression. 3097 3098 - If a node has no transitions used we mark its base as 0 and do not 3099 advance the pos pointer. 3100 3101 - If a node only has one transition we use a second pointer into the 3102 structure to fill in allocated fail transitions from other states. 3103 This pointer is independent of the main pointer and scans forward 3104 looking for null transitions that are allocated to a state. When it 3105 finds one it writes the single transition into the "hole". If the 3106 pointer doesnt find one the single transition is appended as normal. 3107 3108 - Once compressed we can Renew/realloc the structures to release the 3109 excess space. 3110 3111 See "Table-Compression Methods" in sec 3.9 of the Red Dragon, 3112 specifically Fig 3.47 and the associated pseudocode. 3113 3114 demq 3115 */ 3116 const U32 laststate = TRIE_NODENUM( next_alloc ); 3117 U32 state, charid; 3118 U32 pos = 0, zp=0; 3119 trie->statecount = laststate; 3120 3121 for ( state = 1 ; state < laststate ; state++ ) { 3122 U8 flag = 0; 3123 const U32 stateidx = TRIE_NODEIDX( state ); 3124 const U32 o_used = trie->trans[ stateidx ].check; 3125 U32 used = trie->trans[ stateidx ].check; 3126 trie->trans[ stateidx ].check = 0; 3127 3128 for ( charid = 0; 3129 used && charid < trie->uniquecharcount; 3130 charid++ ) 3131 { 3132 if ( flag || trie->trans[ stateidx + charid ].next ) { 3133 if ( trie->trans[ stateidx + charid ].next ) { 3134 if (o_used == 1) { 3135 for ( ; zp < pos ; zp++ ) { 3136 if ( ! trie->trans[ zp ].next ) { 3137 break; 3138 } 3139 } 3140 trie->states[ state ].trans.base 3141 = zp 3142 + trie->uniquecharcount 3143 - charid ; 3144 trie->trans[ zp ].next 3145 = SAFE_TRIE_NODENUM( trie->trans[ stateidx 3146 + charid ].next ); 3147 trie->trans[ zp ].check = state; 3148 if ( ++zp > pos ) pos = zp; 3149 break; 3150 } 3151 used--; 3152 } 3153 if ( !flag ) { 3154 flag = 1; 3155 trie->states[ state ].trans.base 3156 = pos + trie->uniquecharcount - charid ; 3157 } 3158 trie->trans[ pos ].next 3159 = SAFE_TRIE_NODENUM( 3160 trie->trans[ stateidx + charid ].next ); 3161 trie->trans[ pos ].check = state; 3162 pos++; 3163 } 3164 } 3165 } 3166 trie->lasttrans = pos + 1; 3167 trie->states = (reg_trie_state *) 3168 PerlMemShared_realloc( trie->states, laststate 3169 * sizeof(reg_trie_state) ); 3170 DEBUG_TRIE_COMPILE_MORE_r( 3171 Perl_re_indentf( aTHX_ "Alloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n", 3172 depth+1, 3173 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount 3174 + 1 ), 3175 (IV)next_alloc, 3176 (IV)pos, 3177 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc ); 3178 ); 3179 3180 } /* end table compress */ 3181 } 3182 DEBUG_TRIE_COMPILE_MORE_r( 3183 Perl_re_indentf( aTHX_ "Statecount:%"UVxf" Lasttrans:%"UVxf"\n", 3184 depth+1, 3185 (UV)trie->statecount, 3186 (UV)trie->lasttrans) 3187 ); 3188 /* resize the trans array to remove unused space */ 3189 trie->trans = (reg_trie_trans *) 3190 PerlMemShared_realloc( trie->trans, trie->lasttrans 3191 * sizeof(reg_trie_trans) ); 3192 3193 { /* Modify the program and insert the new TRIE node */ 3194 U8 nodetype =(U8)(flags & 0xFF); 3195 char *str=NULL; 3196 3197 #ifdef DEBUGGING 3198 regnode *optimize = NULL; 3199 #ifdef RE_TRACK_PATTERN_OFFSETS 3200 3201 U32 mjd_offset = 0; 3202 U32 mjd_nodelen = 0; 3203 #endif /* RE_TRACK_PATTERN_OFFSETS */ 3204 #endif /* DEBUGGING */ 3205 /* 3206 This means we convert either the first branch or the first Exact, 3207 depending on whether the thing following (in 'last') is a branch 3208 or not and whther first is the startbranch (ie is it a sub part of 3209 the alternation or is it the whole thing.) 3210 Assuming its a sub part we convert the EXACT otherwise we convert 3211 the whole branch sequence, including the first. 3212 */ 3213 /* Find the node we are going to overwrite */ 3214 if ( first != startbranch || OP( last ) == BRANCH ) { 3215 /* branch sub-chain */ 3216 NEXT_OFF( first ) = (U16)(last - first); 3217 #ifdef RE_TRACK_PATTERN_OFFSETS 3218 DEBUG_r({ 3219 mjd_offset= Node_Offset((convert)); 3220 mjd_nodelen= Node_Length((convert)); 3221 }); 3222 #endif 3223 /* whole branch chain */ 3224 } 3225 #ifdef RE_TRACK_PATTERN_OFFSETS 3226 else { 3227 DEBUG_r({ 3228 const regnode *nop = NEXTOPER( convert ); 3229 mjd_offset= Node_Offset((nop)); 3230 mjd_nodelen= Node_Length((nop)); 3231 }); 3232 } 3233 DEBUG_OPTIMISE_r( 3234 Perl_re_indentf( aTHX_ "MJD offset:%"UVuf" MJD length:%"UVuf"\n", 3235 depth+1, 3236 (UV)mjd_offset, (UV)mjd_nodelen) 3237 ); 3238 #endif 3239 /* But first we check to see if there is a common prefix we can 3240 split out as an EXACT and put in front of the TRIE node. */ 3241 trie->startstate= 1; 3242 if ( trie->bitmap && !widecharmap && !trie->jump ) { 3243 U32 state; 3244 for ( state = 1 ; state < trie->statecount-1 ; state++ ) { 3245 U32 ofs = 0; 3246 I32 idx = -1; 3247 U32 count = 0; 3248 const U32 base = trie->states[ state ].trans.base; 3249 3250 if ( trie->states[state].wordnum ) 3251 count = 1; 3252 3253 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) { 3254 if ( ( base + ofs >= trie->uniquecharcount ) && 3255 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) && 3256 trie->trans[ base + ofs - trie->uniquecharcount ].check == state ) 3257 { 3258 if ( ++count > 1 ) { 3259 SV **tmp = av_fetch( revcharmap, ofs, 0); 3260 const U8 *ch = (U8*)SvPV_nolen_const( *tmp ); 3261 if ( state == 1 ) break; 3262 if ( count == 2 ) { 3263 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char); 3264 DEBUG_OPTIMISE_r( 3265 Perl_re_indentf( aTHX_ "New Start State=%"UVuf" Class: [", 3266 depth+1, 3267 (UV)state)); 3268 if (idx >= 0) { 3269 SV ** const tmp = av_fetch( revcharmap, idx, 0); 3270 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp ); 3271 3272 TRIE_BITMAP_SET(trie,*ch); 3273 if ( folder ) 3274 TRIE_BITMAP_SET(trie, folder[ *ch ]); 3275 DEBUG_OPTIMISE_r( 3276 Perl_re_printf( aTHX_ "%s", (char*)ch) 3277 ); 3278 } 3279 } 3280 TRIE_BITMAP_SET(trie,*ch); 3281 if ( folder ) 3282 TRIE_BITMAP_SET(trie,folder[ *ch ]); 3283 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch)); 3284 } 3285 idx = ofs; 3286 } 3287 } 3288 if ( count == 1 ) { 3289 SV **tmp = av_fetch( revcharmap, idx, 0); 3290 STRLEN len; 3291 char *ch = SvPV( *tmp, len ); 3292 DEBUG_OPTIMISE_r({ 3293 SV *sv=sv_newmortal(); 3294 Perl_re_indentf( aTHX_ "Prefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n", 3295 depth+1, 3296 (UV)state, (UV)idx, 3297 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6, 3298 PL_colors[0], PL_colors[1], 3299 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | 3300 PERL_PV_ESCAPE_FIRSTCHAR 3301 ) 3302 ); 3303 }); 3304 if ( state==1 ) { 3305 OP( convert ) = nodetype; 3306 str=STRING(convert); 3307 STR_LEN(convert)=0; 3308 } 3309 STR_LEN(convert) += len; 3310 while (len--) 3311 *str++ = *ch++; 3312 } else { 3313 #ifdef DEBUGGING 3314 if (state>1) 3315 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "]\n")); 3316 #endif 3317 break; 3318 } 3319 } 3320 trie->prefixlen = (state-1); 3321 if (str) { 3322 regnode *n = convert+NODE_SZ_STR(convert); 3323 NEXT_OFF(convert) = NODE_SZ_STR(convert); 3324 trie->startstate = state; 3325 trie->minlen -= (state - 1); 3326 trie->maxlen -= (state - 1); 3327 #ifdef DEBUGGING 3328 /* At least the UNICOS C compiler choked on this 3329 * being argument to DEBUG_r(), so let's just have 3330 * it right here. */ 3331 if ( 3332 #ifdef PERL_EXT_RE_BUILD 3333 1 3334 #else 3335 DEBUG_r_TEST 3336 #endif 3337 ) { 3338 regnode *fix = convert; 3339 U32 word = trie->wordcount; 3340 mjd_nodelen++; 3341 Set_Node_Offset_Length(convert, mjd_offset, state - 1); 3342 while( ++fix < n ) { 3343 Set_Node_Offset_Length(fix, 0, 0); 3344 } 3345 while (word--) { 3346 SV ** const tmp = av_fetch( trie_words, word, 0 ); 3347 if (tmp) { 3348 if ( STR_LEN(convert) <= SvCUR(*tmp) ) 3349 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert)); 3350 else 3351 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp)); 3352 } 3353 } 3354 } 3355 #endif 3356 if (trie->maxlen) { 3357 convert = n; 3358 } else { 3359 NEXT_OFF(convert) = (U16)(tail - convert); 3360 DEBUG_r(optimize= n); 3361 } 3362 } 3363 } 3364 if (!jumper) 3365 jumper = last; 3366 if ( trie->maxlen ) { 3367 NEXT_OFF( convert ) = (U16)(tail - convert); 3368 ARG_SET( convert, data_slot ); 3369 /* Store the offset to the first unabsorbed branch in 3370 jump[0], which is otherwise unused by the jump logic. 3371 We use this when dumping a trie and during optimisation. */ 3372 if (trie->jump) 3373 trie->jump[0] = (U16)(nextbranch - convert); 3374 3375 /* If the start state is not accepting (meaning there is no empty string/NOTHING) 3376 * and there is a bitmap 3377 * and the first "jump target" node we found leaves enough room 3378 * then convert the TRIE node into a TRIEC node, with the bitmap 3379 * embedded inline in the opcode - this is hypothetically faster. 3380 */ 3381 if ( !trie->states[trie->startstate].wordnum 3382 && trie->bitmap 3383 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) ) 3384 { 3385 OP( convert ) = TRIEC; 3386 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char); 3387 PerlMemShared_free(trie->bitmap); 3388 trie->bitmap= NULL; 3389 } else 3390 OP( convert ) = TRIE; 3391 3392 /* store the type in the flags */ 3393 convert->flags = nodetype; 3394 DEBUG_r({ 3395 optimize = convert 3396 + NODE_STEP_REGNODE 3397 + regarglen[ OP( convert ) ]; 3398 }); 3399 /* XXX We really should free up the resource in trie now, 3400 as we won't use them - (which resources?) dmq */ 3401 } 3402 /* needed for dumping*/ 3403 DEBUG_r(if (optimize) { 3404 regnode *opt = convert; 3405 3406 while ( ++opt < optimize) { 3407 Set_Node_Offset_Length(opt,0,0); 3408 } 3409 /* 3410 Try to clean up some of the debris left after the 3411 optimisation. 3412 */ 3413 while( optimize < jumper ) { 3414 mjd_nodelen += Node_Length((optimize)); 3415 OP( optimize ) = OPTIMIZED; 3416 Set_Node_Offset_Length(optimize,0,0); 3417 optimize++; 3418 } 3419 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen); 3420 }); 3421 } /* end node insert */ 3422 3423 /* Finish populating the prev field of the wordinfo array. Walk back 3424 * from each accept state until we find another accept state, and if 3425 * so, point the first word's .prev field at the second word. If the 3426 * second already has a .prev field set, stop now. This will be the 3427 * case either if we've already processed that word's accept state, 3428 * or that state had multiple words, and the overspill words were 3429 * already linked up earlier. 3430 */ 3431 { 3432 U16 word; 3433 U32 state; 3434 U16 prev; 3435 3436 for (word=1; word <= trie->wordcount; word++) { 3437 prev = 0; 3438 if (trie->wordinfo[word].prev) 3439 continue; 3440 state = trie->wordinfo[word].accept; 3441 while (state) { 3442 state = prev_states[state]; 3443 if (!state) 3444 break; 3445 prev = trie->states[state].wordnum; 3446 if (prev) 3447 break; 3448 } 3449 trie->wordinfo[word].prev = prev; 3450 } 3451 Safefree(prev_states); 3452 } 3453 3454 3455 /* and now dump out the compressed format */ 3456 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1)); 3457 3458 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap; 3459 #ifdef DEBUGGING 3460 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words; 3461 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap; 3462 #else 3463 SvREFCNT_dec_NN(revcharmap); 3464 #endif 3465 return trie->jump 3466 ? MADE_JUMP_TRIE 3467 : trie->startstate>1 3468 ? MADE_EXACT_TRIE 3469 : MADE_TRIE; 3470 } 3471 3472 STATIC regnode * 3473 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth) 3474 { 3475 /* The Trie is constructed and compressed now so we can build a fail array if 3476 * it's needed 3477 3478 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3479 3.32 in the 3480 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, 3481 Ullman 1985/88 3482 ISBN 0-201-10088-6 3483 3484 We find the fail state for each state in the trie, this state is the longest 3485 proper suffix of the current state's 'word' that is also a proper prefix of 3486 another word in our trie. State 1 represents the word '' and is thus the 3487 default fail state. This allows the DFA not to have to restart after its 3488 tried and failed a word at a given point, it simply continues as though it 3489 had been matching the other word in the first place. 3490 Consider 3491 'abcdgu'=~/abcdefg|cdgu/ 3492 When we get to 'd' we are still matching the first word, we would encounter 3493 'g' which would fail, which would bring us to the state representing 'd' in 3494 the second word where we would try 'g' and succeed, proceeding to match 3495 'cdgu'. 3496 */ 3497 /* add a fail transition */ 3498 const U32 trie_offset = ARG(source); 3499 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset]; 3500 U32 *q; 3501 const U32 ucharcount = trie->uniquecharcount; 3502 const U32 numstates = trie->statecount; 3503 const U32 ubound = trie->lasttrans + ucharcount; 3504 U32 q_read = 0; 3505 U32 q_write = 0; 3506 U32 charid; 3507 U32 base = trie->states[ 1 ].trans.base; 3508 U32 *fail; 3509 reg_ac_data *aho; 3510 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T")); 3511 regnode *stclass; 3512 GET_RE_DEBUG_FLAGS_DECL; 3513 3514 PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE; 3515 PERL_UNUSED_CONTEXT; 3516 #ifndef DEBUGGING 3517 PERL_UNUSED_ARG(depth); 3518 #endif 3519 3520 if ( OP(source) == TRIE ) { 3521 struct regnode_1 *op = (struct regnode_1 *) 3522 PerlMemShared_calloc(1, sizeof(struct regnode_1)); 3523 StructCopy(source,op,struct regnode_1); 3524 stclass = (regnode *)op; 3525 } else { 3526 struct regnode_charclass *op = (struct regnode_charclass *) 3527 PerlMemShared_calloc(1, sizeof(struct regnode_charclass)); 3528 StructCopy(source,op,struct regnode_charclass); 3529 stclass = (regnode *)op; 3530 } 3531 OP(stclass)+=2; /* convert the TRIE type to its AHO-CORASICK equivalent */ 3532 3533 ARG_SET( stclass, data_slot ); 3534 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) ); 3535 RExC_rxi->data->data[ data_slot ] = (void*)aho; 3536 aho->trie=trie_offset; 3537 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) ); 3538 Copy( trie->states, aho->states, numstates, reg_trie_state ); 3539 Newxz( q, numstates, U32); 3540 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) ); 3541 aho->refcount = 1; 3542 fail = aho->fail; 3543 /* initialize fail[0..1] to be 1 so that we always have 3544 a valid final fail state */ 3545 fail[ 0 ] = fail[ 1 ] = 1; 3546 3547 for ( charid = 0; charid < ucharcount ; charid++ ) { 3548 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 ); 3549 if ( newstate ) { 3550 q[ q_write ] = newstate; 3551 /* set to point at the root */ 3552 fail[ q[ q_write++ ] ]=1; 3553 } 3554 } 3555 while ( q_read < q_write) { 3556 const U32 cur = q[ q_read++ % numstates ]; 3557 base = trie->states[ cur ].trans.base; 3558 3559 for ( charid = 0 ; charid < ucharcount ; charid++ ) { 3560 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 ); 3561 if (ch_state) { 3562 U32 fail_state = cur; 3563 U32 fail_base; 3564 do { 3565 fail_state = fail[ fail_state ]; 3566 fail_base = aho->states[ fail_state ].trans.base; 3567 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) ); 3568 3569 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ); 3570 fail[ ch_state ] = fail_state; 3571 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum ) 3572 { 3573 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum; 3574 } 3575 q[ q_write++ % numstates] = ch_state; 3576 } 3577 } 3578 } 3579 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop 3580 when we fail in state 1, this allows us to use the 3581 charclass scan to find a valid start char. This is based on the principle 3582 that theres a good chance the string being searched contains lots of stuff 3583 that cant be a start char. 3584 */ 3585 fail[ 0 ] = fail[ 1 ] = 0; 3586 DEBUG_TRIE_COMPILE_r({ 3587 Perl_re_indentf( aTHX_ "Stclass Failtable (%"UVuf" states): 0", 3588 depth, (UV)numstates 3589 ); 3590 for( q_read=1; q_read<numstates; q_read++ ) { 3591 Perl_re_printf( aTHX_ ", %"UVuf, (UV)fail[q_read]); 3592 } 3593 Perl_re_printf( aTHX_ "\n"); 3594 }); 3595 Safefree(q); 3596 /*RExC_seen |= REG_TRIEDFA_SEEN;*/ 3597 return stclass; 3598 } 3599 3600 3601 #define DEBUG_PEEP(str,scan,depth) \ 3602 DEBUG_OPTIMISE_r({if (scan){ \ 3603 regnode *Next = regnext(scan); \ 3604 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);\ 3605 Perl_re_indentf( aTHX_ "" str ">%3d: %s (%d)", \ 3606 depth, REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),\ 3607 Next ? (REG_NODE_NUM(Next)) : 0 );\ 3608 DEBUG_SHOW_STUDY_FLAGS(flags," [ ","]");\ 3609 Perl_re_printf( aTHX_ "\n"); \ 3610 }}); 3611 3612 /* The below joins as many adjacent EXACTish nodes as possible into a single 3613 * one. The regop may be changed if the node(s) contain certain sequences that 3614 * require special handling. The joining is only done if: 3615 * 1) there is room in the current conglomerated node to entirely contain the 3616 * next one. 3617 * 2) they are the exact same node type 3618 * 3619 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and 3620 * these get optimized out 3621 * 3622 * XXX khw thinks this should be enhanced to fill EXACT (at least) nodes as full 3623 * as possible, even if that means splitting an existing node so that its first 3624 * part is moved to the preceeding node. This would maximise the efficiency of 3625 * memEQ during matching. Elsewhere in this file, khw proposes splitting 3626 * EXACTFish nodes into portions that don't change under folding vs those that 3627 * do. Those portions that don't change may be the only things in the pattern that 3628 * could be used to find fixed and floating strings. 3629 * 3630 * If a node is to match under /i (folded), the number of characters it matches 3631 * can be different than its character length if it contains a multi-character 3632 * fold. *min_subtract is set to the total delta number of characters of the 3633 * input nodes. 3634 * 3635 * And *unfolded_multi_char is set to indicate whether or not the node contains 3636 * an unfolded multi-char fold. This happens when whether the fold is valid or 3637 * not won't be known until runtime; namely for EXACTF nodes that contain LATIN 3638 * SMALL LETTER SHARP S, as only if the target string being matched against 3639 * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose 3640 * folding rules depend on the locale in force at runtime. (Multi-char folds 3641 * whose components are all above the Latin1 range are not run-time locale 3642 * dependent, and have already been folded by the time this function is 3643 * called.) 3644 * 3645 * This is as good a place as any to discuss the design of handling these 3646 * multi-character fold sequences. It's been wrong in Perl for a very long 3647 * time. There are three code points in Unicode whose multi-character folds 3648 * were long ago discovered to mess things up. The previous designs for 3649 * dealing with these involved assigning a special node for them. This 3650 * approach doesn't always work, as evidenced by this example: 3651 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches 3652 * Both sides fold to "sss", but if the pattern is parsed to create a node that 3653 * would match just the \xDF, it won't be able to handle the case where a 3654 * successful match would have to cross the node's boundary. The new approach 3655 * that hopefully generally solves the problem generates an EXACTFU_SS node 3656 * that is "sss" in this case. 3657 * 3658 * It turns out that there are problems with all multi-character folds, and not 3659 * just these three. Now the code is general, for all such cases. The 3660 * approach taken is: 3661 * 1) This routine examines each EXACTFish node that could contain multi- 3662 * character folded sequences. Since a single character can fold into 3663 * such a sequence, the minimum match length for this node is less than 3664 * the number of characters in the node. This routine returns in 3665 * *min_subtract how many characters to subtract from the the actual 3666 * length of the string to get a real minimum match length; it is 0 if 3667 * there are no multi-char foldeds. This delta is used by the caller to 3668 * adjust the min length of the match, and the delta between min and max, 3669 * so that the optimizer doesn't reject these possibilities based on size 3670 * constraints. 3671 * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS 3672 * is used for an EXACTFU node that contains at least one "ss" sequence in 3673 * it. For non-UTF-8 patterns and strings, this is the only case where 3674 * there is a possible fold length change. That means that a regular 3675 * EXACTFU node without UTF-8 involvement doesn't have to concern itself 3676 * with length changes, and so can be processed faster. regexec.c takes 3677 * advantage of this. Generally, an EXACTFish node that is in UTF-8 is 3678 * pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't 3679 * known until runtime). This saves effort in regex matching. However, 3680 * the pre-folding isn't done for non-UTF8 patterns because the fold of 3681 * the MICRO SIGN requires UTF-8, and we don't want to slow things down by 3682 * forcing the pattern into UTF8 unless necessary. Also what EXACTF (and, 3683 * again, EXACTFL) nodes fold to isn't known until runtime. The fold 3684 * possibilities for the non-UTF8 patterns are quite simple, except for 3685 * the sharp s. All the ones that don't involve a UTF-8 target string are 3686 * members of a fold-pair, and arrays are set up for all of them so that 3687 * the other member of the pair can be found quickly. Code elsewhere in 3688 * this file makes sure that in EXACTFU nodes, the sharp s gets folded to 3689 * 'ss', even if the pattern isn't UTF-8. This avoids the issues 3690 * described in the next item. 3691 * 3) A problem remains for unfolded multi-char folds. (These occur when the 3692 * validity of the fold won't be known until runtime, and so must remain 3693 * unfolded for now. This happens for the sharp s in EXACTF and EXACTFA 3694 * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot 3695 * be an EXACTF node with a UTF-8 pattern.) They also occur for various 3696 * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.) 3697 * The reason this is a problem is that the optimizer part of regexec.c 3698 * (probably unwittingly, in Perl_regexec_flags()) makes an assumption 3699 * that a character in the pattern corresponds to at most a single 3700 * character in the target string. (And I do mean character, and not byte 3701 * here, unlike other parts of the documentation that have never been 3702 * updated to account for multibyte Unicode.) sharp s in EXACTF and 3703 * EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes 3704 * it can match "\x{17F}\x{17F}". These, along with other ones in EXACTFL 3705 * nodes, violate the assumption, and they are the only instances where it 3706 * is violated. I'm reluctant to try to change the assumption, as the 3707 * code involved is impenetrable to me (khw), so instead the code here 3708 * punts. This routine examines EXACTFL nodes, and (when the pattern 3709 * isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a 3710 * boolean indicating whether or not the node contains such a fold. When 3711 * it is true, the caller sets a flag that later causes the optimizer in 3712 * this file to not set values for the floating and fixed string lengths, 3713 * and thus avoids the optimizer code in regexec.c that makes the invalid 3714 * assumption. Thus, there is no optimization based on string lengths for 3715 * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern 3716 * EXACTF and EXACTFA nodes that contain the sharp s. (The reason the 3717 * assumption is wrong only in these cases is that all other non-UTF-8 3718 * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to 3719 * their expanded versions. (Again, we can't prefold sharp s to 'ss' in 3720 * EXACTF nodes because we don't know at compile time if it actually 3721 * matches 'ss' or not. For EXACTF nodes it will match iff the target 3722 * string is in UTF-8. This is in contrast to EXACTFU nodes, where it 3723 * always matches; and EXACTFA where it never does. In an EXACTFA node in 3724 * a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the 3725 * problem; but in a non-UTF8 pattern, folding it to that above-Latin1 3726 * string would require the pattern to be forced into UTF-8, the overhead 3727 * of which we want to avoid. Similarly the unfolded multi-char folds in 3728 * EXACTFL nodes will match iff the locale at the time of match is a UTF-8 3729 * locale.) 3730 * 3731 * Similarly, the code that generates tries doesn't currently handle 3732 * not-already-folded multi-char folds, and it looks like a pain to change 3733 * that. Therefore, trie generation of EXACTFA nodes with the sharp s 3734 * doesn't work. Instead, such an EXACTFA is turned into a new regnode, 3735 * EXACTFA_NO_TRIE, which the trie code knows not to handle. Most people 3736 * using /iaa matching will be doing so almost entirely with ASCII 3737 * strings, so this should rarely be encountered in practice */ 3738 3739 #define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \ 3740 if (PL_regkind[OP(scan)] == EXACT) \ 3741 join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1) 3742 3743 STATIC U32 3744 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, 3745 UV *min_subtract, bool *unfolded_multi_char, 3746 U32 flags,regnode *val, U32 depth) 3747 { 3748 /* Merge several consecutive EXACTish nodes into one. */ 3749 regnode *n = regnext(scan); 3750 U32 stringok = 1; 3751 regnode *next = scan + NODE_SZ_STR(scan); 3752 U32 merged = 0; 3753 U32 stopnow = 0; 3754 #ifdef DEBUGGING 3755 regnode *stop = scan; 3756 GET_RE_DEBUG_FLAGS_DECL; 3757 #else 3758 PERL_UNUSED_ARG(depth); 3759 #endif 3760 3761 PERL_ARGS_ASSERT_JOIN_EXACT; 3762 #ifndef EXPERIMENTAL_INPLACESCAN 3763 PERL_UNUSED_ARG(flags); 3764 PERL_UNUSED_ARG(val); 3765 #endif 3766 DEBUG_PEEP("join",scan,depth); 3767 3768 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge 3769 * EXACT ones that are mergeable to the current one. */ 3770 while (n 3771 && (PL_regkind[OP(n)] == NOTHING 3772 || (stringok && OP(n) == OP(scan))) 3773 && NEXT_OFF(n) 3774 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) 3775 { 3776 3777 if (OP(n) == TAIL || n > next) 3778 stringok = 0; 3779 if (PL_regkind[OP(n)] == NOTHING) { 3780 DEBUG_PEEP("skip:",n,depth); 3781 NEXT_OFF(scan) += NEXT_OFF(n); 3782 next = n + NODE_STEP_REGNODE; 3783 #ifdef DEBUGGING 3784 if (stringok) 3785 stop = n; 3786 #endif 3787 n = regnext(n); 3788 } 3789 else if (stringok) { 3790 const unsigned int oldl = STR_LEN(scan); 3791 regnode * const nnext = regnext(n); 3792 3793 /* XXX I (khw) kind of doubt that this works on platforms (should 3794 * Perl ever run on one) where U8_MAX is above 255 because of lots 3795 * of other assumptions */ 3796 /* Don't join if the sum can't fit into a single node */ 3797 if (oldl + STR_LEN(n) > U8_MAX) 3798 break; 3799 3800 DEBUG_PEEP("merg",n,depth); 3801 merged++; 3802 3803 NEXT_OFF(scan) += NEXT_OFF(n); 3804 STR_LEN(scan) += STR_LEN(n); 3805 next = n + NODE_SZ_STR(n); 3806 /* Now we can overwrite *n : */ 3807 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char); 3808 #ifdef DEBUGGING 3809 stop = next - 1; 3810 #endif 3811 n = nnext; 3812 if (stopnow) break; 3813 } 3814 3815 #ifdef EXPERIMENTAL_INPLACESCAN 3816 if (flags && !NEXT_OFF(n)) { 3817 DEBUG_PEEP("atch", val, depth); 3818 if (reg_off_by_arg[OP(n)]) { 3819 ARG_SET(n, val - n); 3820 } 3821 else { 3822 NEXT_OFF(n) = val - n; 3823 } 3824 stopnow = 1; 3825 } 3826 #endif 3827 } 3828 3829 *min_subtract = 0; 3830 *unfolded_multi_char = FALSE; 3831 3832 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We 3833 * can now analyze for sequences of problematic code points. (Prior to 3834 * this final joining, sequences could have been split over boundaries, and 3835 * hence missed). The sequences only happen in folding, hence for any 3836 * non-EXACT EXACTish node */ 3837 if (OP(scan) != EXACT && OP(scan) != EXACTL) { 3838 U8* s0 = (U8*) STRING(scan); 3839 U8* s = s0; 3840 U8* s_end = s0 + STR_LEN(scan); 3841 3842 int total_count_delta = 0; /* Total delta number of characters that 3843 multi-char folds expand to */ 3844 3845 /* One pass is made over the node's string looking for all the 3846 * possibilities. To avoid some tests in the loop, there are two main 3847 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and 3848 * non-UTF-8 */ 3849 if (UTF) { 3850 U8* folded = NULL; 3851 3852 if (OP(scan) == EXACTFL) { 3853 U8 *d; 3854 3855 /* An EXACTFL node would already have been changed to another 3856 * node type unless there is at least one character in it that 3857 * is problematic; likely a character whose fold definition 3858 * won't be known until runtime, and so has yet to be folded. 3859 * For all but the UTF-8 locale, folds are 1-1 in length, but 3860 * to handle the UTF-8 case, we need to create a temporary 3861 * folded copy using UTF-8 locale rules in order to analyze it. 3862 * This is because our macros that look to see if a sequence is 3863 * a multi-char fold assume everything is folded (otherwise the 3864 * tests in those macros would be too complicated and slow). 3865 * Note that here, the non-problematic folds will have already 3866 * been done, so we can just copy such characters. We actually 3867 * don't completely fold the EXACTFL string. We skip the 3868 * unfolded multi-char folds, as that would just create work 3869 * below to figure out the size they already are */ 3870 3871 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8); 3872 d = folded; 3873 while (s < s_end) { 3874 STRLEN s_len = UTF8SKIP(s); 3875 if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) { 3876 Copy(s, d, s_len, U8); 3877 d += s_len; 3878 } 3879 else if (is_FOLDS_TO_MULTI_utf8(s)) { 3880 *unfolded_multi_char = TRUE; 3881 Copy(s, d, s_len, U8); 3882 d += s_len; 3883 } 3884 else if (isASCII(*s)) { 3885 *(d++) = toFOLD(*s); 3886 } 3887 else { 3888 STRLEN len; 3889 _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL); 3890 d += len; 3891 } 3892 s += s_len; 3893 } 3894 3895 /* Point the remainder of the routine to look at our temporary 3896 * folded copy */ 3897 s = folded; 3898 s_end = d; 3899 } /* End of creating folded copy of EXACTFL string */ 3900 3901 /* Examine the string for a multi-character fold sequence. UTF-8 3902 * patterns have all characters pre-folded by the time this code is 3903 * executed */ 3904 while (s < s_end - 1) /* Can stop 1 before the end, as minimum 3905 length sequence we are looking for is 2 */ 3906 { 3907 int count = 0; /* How many characters in a multi-char fold */ 3908 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end); 3909 if (! len) { /* Not a multi-char fold: get next char */ 3910 s += UTF8SKIP(s); 3911 continue; 3912 } 3913 3914 /* Nodes with 'ss' require special handling, except for 3915 * EXACTFA-ish for which there is no multi-char fold to this */ 3916 if (len == 2 && *s == 's' && *(s+1) == 's' 3917 && OP(scan) != EXACTFA 3918 && OP(scan) != EXACTFA_NO_TRIE) 3919 { 3920 count = 2; 3921 if (OP(scan) != EXACTFL) { 3922 OP(scan) = EXACTFU_SS; 3923 } 3924 s += 2; 3925 } 3926 else { /* Here is a generic multi-char fold. */ 3927 U8* multi_end = s + len; 3928 3929 /* Count how many characters are in it. In the case of 3930 * /aa, no folds which contain ASCII code points are 3931 * allowed, so check for those, and skip if found. */ 3932 if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) { 3933 count = utf8_length(s, multi_end); 3934 s = multi_end; 3935 } 3936 else { 3937 while (s < multi_end) { 3938 if (isASCII(*s)) { 3939 s++; 3940 goto next_iteration; 3941 } 3942 else { 3943 s += UTF8SKIP(s); 3944 } 3945 count++; 3946 } 3947 } 3948 } 3949 3950 /* The delta is how long the sequence is minus 1 (1 is how long 3951 * the character that folds to the sequence is) */ 3952 total_count_delta += count - 1; 3953 next_iteration: ; 3954 } 3955 3956 /* We created a temporary folded copy of the string in EXACTFL 3957 * nodes. Therefore we need to be sure it doesn't go below zero, 3958 * as the real string could be shorter */ 3959 if (OP(scan) == EXACTFL) { 3960 int total_chars = utf8_length((U8*) STRING(scan), 3961 (U8*) STRING(scan) + STR_LEN(scan)); 3962 if (total_count_delta > total_chars) { 3963 total_count_delta = total_chars; 3964 } 3965 } 3966 3967 *min_subtract += total_count_delta; 3968 Safefree(folded); 3969 } 3970 else if (OP(scan) == EXACTFA) { 3971 3972 /* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char 3973 * fold to the ASCII range (and there are no existing ones in the 3974 * upper latin1 range). But, as outlined in the comments preceding 3975 * this function, we need to flag any occurrences of the sharp s. 3976 * This character forbids trie formation (because of added 3977 * complexity) */ 3978 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \ 3979 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \ 3980 || UNICODE_DOT_DOT_VERSION > 0) 3981 while (s < s_end) { 3982 if (*s == LATIN_SMALL_LETTER_SHARP_S) { 3983 OP(scan) = EXACTFA_NO_TRIE; 3984 *unfolded_multi_char = TRUE; 3985 break; 3986 } 3987 s++; 3988 } 3989 } 3990 else { 3991 3992 /* Non-UTF-8 pattern, not EXACTFA node. Look for the multi-char 3993 * folds that are all Latin1. As explained in the comments 3994 * preceding this function, we look also for the sharp s in EXACTF 3995 * and EXACTFL nodes; it can be in the final position. Otherwise 3996 * we can stop looking 1 byte earlier because have to find at least 3997 * two characters for a multi-fold */ 3998 const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL) 3999 ? s_end 4000 : s_end -1; 4001 4002 while (s < upper) { 4003 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end); 4004 if (! len) { /* Not a multi-char fold. */ 4005 if (*s == LATIN_SMALL_LETTER_SHARP_S 4006 && (OP(scan) == EXACTF || OP(scan) == EXACTFL)) 4007 { 4008 *unfolded_multi_char = TRUE; 4009 } 4010 s++; 4011 continue; 4012 } 4013 4014 if (len == 2 4015 && isALPHA_FOLD_EQ(*s, 's') 4016 && isALPHA_FOLD_EQ(*(s+1), 's')) 4017 { 4018 4019 /* EXACTF nodes need to know that the minimum length 4020 * changed so that a sharp s in the string can match this 4021 * ss in the pattern, but they remain EXACTF nodes, as they 4022 * won't match this unless the target string is is UTF-8, 4023 * which we don't know until runtime. EXACTFL nodes can't 4024 * transform into EXACTFU nodes */ 4025 if (OP(scan) != EXACTF && OP(scan) != EXACTFL) { 4026 OP(scan) = EXACTFU_SS; 4027 } 4028 } 4029 4030 *min_subtract += len - 1; 4031 s += len; 4032 } 4033 #endif 4034 } 4035 } 4036 4037 #ifdef DEBUGGING 4038 /* Allow dumping but overwriting the collection of skipped 4039 * ops and/or strings with fake optimized ops */ 4040 n = scan + NODE_SZ_STR(scan); 4041 while (n <= stop) { 4042 OP(n) = OPTIMIZED; 4043 FLAGS(n) = 0; 4044 NEXT_OFF(n) = 0; 4045 n++; 4046 } 4047 #endif 4048 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)}); 4049 return stopnow; 4050 } 4051 4052 /* REx optimizer. Converts nodes into quicker variants "in place". 4053 Finds fixed substrings. */ 4054 4055 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set 4056 to the position after last scanned or to NULL. */ 4057 4058 #define INIT_AND_WITHP \ 4059 assert(!and_withp); \ 4060 Newx(and_withp,1, regnode_ssc); \ 4061 SAVEFREEPV(and_withp) 4062 4063 4064 static void 4065 S_unwind_scan_frames(pTHX_ const void *p) 4066 { 4067 scan_frame *f= (scan_frame *)p; 4068 do { 4069 scan_frame *n= f->next_frame; 4070 Safefree(f); 4071 f= n; 4072 } while (f); 4073 } 4074 4075 4076 STATIC SSize_t 4077 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, 4078 SSize_t *minlenp, SSize_t *deltap, 4079 regnode *last, 4080 scan_data_t *data, 4081 I32 stopparen, 4082 U32 recursed_depth, 4083 regnode_ssc *and_withp, 4084 U32 flags, U32 depth) 4085 /* scanp: Start here (read-write). */ 4086 /* deltap: Write maxlen-minlen here. */ 4087 /* last: Stop before this one. */ 4088 /* data: string data about the pattern */ 4089 /* stopparen: treat close N as END */ 4090 /* recursed: which subroutines have we recursed into */ 4091 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */ 4092 { 4093 /* There must be at least this number of characters to match */ 4094 SSize_t min = 0; 4095 I32 pars = 0, code; 4096 regnode *scan = *scanp, *next; 4097 SSize_t delta = 0; 4098 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF); 4099 int is_inf_internal = 0; /* The studied chunk is infinite */ 4100 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0; 4101 scan_data_t data_fake; 4102 SV *re_trie_maxbuff = NULL; 4103 regnode *first_non_open = scan; 4104 SSize_t stopmin = SSize_t_MAX; 4105 scan_frame *frame = NULL; 4106 GET_RE_DEBUG_FLAGS_DECL; 4107 4108 PERL_ARGS_ASSERT_STUDY_CHUNK; 4109 RExC_study_started= 1; 4110 4111 4112 if ( depth == 0 ) { 4113 while (first_non_open && OP(first_non_open) == OPEN) 4114 first_non_open=regnext(first_non_open); 4115 } 4116 4117 4118 fake_study_recurse: 4119 DEBUG_r( 4120 RExC_study_chunk_recursed_count++; 4121 ); 4122 DEBUG_OPTIMISE_MORE_r( 4123 { 4124 Perl_re_indentf( aTHX_ "study_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p", 4125 depth, (long)stopparen, 4126 (unsigned long)RExC_study_chunk_recursed_count, 4127 (unsigned long)depth, (unsigned long)recursed_depth, 4128 scan, 4129 last); 4130 if (recursed_depth) { 4131 U32 i; 4132 U32 j; 4133 for ( j = 0 ; j < recursed_depth ; j++ ) { 4134 for ( i = 0 ; i < (U32)RExC_npar ; i++ ) { 4135 if ( 4136 PAREN_TEST(RExC_study_chunk_recursed + 4137 ( j * RExC_study_chunk_recursed_bytes), i ) 4138 && ( 4139 !j || 4140 !PAREN_TEST(RExC_study_chunk_recursed + 4141 (( j - 1 ) * RExC_study_chunk_recursed_bytes), i) 4142 ) 4143 ) { 4144 Perl_re_printf( aTHX_ " %d",(int)i); 4145 break; 4146 } 4147 } 4148 if ( j + 1 < recursed_depth ) { 4149 Perl_re_printf( aTHX_ ","); 4150 } 4151 } 4152 } 4153 Perl_re_printf( aTHX_ "\n"); 4154 } 4155 ); 4156 while ( scan && OP(scan) != END && scan < last ){ 4157 UV min_subtract = 0; /* How mmany chars to subtract from the minimum 4158 node length to get a real minimum (because 4159 the folded version may be shorter) */ 4160 bool unfolded_multi_char = FALSE; 4161 /* Peephole optimizer: */ 4162 DEBUG_STUDYDATA("Peep:", data, depth); 4163 DEBUG_PEEP("Peep", scan, depth); 4164 4165 4166 /* The reason we do this here is that we need to deal with things like 4167 * /(?:f)(?:o)(?:o)/ which cant be dealt with by the normal EXACT 4168 * parsing code, as each (?:..) is handled by a different invocation of 4169 * reg() -- Yves 4170 */ 4171 JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0); 4172 4173 /* Follow the next-chain of the current node and optimize 4174 away all the NOTHINGs from it. */ 4175 if (OP(scan) != CURLYX) { 4176 const int max = (reg_off_by_arg[OP(scan)] 4177 ? I32_MAX 4178 /* I32 may be smaller than U16 on CRAYs! */ 4179 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX)); 4180 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan)); 4181 int noff; 4182 regnode *n = scan; 4183 4184 /* Skip NOTHING and LONGJMP. */ 4185 while ((n = regnext(n)) 4186 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n))) 4187 || ((OP(n) == LONGJMP) && (noff = ARG(n)))) 4188 && off + noff < max) 4189 off += noff; 4190 if (reg_off_by_arg[OP(scan)]) 4191 ARG(scan) = off; 4192 else 4193 NEXT_OFF(scan) = off; 4194 } 4195 4196 /* The principal pseudo-switch. Cannot be a switch, since we 4197 look into several different things. */ 4198 if ( OP(scan) == DEFINEP ) { 4199 SSize_t minlen = 0; 4200 SSize_t deltanext = 0; 4201 SSize_t fake_last_close = 0; 4202 I32 f = SCF_IN_DEFINE; 4203 4204 StructCopy(&zero_scan_data, &data_fake, scan_data_t); 4205 scan = regnext(scan); 4206 assert( OP(scan) == IFTHEN ); 4207 DEBUG_PEEP("expect IFTHEN", scan, depth); 4208 4209 data_fake.last_closep= &fake_last_close; 4210 minlen = *minlenp; 4211 next = regnext(scan); 4212 scan = NEXTOPER(NEXTOPER(scan)); 4213 DEBUG_PEEP("scan", scan, depth); 4214 DEBUG_PEEP("next", next, depth); 4215 4216 /* we suppose the run is continuous, last=next... 4217 * NOTE we dont use the return here! */ 4218 (void)study_chunk(pRExC_state, &scan, &minlen, 4219 &deltanext, next, &data_fake, stopparen, 4220 recursed_depth, NULL, f, depth+1); 4221 4222 scan = next; 4223 } else 4224 if ( 4225 OP(scan) == BRANCH || 4226 OP(scan) == BRANCHJ || 4227 OP(scan) == IFTHEN 4228 ) { 4229 next = regnext(scan); 4230 code = OP(scan); 4231 4232 /* The op(next)==code check below is to see if we 4233 * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN" 4234 * IFTHEN is special as it might not appear in pairs. 4235 * Not sure whether BRANCH-BRANCHJ is possible, regardless 4236 * we dont handle it cleanly. */ 4237 if (OP(next) == code || code == IFTHEN) { 4238 /* NOTE - There is similar code to this block below for 4239 * handling TRIE nodes on a re-study. If you change stuff here 4240 * check there too. */ 4241 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0; 4242 regnode_ssc accum; 4243 regnode * const startbranch=scan; 4244 4245 if (flags & SCF_DO_SUBSTR) { 4246 /* Cannot merge strings after this. */ 4247 scan_commit(pRExC_state, data, minlenp, is_inf); 4248 } 4249 4250 if (flags & SCF_DO_STCLASS) 4251 ssc_init_zero(pRExC_state, &accum); 4252 4253 while (OP(scan) == code) { 4254 SSize_t deltanext, minnext, fake; 4255 I32 f = 0; 4256 regnode_ssc this_class; 4257 4258 DEBUG_PEEP("Branch", scan, depth); 4259 4260 num++; 4261 StructCopy(&zero_scan_data, &data_fake, scan_data_t); 4262 if (data) { 4263 data_fake.whilem_c = data->whilem_c; 4264 data_fake.last_closep = data->last_closep; 4265 } 4266 else 4267 data_fake.last_closep = &fake; 4268 4269 data_fake.pos_delta = delta; 4270 next = regnext(scan); 4271 4272 scan = NEXTOPER(scan); /* everything */ 4273 if (code != BRANCH) /* everything but BRANCH */ 4274 scan = NEXTOPER(scan); 4275 4276 if (flags & SCF_DO_STCLASS) { 4277 ssc_init(pRExC_state, &this_class); 4278 data_fake.start_class = &this_class; 4279 f = SCF_DO_STCLASS_AND; 4280 } 4281 if (flags & SCF_WHILEM_VISITED_POS) 4282 f |= SCF_WHILEM_VISITED_POS; 4283 4284 /* we suppose the run is continuous, last=next...*/ 4285 minnext = study_chunk(pRExC_state, &scan, minlenp, 4286 &deltanext, next, &data_fake, stopparen, 4287 recursed_depth, NULL, f,depth+1); 4288 4289 if (min1 > minnext) 4290 min1 = minnext; 4291 if (deltanext == SSize_t_MAX) { 4292 is_inf = is_inf_internal = 1; 4293 max1 = SSize_t_MAX; 4294 } else if (max1 < minnext + deltanext) 4295 max1 = minnext + deltanext; 4296 scan = next; 4297 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) 4298 pars++; 4299 if (data_fake.flags & SCF_SEEN_ACCEPT) { 4300 if ( stopmin > minnext) 4301 stopmin = min + min1; 4302 flags &= ~SCF_DO_SUBSTR; 4303 if (data) 4304 data->flags |= SCF_SEEN_ACCEPT; 4305 } 4306 if (data) { 4307 if (data_fake.flags & SF_HAS_EVAL) 4308 data->flags |= SF_HAS_EVAL; 4309 data->whilem_c = data_fake.whilem_c; 4310 } 4311 if (flags & SCF_DO_STCLASS) 4312 ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class); 4313 } 4314 if (code == IFTHEN && num < 2) /* Empty ELSE branch */ 4315 min1 = 0; 4316 if (flags & SCF_DO_SUBSTR) { 4317 data->pos_min += min1; 4318 if (data->pos_delta >= SSize_t_MAX - (max1 - min1)) 4319 data->pos_delta = SSize_t_MAX; 4320 else 4321 data->pos_delta += max1 - min1; 4322 if (max1 != min1 || is_inf) 4323 data->longest = &(data->longest_float); 4324 } 4325 min += min1; 4326 if (delta == SSize_t_MAX 4327 || SSize_t_MAX - delta - (max1 - min1) < 0) 4328 delta = SSize_t_MAX; 4329 else 4330 delta += max1 - min1; 4331 if (flags & SCF_DO_STCLASS_OR) { 4332 ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum); 4333 if (min1) { 4334 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); 4335 flags &= ~SCF_DO_STCLASS; 4336 } 4337 } 4338 else if (flags & SCF_DO_STCLASS_AND) { 4339 if (min1) { 4340 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum); 4341 flags &= ~SCF_DO_STCLASS; 4342 } 4343 else { 4344 /* Switch to OR mode: cache the old value of 4345 * data->start_class */ 4346 INIT_AND_WITHP; 4347 StructCopy(data->start_class, and_withp, regnode_ssc); 4348 flags &= ~SCF_DO_STCLASS_AND; 4349 StructCopy(&accum, data->start_class, regnode_ssc); 4350 flags |= SCF_DO_STCLASS_OR; 4351 } 4352 } 4353 4354 if (PERL_ENABLE_TRIE_OPTIMISATION && 4355 OP( startbranch ) == BRANCH ) 4356 { 4357 /* demq. 4358 4359 Assuming this was/is a branch we are dealing with: 'scan' 4360 now points at the item that follows the branch sequence, 4361 whatever it is. We now start at the beginning of the 4362 sequence and look for subsequences of 4363 4364 BRANCH->EXACT=>x1 4365 BRANCH->EXACT=>x2 4366 tail 4367 4368 which would be constructed from a pattern like 4369 /A|LIST|OF|WORDS/ 4370 4371 If we can find such a subsequence we need to turn the first 4372 element into a trie and then add the subsequent branch exact 4373 strings to the trie. 4374 4375 We have two cases 4376 4377 1. patterns where the whole set of branches can be 4378 converted. 4379 4380 2. patterns where only a subset can be converted. 4381 4382 In case 1 we can replace the whole set with a single regop 4383 for the trie. In case 2 we need to keep the start and end 4384 branches so 4385 4386 'BRANCH EXACT; BRANCH EXACT; BRANCH X' 4387 becomes BRANCH TRIE; BRANCH X; 4388 4389 There is an additional case, that being where there is a 4390 common prefix, which gets split out into an EXACT like node 4391 preceding the TRIE node. 4392 4393 If x(1..n)==tail then we can do a simple trie, if not we make 4394 a "jump" trie, such that when we match the appropriate word 4395 we "jump" to the appropriate tail node. Essentially we turn 4396 a nested if into a case structure of sorts. 4397 4398 */ 4399 4400 int made=0; 4401 if (!re_trie_maxbuff) { 4402 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1); 4403 if (!SvIOK(re_trie_maxbuff)) 4404 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT); 4405 } 4406 if ( SvIV(re_trie_maxbuff)>=0 ) { 4407 regnode *cur; 4408 regnode *first = (regnode *)NULL; 4409 regnode *last = (regnode *)NULL; 4410 regnode *tail = scan; 4411 U8 trietype = 0; 4412 U32 count=0; 4413 4414 /* var tail is used because there may be a TAIL 4415 regop in the way. Ie, the exacts will point to the 4416 thing following the TAIL, but the last branch will 4417 point at the TAIL. So we advance tail. If we 4418 have nested (?:) we may have to move through several 4419 tails. 4420 */ 4421 4422 while ( OP( tail ) == TAIL ) { 4423 /* this is the TAIL generated by (?:) */ 4424 tail = regnext( tail ); 4425 } 4426 4427 4428 DEBUG_TRIE_COMPILE_r({ 4429 regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state); 4430 Perl_re_indentf( aTHX_ "%s %"UVuf":%s\n", 4431 depth+1, 4432 "Looking for TRIE'able sequences. Tail node is ", 4433 (UV)(tail - RExC_emit_start), 4434 SvPV_nolen_const( RExC_mysv ) 4435 ); 4436 }); 4437 4438 /* 4439 4440 Step through the branches 4441 cur represents each branch, 4442 noper is the first thing to be matched as part 4443 of that branch 4444 noper_next is the regnext() of that node. 4445 4446 We normally handle a case like this 4447 /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also 4448 support building with NOJUMPTRIE, which restricts 4449 the trie logic to structures like /FOO|BAR/. 4450 4451 If noper is a trieable nodetype then the branch is 4452 a possible optimization target. If we are building 4453 under NOJUMPTRIE then we require that noper_next is 4454 the same as scan (our current position in the regex 4455 program). 4456 4457 Once we have two or more consecutive such branches 4458 we can create a trie of the EXACT's contents and 4459 stitch it in place into the program. 4460 4461 If the sequence represents all of the branches in 4462 the alternation we replace the entire thing with a 4463 single TRIE node. 4464 4465 Otherwise when it is a subsequence we need to 4466 stitch it in place and replace only the relevant 4467 branches. This means the first branch has to remain 4468 as it is used by the alternation logic, and its 4469 next pointer, and needs to be repointed at the item 4470 on the branch chain following the last branch we 4471 have optimized away. 4472 4473 This could be either a BRANCH, in which case the 4474 subsequence is internal, or it could be the item 4475 following the branch sequence in which case the 4476 subsequence is at the end (which does not 4477 necessarily mean the first node is the start of the 4478 alternation). 4479 4480 TRIE_TYPE(X) is a define which maps the optype to a 4481 trietype. 4482 4483 optype | trietype 4484 ----------------+----------- 4485 NOTHING | NOTHING 4486 EXACT | EXACT 4487 EXACTFU | EXACTFU 4488 EXACTFU_SS | EXACTFU 4489 EXACTFA | EXACTFA 4490 EXACTL | EXACTL 4491 EXACTFLU8 | EXACTFLU8 4492 4493 4494 */ 4495 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) \ 4496 ? NOTHING \ 4497 : ( EXACT == (X) ) \ 4498 ? EXACT \ 4499 : ( EXACTFU == (X) || EXACTFU_SS == (X) ) \ 4500 ? EXACTFU \ 4501 : ( EXACTFA == (X) ) \ 4502 ? EXACTFA \ 4503 : ( EXACTL == (X) ) \ 4504 ? EXACTL \ 4505 : ( EXACTFLU8 == (X) ) \ 4506 ? EXACTFLU8 \ 4507 : 0 ) 4508 4509 /* dont use tail as the end marker for this traverse */ 4510 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) { 4511 regnode * const noper = NEXTOPER( cur ); 4512 U8 noper_type = OP( noper ); 4513 U8 noper_trietype = TRIE_TYPE( noper_type ); 4514 #if defined(DEBUGGING) || defined(NOJUMPTRIE) 4515 regnode * const noper_next = regnext( noper ); 4516 U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0; 4517 U8 noper_next_trietype = (noper_next && noper_next < tail) ? TRIE_TYPE( noper_next_type ) :0; 4518 #endif 4519 4520 DEBUG_TRIE_COMPILE_r({ 4521 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state); 4522 Perl_re_indentf( aTHX_ "- %d:%s (%d)", 4523 depth+1, 4524 REG_NODE_NUM(cur), SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) ); 4525 4526 regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state); 4527 Perl_re_printf( aTHX_ " -> %d:%s", 4528 REG_NODE_NUM(noper), SvPV_nolen_const(RExC_mysv)); 4529 4530 if ( noper_next ) { 4531 regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state); 4532 Perl_re_printf( aTHX_ "\t=> %d:%s\t", 4533 REG_NODE_NUM(noper_next), SvPV_nolen_const(RExC_mysv)); 4534 } 4535 Perl_re_printf( aTHX_ "(First==%d,Last==%d,Cur==%d,tt==%s,ntt==%s,nntt==%s)\n", 4536 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur), 4537 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype] 4538 ); 4539 }); 4540 4541 /* Is noper a trieable nodetype that can be merged 4542 * with the current trie (if there is one)? */ 4543 if ( noper_trietype 4544 && 4545 ( 4546 ( noper_trietype == NOTHING ) 4547 || ( trietype == NOTHING ) 4548 || ( trietype == noper_trietype ) 4549 ) 4550 #ifdef NOJUMPTRIE 4551 && noper_next >= tail 4552 #endif 4553 && count < U16_MAX) 4554 { 4555 /* Handle mergable triable node Either we are 4556 * the first node in a new trieable sequence, 4557 * in which case we do some bookkeeping, 4558 * otherwise we update the end pointer. */ 4559 if ( !first ) { 4560 first = cur; 4561 if ( noper_trietype == NOTHING ) { 4562 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE) 4563 regnode * const noper_next = regnext( noper ); 4564 U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0; 4565 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0; 4566 #endif 4567 4568 if ( noper_next_trietype ) { 4569 trietype = noper_next_trietype; 4570 } else if (noper_next_type) { 4571 /* a NOTHING regop is 1 regop wide. 4572 * We need at least two for a trie 4573 * so we can't merge this in */ 4574 first = NULL; 4575 } 4576 } else { 4577 trietype = noper_trietype; 4578 } 4579 } else { 4580 if ( trietype == NOTHING ) 4581 trietype = noper_trietype; 4582 last = cur; 4583 } 4584 if (first) 4585 count++; 4586 } /* end handle mergable triable node */ 4587 else { 4588 /* handle unmergable node - 4589 * noper may either be a triable node which can 4590 * not be tried together with the current trie, 4591 * or a non triable node */ 4592 if ( last ) { 4593 /* If last is set and trietype is not 4594 * NOTHING then we have found at least two 4595 * triable branch sequences in a row of a 4596 * similar trietype so we can turn them 4597 * into a trie. If/when we allow NOTHING to 4598 * start a trie sequence this condition 4599 * will be required, and it isn't expensive 4600 * so we leave it in for now. */ 4601 if ( trietype && trietype != NOTHING ) 4602 make_trie( pRExC_state, 4603 startbranch, first, cur, tail, 4604 count, trietype, depth+1 ); 4605 last = NULL; /* note: we clear/update 4606 first, trietype etc below, 4607 so we dont do it here */ 4608 } 4609 if ( noper_trietype 4610 #ifdef NOJUMPTRIE 4611 && noper_next >= tail 4612 #endif 4613 ){ 4614 /* noper is triable, so we can start a new 4615 * trie sequence */ 4616 count = 1; 4617 first = cur; 4618 trietype = noper_trietype; 4619 } else if (first) { 4620 /* if we already saw a first but the 4621 * current node is not triable then we have 4622 * to reset the first information. */ 4623 count = 0; 4624 first = NULL; 4625 trietype = 0; 4626 } 4627 } /* end handle unmergable node */ 4628 } /* loop over branches */ 4629 DEBUG_TRIE_COMPILE_r({ 4630 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state); 4631 Perl_re_indentf( aTHX_ "- %s (%d) <SCAN FINISHED> ", 4632 depth+1, SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur)); 4633 Perl_re_printf( aTHX_ "(First==%d, Last==%d, Cur==%d, tt==%s)\n", 4634 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur), 4635 PL_reg_name[trietype] 4636 ); 4637 4638 }); 4639 if ( last && trietype ) { 4640 if ( trietype != NOTHING ) { 4641 /* the last branch of the sequence was part of 4642 * a trie, so we have to construct it here 4643 * outside of the loop */ 4644 made= make_trie( pRExC_state, startbranch, 4645 first, scan, tail, count, 4646 trietype, depth+1 ); 4647 #ifdef TRIE_STUDY_OPT 4648 if ( ((made == MADE_EXACT_TRIE && 4649 startbranch == first) 4650 || ( first_non_open == first )) && 4651 depth==0 ) { 4652 flags |= SCF_TRIE_RESTUDY; 4653 if ( startbranch == first 4654 && scan >= tail ) 4655 { 4656 RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN; 4657 } 4658 } 4659 #endif 4660 } else { 4661 /* at this point we know whatever we have is a 4662 * NOTHING sequence/branch AND if 'startbranch' 4663 * is 'first' then we can turn the whole thing 4664 * into a NOTHING 4665 */ 4666 if ( startbranch == first ) { 4667 regnode *opt; 4668 /* the entire thing is a NOTHING sequence, 4669 * something like this: (?:|) So we can 4670 * turn it into a plain NOTHING op. */ 4671 DEBUG_TRIE_COMPILE_r({ 4672 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state); 4673 Perl_re_indentf( aTHX_ "- %s (%d) <NOTHING BRANCH SEQUENCE>\n", 4674 depth+1, 4675 SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur)); 4676 4677 }); 4678 OP(startbranch)= NOTHING; 4679 NEXT_OFF(startbranch)= tail - startbranch; 4680 for ( opt= startbranch + 1; opt < tail ; opt++ ) 4681 OP(opt)= OPTIMIZED; 4682 } 4683 } 4684 } /* end if ( last) */ 4685 } /* TRIE_MAXBUF is non zero */ 4686 4687 } /* do trie */ 4688 4689 } 4690 else if ( code == BRANCHJ ) { /* single branch is optimized. */ 4691 scan = NEXTOPER(NEXTOPER(scan)); 4692 } else /* single branch is optimized. */ 4693 scan = NEXTOPER(scan); 4694 continue; 4695 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB) { 4696 I32 paren = 0; 4697 regnode *start = NULL; 4698 regnode *end = NULL; 4699 U32 my_recursed_depth= recursed_depth; 4700 4701 if (OP(scan) != SUSPEND) { /* GOSUB */ 4702 /* Do setup, note this code has side effects beyond 4703 * the rest of this block. Specifically setting 4704 * RExC_recurse[] must happen at least once during 4705 * study_chunk(). */ 4706 paren = ARG(scan); 4707 RExC_recurse[ARG2L(scan)] = scan; 4708 start = RExC_open_parens[paren]; 4709 end = RExC_close_parens[paren]; 4710 4711 /* NOTE we MUST always execute the above code, even 4712 * if we do nothing with a GOSUB */ 4713 if ( 4714 ( flags & SCF_IN_DEFINE ) 4715 || 4716 ( 4717 (is_inf_internal || is_inf || (data && data->flags & SF_IS_INF)) 4718 && 4719 ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 ) 4720 ) 4721 ) { 4722 /* no need to do anything here if we are in a define. */ 4723 /* or we are after some kind of infinite construct 4724 * so we can skip recursing into this item. 4725 * Since it is infinite we will not change the maxlen 4726 * or delta, and if we miss something that might raise 4727 * the minlen it will merely pessimise a little. 4728 * 4729 * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/ 4730 * might result in a minlen of 1 and not of 4, 4731 * but this doesn't make us mismatch, just try a bit 4732 * harder than we should. 4733 * */ 4734 scan= regnext(scan); 4735 continue; 4736 } 4737 4738 if ( 4739 !recursed_depth 4740 || 4741 !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren) 4742 ) { 4743 /* it is quite possible that there are more efficient ways 4744 * to do this. We maintain a bitmap per level of recursion 4745 * of which patterns we have entered so we can detect if a 4746 * pattern creates a possible infinite loop. When we 4747 * recurse down a level we copy the previous levels bitmap 4748 * down. When we are at recursion level 0 we zero the top 4749 * level bitmap. It would be nice to implement a different 4750 * more efficient way of doing this. In particular the top 4751 * level bitmap may be unnecessary. 4752 */ 4753 if (!recursed_depth) { 4754 Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8); 4755 } else { 4756 Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), 4757 RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), 4758 RExC_study_chunk_recursed_bytes, U8); 4759 } 4760 /* we havent recursed into this paren yet, so recurse into it */ 4761 DEBUG_STUDYDATA("gosub-set:", data,depth); 4762 PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren); 4763 my_recursed_depth= recursed_depth + 1; 4764 } else { 4765 DEBUG_STUDYDATA("gosub-inf:", data,depth); 4766 /* some form of infinite recursion, assume infinite length 4767 * */ 4768 if (flags & SCF_DO_SUBSTR) { 4769 scan_commit(pRExC_state, data, minlenp, is_inf); 4770 data->longest = &(data->longest_float); 4771 } 4772 is_inf = is_inf_internal = 1; 4773 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ 4774 ssc_anything(data->start_class); 4775 flags &= ~SCF_DO_STCLASS; 4776 4777 start= NULL; /* reset start so we dont recurse later on. */ 4778 } 4779 } else { 4780 paren = stopparen; 4781 start = scan + 2; 4782 end = regnext(scan); 4783 } 4784 if (start) { 4785 scan_frame *newframe; 4786 assert(end); 4787 if (!RExC_frame_last) { 4788 Newxz(newframe, 1, scan_frame); 4789 SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe); 4790 RExC_frame_head= newframe; 4791 RExC_frame_count++; 4792 } else if (!RExC_frame_last->next_frame) { 4793 Newxz(newframe,1,scan_frame); 4794 RExC_frame_last->next_frame= newframe; 4795 newframe->prev_frame= RExC_frame_last; 4796 RExC_frame_count++; 4797 } else { 4798 newframe= RExC_frame_last->next_frame; 4799 } 4800 RExC_frame_last= newframe; 4801 4802 newframe->next_regnode = regnext(scan); 4803 newframe->last_regnode = last; 4804 newframe->stopparen = stopparen; 4805 newframe->prev_recursed_depth = recursed_depth; 4806 newframe->this_prev_frame= frame; 4807 4808 DEBUG_STUDYDATA("frame-new:",data,depth); 4809 DEBUG_PEEP("fnew", scan, depth); 4810 4811 frame = newframe; 4812 scan = start; 4813 stopparen = paren; 4814 last = end; 4815 depth = depth + 1; 4816 recursed_depth= my_recursed_depth; 4817 4818 continue; 4819 } 4820 } 4821 else if (OP(scan) == EXACT || OP(scan) == EXACTL) { 4822 SSize_t l = STR_LEN(scan); 4823 UV uc; 4824 if (UTF) { 4825 const U8 * const s = (U8*)STRING(scan); 4826 uc = utf8_to_uvchr_buf(s, s + l, NULL); 4827 l = utf8_length(s, s + l); 4828 } else { 4829 uc = *((U8*)STRING(scan)); 4830 } 4831 min += l; 4832 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */ 4833 /* The code below prefers earlier match for fixed 4834 offset, later match for variable offset. */ 4835 if (data->last_end == -1) { /* Update the start info. */ 4836 data->last_start_min = data->pos_min; 4837 data->last_start_max = is_inf 4838 ? SSize_t_MAX : data->pos_min + data->pos_delta; 4839 } 4840 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan)); 4841 if (UTF) 4842 SvUTF8_on(data->last_found); 4843 { 4844 SV * const sv = data->last_found; 4845 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ? 4846 mg_find(sv, PERL_MAGIC_utf8) : NULL; 4847 if (mg && mg->mg_len >= 0) 4848 mg->mg_len += utf8_length((U8*)STRING(scan), 4849 (U8*)STRING(scan)+STR_LEN(scan)); 4850 } 4851 data->last_end = data->pos_min + l; 4852 data->pos_min += l; /* As in the first entry. */ 4853 data->flags &= ~SF_BEFORE_EOL; 4854 } 4855 4856 /* ANDing the code point leaves at most it, and not in locale, and 4857 * can't match null string */ 4858 if (flags & SCF_DO_STCLASS_AND) { 4859 ssc_cp_and(data->start_class, uc); 4860 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING; 4861 ssc_clear_locale(data->start_class); 4862 } 4863 else if (flags & SCF_DO_STCLASS_OR) { 4864 ssc_add_cp(data->start_class, uc); 4865 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); 4866 4867 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ 4868 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING; 4869 } 4870 flags &= ~SCF_DO_STCLASS; 4871 } 4872 else if (PL_regkind[OP(scan)] == EXACT) { 4873 /* But OP != EXACT!, so is EXACTFish */ 4874 SSize_t l = STR_LEN(scan); 4875 const U8 * s = (U8*)STRING(scan); 4876 4877 /* Search for fixed substrings supports EXACT only. */ 4878 if (flags & SCF_DO_SUBSTR) { 4879 assert(data); 4880 scan_commit(pRExC_state, data, minlenp, is_inf); 4881 } 4882 if (UTF) { 4883 l = utf8_length(s, s + l); 4884 } 4885 if (unfolded_multi_char) { 4886 RExC_seen |= REG_UNFOLDED_MULTI_SEEN; 4887 } 4888 min += l - min_subtract; 4889 assert (min >= 0); 4890 delta += min_subtract; 4891 if (flags & SCF_DO_SUBSTR) { 4892 data->pos_min += l - min_subtract; 4893 if (data->pos_min < 0) { 4894 data->pos_min = 0; 4895 } 4896 data->pos_delta += min_subtract; 4897 if (min_subtract) { 4898 data->longest = &(data->longest_float); 4899 } 4900 } 4901 4902 if (flags & SCF_DO_STCLASS) { 4903 SV* EXACTF_invlist = _make_exactf_invlist(pRExC_state, scan); 4904 4905 assert(EXACTF_invlist); 4906 if (flags & SCF_DO_STCLASS_AND) { 4907 if (OP(scan) != EXACTFL) 4908 ssc_clear_locale(data->start_class); 4909 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING; 4910 ANYOF_POSIXL_ZERO(data->start_class); 4911 ssc_intersection(data->start_class, EXACTF_invlist, FALSE); 4912 } 4913 else { /* SCF_DO_STCLASS_OR */ 4914 ssc_union(data->start_class, EXACTF_invlist, FALSE); 4915 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); 4916 4917 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ 4918 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING; 4919 } 4920 flags &= ~SCF_DO_STCLASS; 4921 SvREFCNT_dec(EXACTF_invlist); 4922 } 4923 } 4924 else if (REGNODE_VARIES(OP(scan))) { 4925 SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0; 4926 I32 fl = 0, f = flags; 4927 regnode * const oscan = scan; 4928 regnode_ssc this_class; 4929 regnode_ssc *oclass = NULL; 4930 I32 next_is_eval = 0; 4931 4932 switch (PL_regkind[OP(scan)]) { 4933 case WHILEM: /* End of (?:...)* . */ 4934 scan = NEXTOPER(scan); 4935 goto finish; 4936 case PLUS: 4937 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) { 4938 next = NEXTOPER(scan); 4939 if (OP(next) == EXACT 4940 || OP(next) == EXACTL 4941 || (flags & SCF_DO_STCLASS)) 4942 { 4943 mincount = 1; 4944 maxcount = REG_INFTY; 4945 next = regnext(scan); 4946 scan = NEXTOPER(scan); 4947 goto do_curly; 4948 } 4949 } 4950 if (flags & SCF_DO_SUBSTR) 4951 data->pos_min++; 4952 min++; 4953 /* FALLTHROUGH */ 4954 case STAR: 4955 if (flags & SCF_DO_STCLASS) { 4956 mincount = 0; 4957 maxcount = REG_INFTY; 4958 next = regnext(scan); 4959 scan = NEXTOPER(scan); 4960 goto do_curly; 4961 } 4962 if (flags & SCF_DO_SUBSTR) { 4963 scan_commit(pRExC_state, data, minlenp, is_inf); 4964 /* Cannot extend fixed substrings */ 4965 data->longest = &(data->longest_float); 4966 } 4967 is_inf = is_inf_internal = 1; 4968 scan = regnext(scan); 4969 goto optimize_curly_tail; 4970 case CURLY: 4971 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM) 4972 && (scan->flags == stopparen)) 4973 { 4974 mincount = 1; 4975 maxcount = 1; 4976 } else { 4977 mincount = ARG1(scan); 4978 maxcount = ARG2(scan); 4979 } 4980 next = regnext(scan); 4981 if (OP(scan) == CURLYX) { 4982 I32 lp = (data ? *(data->last_closep) : 0); 4983 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX); 4984 } 4985 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS; 4986 next_is_eval = (OP(scan) == EVAL); 4987 do_curly: 4988 if (flags & SCF_DO_SUBSTR) { 4989 if (mincount == 0) 4990 scan_commit(pRExC_state, data, minlenp, is_inf); 4991 /* Cannot extend fixed substrings */ 4992 pos_before = data->pos_min; 4993 } 4994 if (data) { 4995 fl = data->flags; 4996 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL); 4997 if (is_inf) 4998 data->flags |= SF_IS_INF; 4999 } 5000 if (flags & SCF_DO_STCLASS) { 5001 ssc_init(pRExC_state, &this_class); 5002 oclass = data->start_class; 5003 data->start_class = &this_class; 5004 f |= SCF_DO_STCLASS_AND; 5005 f &= ~SCF_DO_STCLASS_OR; 5006 } 5007 /* Exclude from super-linear cache processing any {n,m} 5008 regops for which the combination of input pos and regex 5009 pos is not enough information to determine if a match 5010 will be possible. 5011 5012 For example, in the regex /foo(bar\s*){4,8}baz/ with the 5013 regex pos at the \s*, the prospects for a match depend not 5014 only on the input position but also on how many (bar\s*) 5015 repeats into the {4,8} we are. */ 5016 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY)) 5017 f &= ~SCF_WHILEM_VISITED_POS; 5018 5019 /* This will finish on WHILEM, setting scan, or on NULL: */ 5020 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, 5021 last, data, stopparen, recursed_depth, NULL, 5022 (mincount == 0 5023 ? (f & ~SCF_DO_SUBSTR) 5024 : f) 5025 ,depth+1); 5026 5027 if (flags & SCF_DO_STCLASS) 5028 data->start_class = oclass; 5029 if (mincount == 0 || minnext == 0) { 5030 if (flags & SCF_DO_STCLASS_OR) { 5031 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class); 5032 } 5033 else if (flags & SCF_DO_STCLASS_AND) { 5034 /* Switch to OR mode: cache the old value of 5035 * data->start_class */ 5036 INIT_AND_WITHP; 5037 StructCopy(data->start_class, and_withp, regnode_ssc); 5038 flags &= ~SCF_DO_STCLASS_AND; 5039 StructCopy(&this_class, data->start_class, regnode_ssc); 5040 flags |= SCF_DO_STCLASS_OR; 5041 ANYOF_FLAGS(data->start_class) 5042 |= SSC_MATCHES_EMPTY_STRING; 5043 } 5044 } else { /* Non-zero len */ 5045 if (flags & SCF_DO_STCLASS_OR) { 5046 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class); 5047 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); 5048 } 5049 else if (flags & SCF_DO_STCLASS_AND) 5050 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class); 5051 flags &= ~SCF_DO_STCLASS; 5052 } 5053 if (!scan) /* It was not CURLYX, but CURLY. */ 5054 scan = next; 5055 if (!(flags & SCF_TRIE_DOING_RESTUDY) 5056 /* ? quantifier ok, except for (?{ ... }) */ 5057 && (next_is_eval || !(mincount == 0 && maxcount == 1)) 5058 && (minnext == 0) && (deltanext == 0) 5059 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR)) 5060 && maxcount <= REG_INFTY/3) /* Complement check for big 5061 count */ 5062 { 5063 /* Fatal warnings may leak the regexp without this: */ 5064 SAVEFREESV(RExC_rx_sv); 5065 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), 5066 "Quantifier unexpected on zero-length expression " 5067 "in regex m/%"UTF8f"/", 5068 UTF8fARG(UTF, RExC_precomp_end - RExC_precomp, 5069 RExC_precomp)); 5070 (void)ReREFCNT_inc(RExC_rx_sv); 5071 } 5072 5073 min += minnext * mincount; 5074 is_inf_internal |= deltanext == SSize_t_MAX 5075 || (maxcount == REG_INFTY && minnext + deltanext > 0); 5076 is_inf |= is_inf_internal; 5077 if (is_inf) { 5078 delta = SSize_t_MAX; 5079 } else { 5080 delta += (minnext + deltanext) * maxcount 5081 - minnext * mincount; 5082 } 5083 /* Try powerful optimization CURLYX => CURLYN. */ 5084 if ( OP(oscan) == CURLYX && data 5085 && data->flags & SF_IN_PAR 5086 && !(data->flags & SF_HAS_EVAL) 5087 && !deltanext && minnext == 1 ) { 5088 /* Try to optimize to CURLYN. */ 5089 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; 5090 regnode * const nxt1 = nxt; 5091 #ifdef DEBUGGING 5092 regnode *nxt2; 5093 #endif 5094 5095 /* Skip open. */ 5096 nxt = regnext(nxt); 5097 if (!REGNODE_SIMPLE(OP(nxt)) 5098 && !(PL_regkind[OP(nxt)] == EXACT 5099 && STR_LEN(nxt) == 1)) 5100 goto nogo; 5101 #ifdef DEBUGGING 5102 nxt2 = nxt; 5103 #endif 5104 nxt = regnext(nxt); 5105 if (OP(nxt) != CLOSE) 5106 goto nogo; 5107 if (RExC_open_parens) { 5108 RExC_open_parens[ARG(nxt1)]=oscan; /*open->CURLYM*/ 5109 RExC_close_parens[ARG(nxt1)]=nxt+2; /*close->while*/ 5110 } 5111 /* Now we know that nxt2 is the only contents: */ 5112 oscan->flags = (U8)ARG(nxt); 5113 OP(oscan) = CURLYN; 5114 OP(nxt1) = NOTHING; /* was OPEN. */ 5115 5116 #ifdef DEBUGGING 5117 OP(nxt1 + 1) = OPTIMIZED; /* was count. */ 5118 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */ 5119 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */ 5120 OP(nxt) = OPTIMIZED; /* was CLOSE. */ 5121 OP(nxt + 1) = OPTIMIZED; /* was count. */ 5122 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */ 5123 #endif 5124 } 5125 nogo: 5126 5127 /* Try optimization CURLYX => CURLYM. */ 5128 if ( OP(oscan) == CURLYX && data 5129 && !(data->flags & SF_HAS_PAR) 5130 && !(data->flags & SF_HAS_EVAL) 5131 && !deltanext /* atom is fixed width */ 5132 && minnext != 0 /* CURLYM can't handle zero width */ 5133 5134 /* Nor characters whose fold at run-time may be 5135 * multi-character */ 5136 && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN) 5137 ) { 5138 /* XXXX How to optimize if data == 0? */ 5139 /* Optimize to a simpler form. */ 5140 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */ 5141 regnode *nxt2; 5142 5143 OP(oscan) = CURLYM; 5144 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/ 5145 && (OP(nxt2) != WHILEM)) 5146 nxt = nxt2; 5147 OP(nxt2) = SUCCEED; /* Whas WHILEM */ 5148 /* Need to optimize away parenths. */ 5149 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) { 5150 /* Set the parenth number. */ 5151 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/ 5152 5153 oscan->flags = (U8)ARG(nxt); 5154 if (RExC_open_parens) { 5155 RExC_open_parens[ARG(nxt1)]=oscan; /*open->CURLYM*/ 5156 RExC_close_parens[ARG(nxt1)]=nxt2+1; /*close->NOTHING*/ 5157 } 5158 OP(nxt1) = OPTIMIZED; /* was OPEN. */ 5159 OP(nxt) = OPTIMIZED; /* was CLOSE. */ 5160 5161 #ifdef DEBUGGING 5162 OP(nxt1 + 1) = OPTIMIZED; /* was count. */ 5163 OP(nxt + 1) = OPTIMIZED; /* was count. */ 5164 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */ 5165 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */ 5166 #endif 5167 #if 0 5168 while ( nxt1 && (OP(nxt1) != WHILEM)) { 5169 regnode *nnxt = regnext(nxt1); 5170 if (nnxt == nxt) { 5171 if (reg_off_by_arg[OP(nxt1)]) 5172 ARG_SET(nxt1, nxt2 - nxt1); 5173 else if (nxt2 - nxt1 < U16_MAX) 5174 NEXT_OFF(nxt1) = nxt2 - nxt1; 5175 else 5176 OP(nxt) = NOTHING; /* Cannot beautify */ 5177 } 5178 nxt1 = nnxt; 5179 } 5180 #endif 5181 /* Optimize again: */ 5182 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt, 5183 NULL, stopparen, recursed_depth, NULL, 0,depth+1); 5184 } 5185 else 5186 oscan->flags = 0; 5187 } 5188 else if ((OP(oscan) == CURLYX) 5189 && (flags & SCF_WHILEM_VISITED_POS) 5190 /* See the comment on a similar expression above. 5191 However, this time it's not a subexpression 5192 we care about, but the expression itself. */ 5193 && (maxcount == REG_INFTY) 5194 && data && ++data->whilem_c < 16) { 5195 /* This stays as CURLYX, we can put the count/of pair. */ 5196 /* Find WHILEM (as in regexec.c) */ 5197 regnode *nxt = oscan + NEXT_OFF(oscan); 5198 5199 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */ 5200 nxt += ARG(nxt); 5201 PREVOPER(nxt)->flags = (U8)(data->whilem_c 5202 | (RExC_whilem_seen << 4)); /* On WHILEM */ 5203 } 5204 if (data && fl & (SF_HAS_PAR|SF_IN_PAR)) 5205 pars++; 5206 if (flags & SCF_DO_SUBSTR) { 5207 SV *last_str = NULL; 5208 STRLEN last_chrs = 0; 5209 int counted = mincount != 0; 5210 5211 if (data->last_end > 0 && mincount != 0) { /* Ends with a 5212 string. */ 5213 SSize_t b = pos_before >= data->last_start_min 5214 ? pos_before : data->last_start_min; 5215 STRLEN l; 5216 const char * const s = SvPV_const(data->last_found, l); 5217 SSize_t old = b - data->last_start_min; 5218 5219 if (UTF) 5220 old = utf8_hop((U8*)s, old) - (U8*)s; 5221 l -= old; 5222 /* Get the added string: */ 5223 last_str = newSVpvn_utf8(s + old, l, UTF); 5224 last_chrs = UTF ? utf8_length((U8*)(s + old), 5225 (U8*)(s + old + l)) : l; 5226 if (deltanext == 0 && pos_before == b) { 5227 /* What was added is a constant string */ 5228 if (mincount > 1) { 5229 5230 SvGROW(last_str, (mincount * l) + 1); 5231 repeatcpy(SvPVX(last_str) + l, 5232 SvPVX_const(last_str), l, 5233 mincount - 1); 5234 SvCUR_set(last_str, SvCUR(last_str) * mincount); 5235 /* Add additional parts. */ 5236 SvCUR_set(data->last_found, 5237 SvCUR(data->last_found) - l); 5238 sv_catsv(data->last_found, last_str); 5239 { 5240 SV * sv = data->last_found; 5241 MAGIC *mg = 5242 SvUTF8(sv) && SvMAGICAL(sv) ? 5243 mg_find(sv, PERL_MAGIC_utf8) : NULL; 5244 if (mg && mg->mg_len >= 0) 5245 mg->mg_len += last_chrs * (mincount-1); 5246 } 5247 last_chrs *= mincount; 5248 data->last_end += l * (mincount - 1); 5249 } 5250 } else { 5251 /* start offset must point into the last copy */ 5252 data->last_start_min += minnext * (mincount - 1); 5253 data->last_start_max = 5254 is_inf 5255 ? SSize_t_MAX 5256 : data->last_start_max + 5257 (maxcount - 1) * (minnext + data->pos_delta); 5258 } 5259 } 5260 /* It is counted once already... */ 5261 data->pos_min += minnext * (mincount - counted); 5262 #if 0 5263 Perl_re_printf( aTHX_ "counted=%"UVuf" deltanext=%"UVuf 5264 " SSize_t_MAX=%"UVuf" minnext=%"UVuf 5265 " maxcount=%"UVuf" mincount=%"UVuf"\n", 5266 (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount, 5267 (UV)mincount); 5268 if (deltanext != SSize_t_MAX) 5269 Perl_re_printf( aTHX_ "LHS=%"UVuf" RHS=%"UVuf"\n", 5270 (UV)(-counted * deltanext + (minnext + deltanext) * maxcount 5271 - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta)); 5272 #endif 5273 if (deltanext == SSize_t_MAX 5274 || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta) 5275 data->pos_delta = SSize_t_MAX; 5276 else 5277 data->pos_delta += - counted * deltanext + 5278 (minnext + deltanext) * maxcount - minnext * mincount; 5279 if (mincount != maxcount) { 5280 /* Cannot extend fixed substrings found inside 5281 the group. */ 5282 scan_commit(pRExC_state, data, minlenp, is_inf); 5283 if (mincount && last_str) { 5284 SV * const sv = data->last_found; 5285 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ? 5286 mg_find(sv, PERL_MAGIC_utf8) : NULL; 5287 5288 if (mg) 5289 mg->mg_len = -1; 5290 sv_setsv(sv, last_str); 5291 data->last_end = data->pos_min; 5292 data->last_start_min = data->pos_min - last_chrs; 5293 data->last_start_max = is_inf 5294 ? SSize_t_MAX 5295 : data->pos_min + data->pos_delta - last_chrs; 5296 } 5297 data->longest = &(data->longest_float); 5298 } 5299 SvREFCNT_dec(last_str); 5300 } 5301 if (data && (fl & SF_HAS_EVAL)) 5302 data->flags |= SF_HAS_EVAL; 5303 optimize_curly_tail: 5304 if (OP(oscan) != CURLYX) { 5305 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING 5306 && NEXT_OFF(next)) 5307 NEXT_OFF(oscan) += NEXT_OFF(next); 5308 } 5309 continue; 5310 5311 default: 5312 #ifdef DEBUGGING 5313 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d", 5314 OP(scan)); 5315 #endif 5316 case REF: 5317 case CLUMP: 5318 if (flags & SCF_DO_SUBSTR) { 5319 /* Cannot expect anything... */ 5320 scan_commit(pRExC_state, data, minlenp, is_inf); 5321 data->longest = &(data->longest_float); 5322 } 5323 is_inf = is_inf_internal = 1; 5324 if (flags & SCF_DO_STCLASS_OR) { 5325 if (OP(scan) == CLUMP) { 5326 /* Actually is any start char, but very few code points 5327 * aren't start characters */ 5328 ssc_match_all_cp(data->start_class); 5329 } 5330 else { 5331 ssc_anything(data->start_class); 5332 } 5333 } 5334 flags &= ~SCF_DO_STCLASS; 5335 break; 5336 } 5337 } 5338 else if (OP(scan) == LNBREAK) { 5339 if (flags & SCF_DO_STCLASS) { 5340 if (flags & SCF_DO_STCLASS_AND) { 5341 ssc_intersection(data->start_class, 5342 PL_XPosix_ptrs[_CC_VERTSPACE], FALSE); 5343 ssc_clear_locale(data->start_class); 5344 ANYOF_FLAGS(data->start_class) 5345 &= ~SSC_MATCHES_EMPTY_STRING; 5346 } 5347 else if (flags & SCF_DO_STCLASS_OR) { 5348 ssc_union(data->start_class, 5349 PL_XPosix_ptrs[_CC_VERTSPACE], 5350 FALSE); 5351 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); 5352 5353 /* See commit msg for 5354 * 749e076fceedeb708a624933726e7989f2302f6a */ 5355 ANYOF_FLAGS(data->start_class) 5356 &= ~SSC_MATCHES_EMPTY_STRING; 5357 } 5358 flags &= ~SCF_DO_STCLASS; 5359 } 5360 min++; 5361 if (delta != SSize_t_MAX) 5362 delta++; /* Because of the 2 char string cr-lf */ 5363 if (flags & SCF_DO_SUBSTR) { 5364 /* Cannot expect anything... */ 5365 scan_commit(pRExC_state, data, minlenp, is_inf); 5366 data->pos_min += 1; 5367 data->pos_delta += 1; 5368 data->longest = &(data->longest_float); 5369 } 5370 } 5371 else if (REGNODE_SIMPLE(OP(scan))) { 5372 5373 if (flags & SCF_DO_SUBSTR) { 5374 scan_commit(pRExC_state, data, minlenp, is_inf); 5375 data->pos_min++; 5376 } 5377 min++; 5378 if (flags & SCF_DO_STCLASS) { 5379 bool invert = 0; 5380 SV* my_invlist = NULL; 5381 U8 namedclass; 5382 5383 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ 5384 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING; 5385 5386 /* Some of the logic below assumes that switching 5387 locale on will only add false positives. */ 5388 switch (OP(scan)) { 5389 5390 default: 5391 #ifdef DEBUGGING 5392 Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", 5393 OP(scan)); 5394 #endif 5395 case SANY: 5396 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ 5397 ssc_match_all_cp(data->start_class); 5398 break; 5399 5400 case REG_ANY: 5401 { 5402 SV* REG_ANY_invlist = _new_invlist(2); 5403 REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist, 5404 '\n'); 5405 if (flags & SCF_DO_STCLASS_OR) { 5406 ssc_union(data->start_class, 5407 REG_ANY_invlist, 5408 TRUE /* TRUE => invert, hence all but \n 5409 */ 5410 ); 5411 } 5412 else if (flags & SCF_DO_STCLASS_AND) { 5413 ssc_intersection(data->start_class, 5414 REG_ANY_invlist, 5415 TRUE /* TRUE => invert */ 5416 ); 5417 ssc_clear_locale(data->start_class); 5418 } 5419 SvREFCNT_dec_NN(REG_ANY_invlist); 5420 } 5421 break; 5422 5423 case ANYOFD: 5424 case ANYOFL: 5425 case ANYOF: 5426 if (flags & SCF_DO_STCLASS_AND) 5427 ssc_and(pRExC_state, data->start_class, 5428 (regnode_charclass *) scan); 5429 else 5430 ssc_or(pRExC_state, data->start_class, 5431 (regnode_charclass *) scan); 5432 break; 5433 5434 case NPOSIXL: 5435 invert = 1; 5436 /* FALLTHROUGH */ 5437 5438 case POSIXL: 5439 namedclass = classnum_to_namedclass(FLAGS(scan)) + invert; 5440 if (flags & SCF_DO_STCLASS_AND) { 5441 bool was_there = cBOOL( 5442 ANYOF_POSIXL_TEST(data->start_class, 5443 namedclass)); 5444 ANYOF_POSIXL_ZERO(data->start_class); 5445 if (was_there) { /* Do an AND */ 5446 ANYOF_POSIXL_SET(data->start_class, namedclass); 5447 } 5448 /* No individual code points can now match */ 5449 data->start_class->invlist 5450 = sv_2mortal(_new_invlist(0)); 5451 } 5452 else { 5453 int complement = namedclass + ((invert) ? -1 : 1); 5454 5455 assert(flags & SCF_DO_STCLASS_OR); 5456 5457 /* If the complement of this class was already there, 5458 * the result is that they match all code points, 5459 * (\d + \D == everything). Remove the classes from 5460 * future consideration. Locale is not relevant in 5461 * this case */ 5462 if (ANYOF_POSIXL_TEST(data->start_class, complement)) { 5463 ssc_match_all_cp(data->start_class); 5464 ANYOF_POSIXL_CLEAR(data->start_class, namedclass); 5465 ANYOF_POSIXL_CLEAR(data->start_class, complement); 5466 } 5467 else { /* The usual case; just add this class to the 5468 existing set */ 5469 ANYOF_POSIXL_SET(data->start_class, namedclass); 5470 } 5471 } 5472 break; 5473 5474 case NPOSIXA: /* For these, we always know the exact set of 5475 what's matched */ 5476 invert = 1; 5477 /* FALLTHROUGH */ 5478 case POSIXA: 5479 if (FLAGS(scan) == _CC_ASCII) { 5480 my_invlist = invlist_clone(PL_XPosix_ptrs[_CC_ASCII]); 5481 } 5482 else { 5483 _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)], 5484 PL_XPosix_ptrs[_CC_ASCII], 5485 &my_invlist); 5486 } 5487 goto join_posix; 5488 5489 case NPOSIXD: 5490 case NPOSIXU: 5491 invert = 1; 5492 /* FALLTHROUGH */ 5493 case POSIXD: 5494 case POSIXU: 5495 my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]); 5496 5497 /* NPOSIXD matches all upper Latin1 code points unless the 5498 * target string being matched is UTF-8, which is 5499 * unknowable until match time. Since we are going to 5500 * invert, we want to get rid of all of them so that the 5501 * inversion will match all */ 5502 if (OP(scan) == NPOSIXD) { 5503 _invlist_subtract(my_invlist, PL_UpperLatin1, 5504 &my_invlist); 5505 } 5506 5507 join_posix: 5508 5509 if (flags & SCF_DO_STCLASS_AND) { 5510 ssc_intersection(data->start_class, my_invlist, invert); 5511 ssc_clear_locale(data->start_class); 5512 } 5513 else { 5514 assert(flags & SCF_DO_STCLASS_OR); 5515 ssc_union(data->start_class, my_invlist, invert); 5516 } 5517 SvREFCNT_dec(my_invlist); 5518 } 5519 if (flags & SCF_DO_STCLASS_OR) 5520 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); 5521 flags &= ~SCF_DO_STCLASS; 5522 } 5523 } 5524 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) { 5525 data->flags |= (OP(scan) == MEOL 5526 ? SF_BEFORE_MEOL 5527 : SF_BEFORE_SEOL); 5528 scan_commit(pRExC_state, data, minlenp, is_inf); 5529 5530 } 5531 else if ( PL_regkind[OP(scan)] == BRANCHJ 5532 /* Lookbehind, or need to calculate parens/evals/stclass: */ 5533 && (scan->flags || data || (flags & SCF_DO_STCLASS)) 5534 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) 5535 { 5536 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY 5537 || OP(scan) == UNLESSM ) 5538 { 5539 /* Negative Lookahead/lookbehind 5540 In this case we can't do fixed string optimisation. 5541 */ 5542 5543 SSize_t deltanext, minnext, fake = 0; 5544 regnode *nscan; 5545 regnode_ssc intrnl; 5546 int f = 0; 5547 5548 StructCopy(&zero_scan_data, &data_fake, scan_data_t); 5549 if (data) { 5550 data_fake.whilem_c = data->whilem_c; 5551 data_fake.last_closep = data->last_closep; 5552 } 5553 else 5554 data_fake.last_closep = &fake; 5555 data_fake.pos_delta = delta; 5556 if ( flags & SCF_DO_STCLASS && !scan->flags 5557 && OP(scan) == IFMATCH ) { /* Lookahead */ 5558 ssc_init(pRExC_state, &intrnl); 5559 data_fake.start_class = &intrnl; 5560 f |= SCF_DO_STCLASS_AND; 5561 } 5562 if (flags & SCF_WHILEM_VISITED_POS) 5563 f |= SCF_WHILEM_VISITED_POS; 5564 next = regnext(scan); 5565 nscan = NEXTOPER(NEXTOPER(scan)); 5566 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext, 5567 last, &data_fake, stopparen, 5568 recursed_depth, NULL, f, depth+1); 5569 if (scan->flags) { 5570 if (deltanext) { 5571 FAIL("Variable length lookbehind not implemented"); 5572 } 5573 else if (minnext > (I32)U8_MAX) { 5574 FAIL2("Lookbehind longer than %"UVuf" not implemented", 5575 (UV)U8_MAX); 5576 } 5577 scan->flags = (U8)minnext; 5578 } 5579 if (data) { 5580 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) 5581 pars++; 5582 if (data_fake.flags & SF_HAS_EVAL) 5583 data->flags |= SF_HAS_EVAL; 5584 data->whilem_c = data_fake.whilem_c; 5585 } 5586 if (f & SCF_DO_STCLASS_AND) { 5587 if (flags & SCF_DO_STCLASS_OR) { 5588 /* OR before, AND after: ideally we would recurse with 5589 * data_fake to get the AND applied by study of the 5590 * remainder of the pattern, and then derecurse; 5591 * *** HACK *** for now just treat as "no information". 5592 * See [perl #56690]. 5593 */ 5594 ssc_init(pRExC_state, data->start_class); 5595 } else { 5596 /* AND before and after: combine and continue. These 5597 * assertions are zero-length, so can match an EMPTY 5598 * string */ 5599 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl); 5600 ANYOF_FLAGS(data->start_class) 5601 |= SSC_MATCHES_EMPTY_STRING; 5602 } 5603 } 5604 } 5605 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY 5606 else { 5607 /* Positive Lookahead/lookbehind 5608 In this case we can do fixed string optimisation, 5609 but we must be careful about it. Note in the case of 5610 lookbehind the positions will be offset by the minimum 5611 length of the pattern, something we won't know about 5612 until after the recurse. 5613 */ 5614 SSize_t deltanext, fake = 0; 5615 regnode *nscan; 5616 regnode_ssc intrnl; 5617 int f = 0; 5618 /* We use SAVEFREEPV so that when the full compile 5619 is finished perl will clean up the allocated 5620 minlens when it's all done. This way we don't 5621 have to worry about freeing them when we know 5622 they wont be used, which would be a pain. 5623 */ 5624 SSize_t *minnextp; 5625 Newx( minnextp, 1, SSize_t ); 5626 SAVEFREEPV(minnextp); 5627 5628 if (data) { 5629 StructCopy(data, &data_fake, scan_data_t); 5630 if ((flags & SCF_DO_SUBSTR) && data->last_found) { 5631 f |= SCF_DO_SUBSTR; 5632 if (scan->flags) 5633 scan_commit(pRExC_state, &data_fake, minlenp, is_inf); 5634 data_fake.last_found=newSVsv(data->last_found); 5635 } 5636 } 5637 else 5638 data_fake.last_closep = &fake; 5639 data_fake.flags = 0; 5640 data_fake.pos_delta = delta; 5641 if (is_inf) 5642 data_fake.flags |= SF_IS_INF; 5643 if ( flags & SCF_DO_STCLASS && !scan->flags 5644 && OP(scan) == IFMATCH ) { /* Lookahead */ 5645 ssc_init(pRExC_state, &intrnl); 5646 data_fake.start_class = &intrnl; 5647 f |= SCF_DO_STCLASS_AND; 5648 } 5649 if (flags & SCF_WHILEM_VISITED_POS) 5650 f |= SCF_WHILEM_VISITED_POS; 5651 next = regnext(scan); 5652 nscan = NEXTOPER(NEXTOPER(scan)); 5653 5654 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, 5655 &deltanext, last, &data_fake, 5656 stopparen, recursed_depth, NULL, 5657 f,depth+1); 5658 if (scan->flags) { 5659 if (deltanext) { 5660 FAIL("Variable length lookbehind not implemented"); 5661 } 5662 else if (*minnextp > (I32)U8_MAX) { 5663 FAIL2("Lookbehind longer than %"UVuf" not implemented", 5664 (UV)U8_MAX); 5665 } 5666 scan->flags = (U8)*minnextp; 5667 } 5668 5669 *minnextp += min; 5670 5671 if (f & SCF_DO_STCLASS_AND) { 5672 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl); 5673 ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING; 5674 } 5675 if (data) { 5676 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) 5677 pars++; 5678 if (data_fake.flags & SF_HAS_EVAL) 5679 data->flags |= SF_HAS_EVAL; 5680 data->whilem_c = data_fake.whilem_c; 5681 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) { 5682 if (RExC_rx->minlen<*minnextp) 5683 RExC_rx->minlen=*minnextp; 5684 scan_commit(pRExC_state, &data_fake, minnextp, is_inf); 5685 SvREFCNT_dec_NN(data_fake.last_found); 5686 5687 if ( data_fake.minlen_fixed != minlenp ) 5688 { 5689 data->offset_fixed= data_fake.offset_fixed; 5690 data->minlen_fixed= data_fake.minlen_fixed; 5691 data->lookbehind_fixed+= scan->flags; 5692 } 5693 if ( data_fake.minlen_float != minlenp ) 5694 { 5695 data->minlen_float= data_fake.minlen_float; 5696 data->offset_float_min=data_fake.offset_float_min; 5697 data->offset_float_max=data_fake.offset_float_max; 5698 data->lookbehind_float+= scan->flags; 5699 } 5700 } 5701 } 5702 } 5703 #endif 5704 } 5705 else if (OP(scan) == OPEN) { 5706 if (stopparen != (I32)ARG(scan)) 5707 pars++; 5708 } 5709 else if (OP(scan) == CLOSE) { 5710 if (stopparen == (I32)ARG(scan)) { 5711 break; 5712 } 5713 if ((I32)ARG(scan) == is_par) { 5714 next = regnext(scan); 5715 5716 if ( next && (OP(next) != WHILEM) && next < last) 5717 is_par = 0; /* Disable optimization */ 5718 } 5719 if (data) 5720 *(data->last_closep) = ARG(scan); 5721 } 5722 else if (OP(scan) == EVAL) { 5723 if (data) 5724 data->flags |= SF_HAS_EVAL; 5725 } 5726 else if ( PL_regkind[OP(scan)] == ENDLIKE ) { 5727 if (flags & SCF_DO_SUBSTR) { 5728 scan_commit(pRExC_state, data, minlenp, is_inf); 5729 flags &= ~SCF_DO_SUBSTR; 5730 } 5731 if (data && OP(scan)==ACCEPT) { 5732 data->flags |= SCF_SEEN_ACCEPT; 5733 if (stopmin > min) 5734 stopmin = min; 5735 } 5736 } 5737 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */ 5738 { 5739 if (flags & SCF_DO_SUBSTR) { 5740 scan_commit(pRExC_state, data, minlenp, is_inf); 5741 data->longest = &(data->longest_float); 5742 } 5743 is_inf = is_inf_internal = 1; 5744 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ 5745 ssc_anything(data->start_class); 5746 flags &= ~SCF_DO_STCLASS; 5747 } 5748 else if (OP(scan) == GPOS) { 5749 if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) && 5750 !(delta || is_inf || (data && data->pos_delta))) 5751 { 5752 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR)) 5753 RExC_rx->intflags |= PREGf_ANCH_GPOS; 5754 if (RExC_rx->gofs < (STRLEN)min) 5755 RExC_rx->gofs = min; 5756 } else { 5757 RExC_rx->intflags |= PREGf_GPOS_FLOAT; 5758 RExC_rx->gofs = 0; 5759 } 5760 } 5761 #ifdef TRIE_STUDY_OPT 5762 #ifdef FULL_TRIE_STUDY 5763 else if (PL_regkind[OP(scan)] == TRIE) { 5764 /* NOTE - There is similar code to this block above for handling 5765 BRANCH nodes on the initial study. If you change stuff here 5766 check there too. */ 5767 regnode *trie_node= scan; 5768 regnode *tail= regnext(scan); 5769 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ]; 5770 SSize_t max1 = 0, min1 = SSize_t_MAX; 5771 regnode_ssc accum; 5772 5773 if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */ 5774 /* Cannot merge strings after this. */ 5775 scan_commit(pRExC_state, data, minlenp, is_inf); 5776 } 5777 if (flags & SCF_DO_STCLASS) 5778 ssc_init_zero(pRExC_state, &accum); 5779 5780 if (!trie->jump) { 5781 min1= trie->minlen; 5782 max1= trie->maxlen; 5783 } else { 5784 const regnode *nextbranch= NULL; 5785 U32 word; 5786 5787 for ( word=1 ; word <= trie->wordcount ; word++) 5788 { 5789 SSize_t deltanext=0, minnext=0, f = 0, fake; 5790 regnode_ssc this_class; 5791 5792 StructCopy(&zero_scan_data, &data_fake, scan_data_t); 5793 if (data) { 5794 data_fake.whilem_c = data->whilem_c; 5795 data_fake.last_closep = data->last_closep; 5796 } 5797 else 5798 data_fake.last_closep = &fake; 5799 data_fake.pos_delta = delta; 5800 if (flags & SCF_DO_STCLASS) { 5801 ssc_init(pRExC_state, &this_class); 5802 data_fake.start_class = &this_class; 5803 f = SCF_DO_STCLASS_AND; 5804 } 5805 if (flags & SCF_WHILEM_VISITED_POS) 5806 f |= SCF_WHILEM_VISITED_POS; 5807 5808 if (trie->jump[word]) { 5809 if (!nextbranch) 5810 nextbranch = trie_node + trie->jump[0]; 5811 scan= trie_node + trie->jump[word]; 5812 /* We go from the jump point to the branch that follows 5813 it. Note this means we need the vestigal unused 5814 branches even though they arent otherwise used. */ 5815 minnext = study_chunk(pRExC_state, &scan, minlenp, 5816 &deltanext, (regnode *)nextbranch, &data_fake, 5817 stopparen, recursed_depth, NULL, f,depth+1); 5818 } 5819 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH) 5820 nextbranch= regnext((regnode*)nextbranch); 5821 5822 if (min1 > (SSize_t)(minnext + trie->minlen)) 5823 min1 = minnext + trie->minlen; 5824 if (deltanext == SSize_t_MAX) { 5825 is_inf = is_inf_internal = 1; 5826 max1 = SSize_t_MAX; 5827 } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen)) 5828 max1 = minnext + deltanext + trie->maxlen; 5829 5830 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) 5831 pars++; 5832 if (data_fake.flags & SCF_SEEN_ACCEPT) { 5833 if ( stopmin > min + min1) 5834 stopmin = min + min1; 5835 flags &= ~SCF_DO_SUBSTR; 5836 if (data) 5837 data->flags |= SCF_SEEN_ACCEPT; 5838 } 5839 if (data) { 5840 if (data_fake.flags & SF_HAS_EVAL) 5841 data->flags |= SF_HAS_EVAL; 5842 data->whilem_c = data_fake.whilem_c; 5843 } 5844 if (flags & SCF_DO_STCLASS) 5845 ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class); 5846 } 5847 } 5848 if (flags & SCF_DO_SUBSTR) { 5849 data->pos_min += min1; 5850 data->pos_delta += max1 - min1; 5851 if (max1 != min1 || is_inf) 5852 data->longest = &(data->longest_float); 5853 } 5854 min += min1; 5855 if (delta != SSize_t_MAX) 5856 delta += max1 - min1; 5857 if (flags & SCF_DO_STCLASS_OR) { 5858 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum); 5859 if (min1) { 5860 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); 5861 flags &= ~SCF_DO_STCLASS; 5862 } 5863 } 5864 else if (flags & SCF_DO_STCLASS_AND) { 5865 if (min1) { 5866 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum); 5867 flags &= ~SCF_DO_STCLASS; 5868 } 5869 else { 5870 /* Switch to OR mode: cache the old value of 5871 * data->start_class */ 5872 INIT_AND_WITHP; 5873 StructCopy(data->start_class, and_withp, regnode_ssc); 5874 flags &= ~SCF_DO_STCLASS_AND; 5875 StructCopy(&accum, data->start_class, regnode_ssc); 5876 flags |= SCF_DO_STCLASS_OR; 5877 } 5878 } 5879 scan= tail; 5880 continue; 5881 } 5882 #else 5883 else if (PL_regkind[OP(scan)] == TRIE) { 5884 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ]; 5885 U8*bang=NULL; 5886 5887 min += trie->minlen; 5888 delta += (trie->maxlen - trie->minlen); 5889 flags &= ~SCF_DO_STCLASS; /* xxx */ 5890 if (flags & SCF_DO_SUBSTR) { 5891 /* Cannot expect anything... */ 5892 scan_commit(pRExC_state, data, minlenp, is_inf); 5893 data->pos_min += trie->minlen; 5894 data->pos_delta += (trie->maxlen - trie->minlen); 5895 if (trie->maxlen != trie->minlen) 5896 data->longest = &(data->longest_float); 5897 } 5898 if (trie->jump) /* no more substrings -- for now /grr*/ 5899 flags &= ~SCF_DO_SUBSTR; 5900 } 5901 #endif /* old or new */ 5902 #endif /* TRIE_STUDY_OPT */ 5903 5904 /* Else: zero-length, ignore. */ 5905 scan = regnext(scan); 5906 } 5907 5908 finish: 5909 if (frame) { 5910 /* we need to unwind recursion. */ 5911 depth = depth - 1; 5912 5913 DEBUG_STUDYDATA("frame-end:",data,depth); 5914 DEBUG_PEEP("fend", scan, depth); 5915 5916 /* restore previous context */ 5917 last = frame->last_regnode; 5918 scan = frame->next_regnode; 5919 stopparen = frame->stopparen; 5920 recursed_depth = frame->prev_recursed_depth; 5921 5922 RExC_frame_last = frame->prev_frame; 5923 frame = frame->this_prev_frame; 5924 goto fake_study_recurse; 5925 } 5926 5927 assert(!frame); 5928 DEBUG_STUDYDATA("pre-fin:",data,depth); 5929 5930 *scanp = scan; 5931 *deltap = is_inf_internal ? SSize_t_MAX : delta; 5932 5933 if (flags & SCF_DO_SUBSTR && is_inf) 5934 data->pos_delta = SSize_t_MAX - data->pos_min; 5935 if (is_par > (I32)U8_MAX) 5936 is_par = 0; 5937 if (is_par && pars==1 && data) { 5938 data->flags |= SF_IN_PAR; 5939 data->flags &= ~SF_HAS_PAR; 5940 } 5941 else if (pars && data) { 5942 data->flags |= SF_HAS_PAR; 5943 data->flags &= ~SF_IN_PAR; 5944 } 5945 if (flags & SCF_DO_STCLASS_OR) 5946 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); 5947 if (flags & SCF_TRIE_RESTUDY) 5948 data->flags |= SCF_TRIE_RESTUDY; 5949 5950 DEBUG_STUDYDATA("post-fin:",data,depth); 5951 5952 { 5953 SSize_t final_minlen= min < stopmin ? min : stopmin; 5954 5955 if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) { 5956 if (final_minlen > SSize_t_MAX - delta) 5957 RExC_maxlen = SSize_t_MAX; 5958 else if (RExC_maxlen < final_minlen + delta) 5959 RExC_maxlen = final_minlen + delta; 5960 } 5961 return final_minlen; 5962 } 5963 NOT_REACHED; /* NOTREACHED */ 5964 } 5965 5966 STATIC U32 5967 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n) 5968 { 5969 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0; 5970 5971 PERL_ARGS_ASSERT_ADD_DATA; 5972 5973 Renewc(RExC_rxi->data, 5974 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1), 5975 char, struct reg_data); 5976 if(count) 5977 Renew(RExC_rxi->data->what, count + n, U8); 5978 else 5979 Newx(RExC_rxi->data->what, n, U8); 5980 RExC_rxi->data->count = count + n; 5981 Copy(s, RExC_rxi->data->what + count, n, U8); 5982 return count; 5983 } 5984 5985 /*XXX: todo make this not included in a non debugging perl, but appears to be 5986 * used anyway there, in 'use re' */ 5987 #ifndef PERL_IN_XSUB_RE 5988 void 5989 Perl_reginitcolors(pTHX) 5990 { 5991 const char * const s = PerlEnv_getenv("PERL_RE_COLORS"); 5992 if (s) { 5993 char *t = savepv(s); 5994 int i = 0; 5995 PL_colors[0] = t; 5996 while (++i < 6) { 5997 t = strchr(t, '\t'); 5998 if (t) { 5999 *t = '\0'; 6000 PL_colors[i] = ++t; 6001 } 6002 else 6003 PL_colors[i] = t = (char *)""; 6004 } 6005 } else { 6006 int i = 0; 6007 while (i < 6) 6008 PL_colors[i++] = (char *)""; 6009 } 6010 PL_colorset = 1; 6011 } 6012 #endif 6013 6014 6015 #ifdef TRIE_STUDY_OPT 6016 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \ 6017 STMT_START { \ 6018 if ( \ 6019 (data.flags & SCF_TRIE_RESTUDY) \ 6020 && ! restudied++ \ 6021 ) { \ 6022 dOsomething; \ 6023 goto reStudy; \ 6024 } \ 6025 } STMT_END 6026 #else 6027 #define CHECK_RESTUDY_GOTO_butfirst 6028 #endif 6029 6030 /* 6031 * pregcomp - compile a regular expression into internal code 6032 * 6033 * Decides which engine's compiler to call based on the hint currently in 6034 * scope 6035 */ 6036 6037 #ifndef PERL_IN_XSUB_RE 6038 6039 /* return the currently in-scope regex engine (or the default if none) */ 6040 6041 regexp_engine const * 6042 Perl_current_re_engine(pTHX) 6043 { 6044 if (IN_PERL_COMPILETIME) { 6045 HV * const table = GvHV(PL_hintgv); 6046 SV **ptr; 6047 6048 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) 6049 return &PL_core_reg_engine; 6050 ptr = hv_fetchs(table, "regcomp", FALSE); 6051 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr))) 6052 return &PL_core_reg_engine; 6053 return INT2PTR(regexp_engine*,SvIV(*ptr)); 6054 } 6055 else { 6056 SV *ptr; 6057 if (!PL_curcop->cop_hints_hash) 6058 return &PL_core_reg_engine; 6059 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0); 6060 if ( !(ptr && SvIOK(ptr) && SvIV(ptr))) 6061 return &PL_core_reg_engine; 6062 return INT2PTR(regexp_engine*,SvIV(ptr)); 6063 } 6064 } 6065 6066 6067 REGEXP * 6068 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags) 6069 { 6070 regexp_engine const *eng = current_re_engine(); 6071 GET_RE_DEBUG_FLAGS_DECL; 6072 6073 PERL_ARGS_ASSERT_PREGCOMP; 6074 6075 /* Dispatch a request to compile a regexp to correct regexp engine. */ 6076 DEBUG_COMPILE_r({ 6077 Perl_re_printf( aTHX_ "Using engine %"UVxf"\n", 6078 PTR2UV(eng)); 6079 }); 6080 return CALLREGCOMP_ENG(eng, pattern, flags); 6081 } 6082 #endif 6083 6084 /* public(ish) entry point for the perl core's own regex compiling code. 6085 * It's actually a wrapper for Perl_re_op_compile that only takes an SV 6086 * pattern rather than a list of OPs, and uses the internal engine rather 6087 * than the current one */ 6088 6089 REGEXP * 6090 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags) 6091 { 6092 SV *pat = pattern; /* defeat constness! */ 6093 PERL_ARGS_ASSERT_RE_COMPILE; 6094 return Perl_re_op_compile(aTHX_ &pat, 1, NULL, 6095 #ifdef PERL_IN_XSUB_RE 6096 &my_reg_engine, 6097 #else 6098 &PL_core_reg_engine, 6099 #endif 6100 NULL, NULL, rx_flags, 0); 6101 } 6102 6103 6104 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code 6105 * blocks, recalculate the indices. Update pat_p and plen_p in-place to 6106 * point to the realloced string and length. 6107 * 6108 * This is essentially a copy of Perl_bytes_to_utf8() with the code index 6109 * stuff added */ 6110 6111 static void 6112 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state, 6113 char **pat_p, STRLEN *plen_p, int num_code_blocks) 6114 { 6115 U8 *const src = (U8*)*pat_p; 6116 U8 *dst, *d; 6117 int n=0; 6118 STRLEN s = 0; 6119 bool do_end = 0; 6120 GET_RE_DEBUG_FLAGS_DECL; 6121 6122 DEBUG_PARSE_r(Perl_re_printf( aTHX_ 6123 "UTF8 mismatch! Converting to utf8 for resizing and compile\n")); 6124 6125 Newx(dst, *plen_p * 2 + 1, U8); 6126 d = dst; 6127 6128 while (s < *plen_p) { 6129 append_utf8_from_native_byte(src[s], &d); 6130 if (n < num_code_blocks) { 6131 if (!do_end && pRExC_state->code_blocks[n].start == s) { 6132 pRExC_state->code_blocks[n].start = d - dst - 1; 6133 assert(*(d - 1) == '('); 6134 do_end = 1; 6135 } 6136 else if (do_end && pRExC_state->code_blocks[n].end == s) { 6137 pRExC_state->code_blocks[n].end = d - dst - 1; 6138 assert(*(d - 1) == ')'); 6139 do_end = 0; 6140 n++; 6141 } 6142 } 6143 s++; 6144 } 6145 *d = '\0'; 6146 *plen_p = d - dst; 6147 *pat_p = (char*) dst; 6148 SAVEFREEPV(*pat_p); 6149 RExC_orig_utf8 = RExC_utf8 = 1; 6150 } 6151 6152 6153 6154 /* S_concat_pat(): concatenate a list of args to the pattern string pat, 6155 * while recording any code block indices, and handling overloading, 6156 * nested qr// objects etc. If pat is null, it will allocate a new 6157 * string, or just return the first arg, if there's only one. 6158 * 6159 * Returns the malloced/updated pat. 6160 * patternp and pat_count is the array of SVs to be concatted; 6161 * oplist is the optional list of ops that generated the SVs; 6162 * recompile_p is a pointer to a boolean that will be set if 6163 * the regex will need to be recompiled. 6164 * delim, if non-null is an SV that will be inserted between each element 6165 */ 6166 6167 static SV* 6168 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state, 6169 SV *pat, SV ** const patternp, int pat_count, 6170 OP *oplist, bool *recompile_p, SV *delim) 6171 { 6172 SV **svp; 6173 int n = 0; 6174 bool use_delim = FALSE; 6175 bool alloced = FALSE; 6176 6177 /* if we know we have at least two args, create an empty string, 6178 * then concatenate args to that. For no args, return an empty string */ 6179 if (!pat && pat_count != 1) { 6180 pat = newSVpvs(""); 6181 SAVEFREESV(pat); 6182 alloced = TRUE; 6183 } 6184 6185 for (svp = patternp; svp < patternp + pat_count; svp++) { 6186 SV *sv; 6187 SV *rx = NULL; 6188 STRLEN orig_patlen = 0; 6189 bool code = 0; 6190 SV *msv = use_delim ? delim : *svp; 6191 if (!msv) msv = &PL_sv_undef; 6192 6193 /* if we've got a delimiter, we go round the loop twice for each 6194 * svp slot (except the last), using the delimiter the second 6195 * time round */ 6196 if (use_delim) { 6197 svp--; 6198 use_delim = FALSE; 6199 } 6200 else if (delim) 6201 use_delim = TRUE; 6202 6203 if (SvTYPE(msv) == SVt_PVAV) { 6204 /* we've encountered an interpolated array within 6205 * the pattern, e.g. /...@a..../. Expand the list of elements, 6206 * then recursively append elements. 6207 * The code in this block is based on S_pushav() */ 6208 6209 AV *const av = (AV*)msv; 6210 const SSize_t maxarg = AvFILL(av) + 1; 6211 SV **array; 6212 6213 if (oplist) { 6214 assert(oplist->op_type == OP_PADAV 6215 || oplist->op_type == OP_RV2AV); 6216 oplist = OpSIBLING(oplist); 6217 } 6218 6219 if (SvRMAGICAL(av)) { 6220 SSize_t i; 6221 6222 Newx(array, maxarg, SV*); 6223 SAVEFREEPV(array); 6224 for (i=0; i < maxarg; i++) { 6225 SV ** const svp = av_fetch(av, i, FALSE); 6226 array[i] = svp ? *svp : &PL_sv_undef; 6227 } 6228 } 6229 else 6230 array = AvARRAY(av); 6231 6232 pat = S_concat_pat(aTHX_ pRExC_state, pat, 6233 array, maxarg, NULL, recompile_p, 6234 /* $" */ 6235 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV)))); 6236 6237 continue; 6238 } 6239 6240 6241 /* we make the assumption here that each op in the list of 6242 * op_siblings maps to one SV pushed onto the stack, 6243 * except for code blocks, with have both an OP_NULL and 6244 * and OP_CONST. 6245 * This allows us to match up the list of SVs against the 6246 * list of OPs to find the next code block. 6247 * 6248 * Note that PUSHMARK PADSV PADSV .. 6249 * is optimised to 6250 * PADRANGE PADSV PADSV .. 6251 * so the alignment still works. */ 6252 6253 if (oplist) { 6254 if (oplist->op_type == OP_NULL 6255 && (oplist->op_flags & OPf_SPECIAL)) 6256 { 6257 assert(n < pRExC_state->num_code_blocks); 6258 pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0; 6259 pRExC_state->code_blocks[n].block = oplist; 6260 pRExC_state->code_blocks[n].src_regex = NULL; 6261 n++; 6262 code = 1; 6263 oplist = OpSIBLING(oplist); /* skip CONST */ 6264 assert(oplist); 6265 } 6266 oplist = OpSIBLING(oplist);; 6267 } 6268 6269 /* apply magic and QR overloading to arg */ 6270 6271 SvGETMAGIC(msv); 6272 if (SvROK(msv) && SvAMAGIC(msv)) { 6273 SV *sv = AMG_CALLunary(msv, regexp_amg); 6274 if (sv) { 6275 if (SvROK(sv)) 6276 sv = SvRV(sv); 6277 if (SvTYPE(sv) != SVt_REGEXP) 6278 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP"); 6279 msv = sv; 6280 } 6281 } 6282 6283 /* try concatenation overload ... */ 6284 if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) && 6285 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign))) 6286 { 6287 sv_setsv(pat, sv); 6288 /* overloading involved: all bets are off over literal 6289 * code. Pretend we haven't seen it */ 6290 pRExC_state->num_code_blocks -= n; 6291 n = 0; 6292 } 6293 else { 6294 /* ... or failing that, try "" overload */ 6295 while (SvAMAGIC(msv) 6296 && (sv = AMG_CALLunary(msv, string_amg)) 6297 && sv != msv 6298 && !( SvROK(msv) 6299 && SvROK(sv) 6300 && SvRV(msv) == SvRV(sv)) 6301 ) { 6302 msv = sv; 6303 SvGETMAGIC(msv); 6304 } 6305 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP) 6306 msv = SvRV(msv); 6307 6308 if (pat) { 6309 /* this is a partially unrolled 6310 * sv_catsv_nomg(pat, msv); 6311 * that allows us to adjust code block indices if 6312 * needed */ 6313 STRLEN dlen; 6314 char *dst = SvPV_force_nomg(pat, dlen); 6315 orig_patlen = dlen; 6316 if (SvUTF8(msv) && !SvUTF8(pat)) { 6317 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n); 6318 sv_setpvn(pat, dst, dlen); 6319 SvUTF8_on(pat); 6320 } 6321 sv_catsv_nomg(pat, msv); 6322 rx = msv; 6323 } 6324 else 6325 pat = msv; 6326 6327 if (code) 6328 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1; 6329 } 6330 6331 /* extract any code blocks within any embedded qr//'s */ 6332 if (rx && SvTYPE(rx) == SVt_REGEXP 6333 && RX_ENGINE((REGEXP*)rx)->op_comp) 6334 { 6335 6336 RXi_GET_DECL(ReANY((REGEXP *)rx), ri); 6337 if (ri->num_code_blocks) { 6338 int i; 6339 /* the presence of an embedded qr// with code means 6340 * we should always recompile: the text of the 6341 * qr// may not have changed, but it may be a 6342 * different closure than last time */ 6343 *recompile_p = 1; 6344 Renew(pRExC_state->code_blocks, 6345 pRExC_state->num_code_blocks + ri->num_code_blocks, 6346 struct reg_code_block); 6347 pRExC_state->num_code_blocks += ri->num_code_blocks; 6348 6349 for (i=0; i < ri->num_code_blocks; i++) { 6350 struct reg_code_block *src, *dst; 6351 STRLEN offset = orig_patlen 6352 + ReANY((REGEXP *)rx)->pre_prefix; 6353 assert(n < pRExC_state->num_code_blocks); 6354 src = &ri->code_blocks[i]; 6355 dst = &pRExC_state->code_blocks[n]; 6356 dst->start = src->start + offset; 6357 dst->end = src->end + offset; 6358 dst->block = src->block; 6359 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*) 6360 src->src_regex 6361 ? src->src_regex 6362 : (REGEXP*)rx); 6363 n++; 6364 } 6365 } 6366 } 6367 } 6368 /* avoid calling magic multiple times on a single element e.g. =~ $qr */ 6369 if (alloced) 6370 SvSETMAGIC(pat); 6371 6372 return pat; 6373 } 6374 6375 6376 6377 /* see if there are any run-time code blocks in the pattern. 6378 * False positives are allowed */ 6379 6380 static bool 6381 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state, 6382 char *pat, STRLEN plen) 6383 { 6384 int n = 0; 6385 STRLEN s; 6386 6387 PERL_UNUSED_CONTEXT; 6388 6389 for (s = 0; s < plen; s++) { 6390 if (n < pRExC_state->num_code_blocks 6391 && s == pRExC_state->code_blocks[n].start) 6392 { 6393 s = pRExC_state->code_blocks[n].end; 6394 n++; 6395 continue; 6396 } 6397 /* TODO ideally should handle [..], (#..), /#.../x to reduce false 6398 * positives here */ 6399 if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' && 6400 (pat[s+2] == '{' 6401 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{')) 6402 ) 6403 return 1; 6404 } 6405 return 0; 6406 } 6407 6408 /* Handle run-time code blocks. We will already have compiled any direct 6409 * or indirect literal code blocks. Now, take the pattern 'pat' and make a 6410 * copy of it, but with any literal code blocks blanked out and 6411 * appropriate chars escaped; then feed it into 6412 * 6413 * eval "qr'modified_pattern'" 6414 * 6415 * For example, 6416 * 6417 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno 6418 * 6419 * becomes 6420 * 6421 * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno' 6422 * 6423 * After eval_sv()-ing that, grab any new code blocks from the returned qr 6424 * and merge them with any code blocks of the original regexp. 6425 * 6426 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge; 6427 * instead, just save the qr and return FALSE; this tells our caller that 6428 * the original pattern needs upgrading to utf8. 6429 */ 6430 6431 static bool 6432 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state, 6433 char *pat, STRLEN plen) 6434 { 6435 SV *qr; 6436 6437 GET_RE_DEBUG_FLAGS_DECL; 6438 6439 if (pRExC_state->runtime_code_qr) { 6440 /* this is the second time we've been called; this should 6441 * only happen if the main pattern got upgraded to utf8 6442 * during compilation; re-use the qr we compiled first time 6443 * round (which should be utf8 too) 6444 */ 6445 qr = pRExC_state->runtime_code_qr; 6446 pRExC_state->runtime_code_qr = NULL; 6447 assert(RExC_utf8 && SvUTF8(qr)); 6448 } 6449 else { 6450 int n = 0; 6451 STRLEN s; 6452 char *p, *newpat; 6453 int newlen = plen + 6; /* allow for "qr''x\0" extra chars */ 6454 SV *sv, *qr_ref; 6455 dSP; 6456 6457 /* determine how many extra chars we need for ' and \ escaping */ 6458 for (s = 0; s < plen; s++) { 6459 if (pat[s] == '\'' || pat[s] == '\\') 6460 newlen++; 6461 } 6462 6463 Newx(newpat, newlen, char); 6464 p = newpat; 6465 *p++ = 'q'; *p++ = 'r'; *p++ = '\''; 6466 6467 for (s = 0; s < plen; s++) { 6468 if (n < pRExC_state->num_code_blocks 6469 && s == pRExC_state->code_blocks[n].start) 6470 { 6471 /* blank out literal code block */ 6472 assert(pat[s] == '('); 6473 while (s <= pRExC_state->code_blocks[n].end) { 6474 *p++ = '_'; 6475 s++; 6476 } 6477 s--; 6478 n++; 6479 continue; 6480 } 6481 if (pat[s] == '\'' || pat[s] == '\\') 6482 *p++ = '\\'; 6483 *p++ = pat[s]; 6484 } 6485 *p++ = '\''; 6486 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED) 6487 *p++ = 'x'; 6488 *p++ = '\0'; 6489 DEBUG_COMPILE_r({ 6490 Perl_re_printf( aTHX_ 6491 "%sre-parsing pattern for runtime code:%s %s\n", 6492 PL_colors[4],PL_colors[5],newpat); 6493 }); 6494 6495 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0); 6496 Safefree(newpat); 6497 6498 ENTER; 6499 SAVETMPS; 6500 save_re_context(); 6501 PUSHSTACKi(PERLSI_REQUIRE); 6502 /* G_RE_REPARSING causes the toker to collapse \\ into \ when 6503 * parsing qr''; normally only q'' does this. It also alters 6504 * hints handling */ 6505 eval_sv(sv, G_SCALAR|G_RE_REPARSING); 6506 SvREFCNT_dec_NN(sv); 6507 SPAGAIN; 6508 qr_ref = POPs; 6509 PUTBACK; 6510 { 6511 SV * const errsv = ERRSV; 6512 if (SvTRUE_NN(errsv)) 6513 { 6514 Safefree(pRExC_state->code_blocks); 6515 /* use croak_sv ? */ 6516 Perl_croak_nocontext("%"SVf, SVfARG(errsv)); 6517 } 6518 } 6519 assert(SvROK(qr_ref)); 6520 qr = SvRV(qr_ref); 6521 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp); 6522 /* the leaving below frees the tmp qr_ref. 6523 * Give qr a life of its own */ 6524 SvREFCNT_inc(qr); 6525 POPSTACK; 6526 FREETMPS; 6527 LEAVE; 6528 6529 } 6530 6531 if (!RExC_utf8 && SvUTF8(qr)) { 6532 /* first time through; the pattern got upgraded; save the 6533 * qr for the next time through */ 6534 assert(!pRExC_state->runtime_code_qr); 6535 pRExC_state->runtime_code_qr = qr; 6536 return 0; 6537 } 6538 6539 6540 /* extract any code blocks within the returned qr// */ 6541 6542 6543 /* merge the main (r1) and run-time (r2) code blocks into one */ 6544 { 6545 RXi_GET_DECL(ReANY((REGEXP *)qr), r2); 6546 struct reg_code_block *new_block, *dst; 6547 RExC_state_t * const r1 = pRExC_state; /* convenient alias */ 6548 int i1 = 0, i2 = 0; 6549 6550 if (!r2->num_code_blocks) /* we guessed wrong */ 6551 { 6552 SvREFCNT_dec_NN(qr); 6553 return 1; 6554 } 6555 6556 Newx(new_block, 6557 r1->num_code_blocks + r2->num_code_blocks, 6558 struct reg_code_block); 6559 dst = new_block; 6560 6561 while ( i1 < r1->num_code_blocks 6562 || i2 < r2->num_code_blocks) 6563 { 6564 struct reg_code_block *src; 6565 bool is_qr = 0; 6566 6567 if (i1 == r1->num_code_blocks) { 6568 src = &r2->code_blocks[i2++]; 6569 is_qr = 1; 6570 } 6571 else if (i2 == r2->num_code_blocks) 6572 src = &r1->code_blocks[i1++]; 6573 else if ( r1->code_blocks[i1].start 6574 < r2->code_blocks[i2].start) 6575 { 6576 src = &r1->code_blocks[i1++]; 6577 assert(src->end < r2->code_blocks[i2].start); 6578 } 6579 else { 6580 assert( r1->code_blocks[i1].start 6581 > r2->code_blocks[i2].start); 6582 src = &r2->code_blocks[i2++]; 6583 is_qr = 1; 6584 assert(src->end < r1->code_blocks[i1].start); 6585 } 6586 6587 assert(pat[src->start] == '('); 6588 assert(pat[src->end] == ')'); 6589 dst->start = src->start; 6590 dst->end = src->end; 6591 dst->block = src->block; 6592 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr) 6593 : src->src_regex; 6594 dst++; 6595 } 6596 r1->num_code_blocks += r2->num_code_blocks; 6597 Safefree(r1->code_blocks); 6598 r1->code_blocks = new_block; 6599 } 6600 6601 SvREFCNT_dec_NN(qr); 6602 return 1; 6603 } 6604 6605 6606 STATIC bool 6607 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, 6608 SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift, 6609 SSize_t lookbehind, SSize_t offset, SSize_t *minlen, 6610 STRLEN longest_length, bool eol, bool meol) 6611 { 6612 /* This is the common code for setting up the floating and fixed length 6613 * string data extracted from Perl_re_op_compile() below. Returns a boolean 6614 * as to whether succeeded or not */ 6615 6616 I32 t; 6617 SSize_t ml; 6618 6619 if (! (longest_length 6620 || (eol /* Can't have SEOL and MULTI */ 6621 && (! meol || (RExC_flags & RXf_PMf_MULTILINE))) 6622 ) 6623 /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */ 6624 || (RExC_seen & REG_UNFOLDED_MULTI_SEEN)) 6625 { 6626 return FALSE; 6627 } 6628 6629 /* copy the information about the longest from the reg_scan_data 6630 over to the program. */ 6631 if (SvUTF8(sv_longest)) { 6632 *rx_utf8 = sv_longest; 6633 *rx_substr = NULL; 6634 } else { 6635 *rx_substr = sv_longest; 6636 *rx_utf8 = NULL; 6637 } 6638 /* end_shift is how many chars that must be matched that 6639 follow this item. We calculate it ahead of time as once the 6640 lookbehind offset is added in we lose the ability to correctly 6641 calculate it.*/ 6642 ml = minlen ? *(minlen) : (SSize_t)longest_length; 6643 *rx_end_shift = ml - offset 6644 - longest_length + (SvTAIL(sv_longest) != 0) 6645 + lookbehind; 6646 6647 t = (eol/* Can't have SEOL and MULTI */ 6648 && (! meol || (RExC_flags & RXf_PMf_MULTILINE))); 6649 fbm_compile(sv_longest, t ? FBMcf_TAIL : 0); 6650 6651 return TRUE; 6652 } 6653 6654 /* 6655 * Perl_re_op_compile - the perl internal RE engine's function to compile a 6656 * regular expression into internal code. 6657 * The pattern may be passed either as: 6658 * a list of SVs (patternp plus pat_count) 6659 * a list of OPs (expr) 6660 * If both are passed, the SV list is used, but the OP list indicates 6661 * which SVs are actually pre-compiled code blocks 6662 * 6663 * The SVs in the list have magic and qr overloading applied to them (and 6664 * the list may be modified in-place with replacement SVs in the latter 6665 * case). 6666 * 6667 * If the pattern hasn't changed from old_re, then old_re will be 6668 * returned. 6669 * 6670 * eng is the current engine. If that engine has an op_comp method, then 6671 * handle directly (i.e. we assume that op_comp was us); otherwise, just 6672 * do the initial concatenation of arguments and pass on to the external 6673 * engine. 6674 * 6675 * If is_bare_re is not null, set it to a boolean indicating whether the 6676 * arg list reduced (after overloading) to a single bare regex which has 6677 * been returned (i.e. /$qr/). 6678 * 6679 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details. 6680 * 6681 * pm_flags contains the PMf_* flags, typically based on those from the 6682 * pm_flags field of the related PMOP. Currently we're only interested in 6683 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL. 6684 * 6685 * We can't allocate space until we know how big the compiled form will be, 6686 * but we can't compile it (and thus know how big it is) until we've got a 6687 * place to put the code. So we cheat: we compile it twice, once with code 6688 * generation turned off and size counting turned on, and once "for real". 6689 * This also means that we don't allocate space until we are sure that the 6690 * thing really will compile successfully, and we never have to move the 6691 * code and thus invalidate pointers into it. (Note that it has to be in 6692 * one piece because free() must be able to free it all.) [NB: not true in perl] 6693 * 6694 * Beware that the optimization-preparation code in here knows about some 6695 * of the structure of the compiled regexp. [I'll say.] 6696 */ 6697 6698 REGEXP * 6699 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, 6700 OP *expr, const regexp_engine* eng, REGEXP *old_re, 6701 bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags) 6702 { 6703 REGEXP *rx; 6704 struct regexp *r; 6705 regexp_internal *ri; 6706 STRLEN plen; 6707 char *exp; 6708 regnode *scan; 6709 I32 flags; 6710 SSize_t minlen = 0; 6711 U32 rx_flags; 6712 SV *pat; 6713 SV *code_blocksv = NULL; 6714 SV** new_patternp = patternp; 6715 6716 /* these are all flags - maybe they should be turned 6717 * into a single int with different bit masks */ 6718 I32 sawlookahead = 0; 6719 I32 sawplus = 0; 6720 I32 sawopen = 0; 6721 I32 sawminmod = 0; 6722 6723 regex_charset initial_charset = get_regex_charset(orig_rx_flags); 6724 bool recompile = 0; 6725 bool runtime_code = 0; 6726 scan_data_t data; 6727 RExC_state_t RExC_state; 6728 RExC_state_t * const pRExC_state = &RExC_state; 6729 #ifdef TRIE_STUDY_OPT 6730 int restudied = 0; 6731 RExC_state_t copyRExC_state; 6732 #endif 6733 GET_RE_DEBUG_FLAGS_DECL; 6734 6735 PERL_ARGS_ASSERT_RE_OP_COMPILE; 6736 6737 DEBUG_r(if (!PL_colorset) reginitcolors()); 6738 6739 /* Initialize these here instead of as-needed, as is quick and avoids 6740 * having to test them each time otherwise */ 6741 if (! PL_AboveLatin1) { 6742 #ifdef DEBUGGING 6743 char * dump_len_string; 6744 #endif 6745 6746 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist); 6747 PL_Latin1 = _new_invlist_C_array(Latin1_invlist); 6748 PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist); 6749 PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist); 6750 PL_HasMultiCharFold = 6751 _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist); 6752 6753 /* This is calculated here, because the Perl program that generates the 6754 * static global ones doesn't currently have access to 6755 * NUM_ANYOF_CODE_POINTS */ 6756 PL_InBitmap = _new_invlist(2); 6757 PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0, 6758 NUM_ANYOF_CODE_POINTS - 1); 6759 #ifdef DEBUGGING 6760 dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN"); 6761 if ( ! dump_len_string 6762 || ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL)) 6763 { 6764 PL_dump_re_max_len = 0; 6765 } 6766 #endif 6767 } 6768 6769 pRExC_state->warn_text = NULL; 6770 pRExC_state->code_blocks = NULL; 6771 pRExC_state->num_code_blocks = 0; 6772 6773 if (is_bare_re) 6774 *is_bare_re = FALSE; 6775 6776 if (expr && (expr->op_type == OP_LIST || 6777 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) { 6778 /* allocate code_blocks if needed */ 6779 OP *o; 6780 int ncode = 0; 6781 6782 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) 6783 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) 6784 ncode++; /* count of DO blocks */ 6785 if (ncode) { 6786 pRExC_state->num_code_blocks = ncode; 6787 Newx(pRExC_state->code_blocks, ncode, struct reg_code_block); 6788 } 6789 } 6790 6791 if (!pat_count) { 6792 /* compile-time pattern with just OP_CONSTs and DO blocks */ 6793 6794 int n; 6795 OP *o; 6796 6797 /* find how many CONSTs there are */ 6798 assert(expr); 6799 n = 0; 6800 if (expr->op_type == OP_CONST) 6801 n = 1; 6802 else 6803 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) { 6804 if (o->op_type == OP_CONST) 6805 n++; 6806 } 6807 6808 /* fake up an SV array */ 6809 6810 assert(!new_patternp); 6811 Newx(new_patternp, n, SV*); 6812 SAVEFREEPV(new_patternp); 6813 pat_count = n; 6814 6815 n = 0; 6816 if (expr->op_type == OP_CONST) 6817 new_patternp[n] = cSVOPx_sv(expr); 6818 else 6819 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) { 6820 if (o->op_type == OP_CONST) 6821 new_patternp[n++] = cSVOPo_sv; 6822 } 6823 6824 } 6825 6826 DEBUG_PARSE_r(Perl_re_printf( aTHX_ 6827 "Assembling pattern from %d elements%s\n", pat_count, 6828 orig_rx_flags & RXf_SPLIT ? " for split" : "")); 6829 6830 /* set expr to the first arg op */ 6831 6832 if (pRExC_state->num_code_blocks 6833 && expr->op_type != OP_CONST) 6834 { 6835 expr = cLISTOPx(expr)->op_first; 6836 assert( expr->op_type == OP_PUSHMARK 6837 || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK) 6838 || expr->op_type == OP_PADRANGE); 6839 expr = OpSIBLING(expr); 6840 } 6841 6842 pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count, 6843 expr, &recompile, NULL); 6844 6845 /* handle bare (possibly after overloading) regex: foo =~ $re */ 6846 { 6847 SV *re = pat; 6848 if (SvROK(re)) 6849 re = SvRV(re); 6850 if (SvTYPE(re) == SVt_REGEXP) { 6851 if (is_bare_re) 6852 *is_bare_re = TRUE; 6853 SvREFCNT_inc(re); 6854 Safefree(pRExC_state->code_blocks); 6855 DEBUG_PARSE_r(Perl_re_printf( aTHX_ 6856 "Precompiled pattern%s\n", 6857 orig_rx_flags & RXf_SPLIT ? " for split" : "")); 6858 6859 return (REGEXP*)re; 6860 } 6861 } 6862 6863 exp = SvPV_nomg(pat, plen); 6864 6865 if (!eng->op_comp) { 6866 if ((SvUTF8(pat) && IN_BYTES) 6867 || SvGMAGICAL(pat) || SvAMAGIC(pat)) 6868 { 6869 /* make a temporary copy; either to convert to bytes, 6870 * or to avoid repeating get-magic / overloaded stringify */ 6871 pat = newSVpvn_flags(exp, plen, SVs_TEMP | 6872 (IN_BYTES ? 0 : SvUTF8(pat))); 6873 } 6874 Safefree(pRExC_state->code_blocks); 6875 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags); 6876 } 6877 6878 /* ignore the utf8ness if the pattern is 0 length */ 6879 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat); 6880 6881 RExC_uni_semantics = 0; 6882 RExC_seen_unfolded_sharp_s = 0; 6883 RExC_contains_locale = 0; 6884 RExC_contains_i = 0; 6885 RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT); 6886 RExC_study_started = 0; 6887 pRExC_state->runtime_code_qr = NULL; 6888 RExC_frame_head= NULL; 6889 RExC_frame_last= NULL; 6890 RExC_frame_count= 0; 6891 6892 DEBUG_r({ 6893 RExC_mysv1= sv_newmortal(); 6894 RExC_mysv2= sv_newmortal(); 6895 }); 6896 DEBUG_COMPILE_r({ 6897 SV *dsv= sv_newmortal(); 6898 RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60); 6899 Perl_re_printf( aTHX_ "%sCompiling REx%s %s\n", 6900 PL_colors[4],PL_colors[5],s); 6901 }); 6902 6903 redo_first_pass: 6904 /* we jump here if we have to recompile, e.g., from upgrading the pattern 6905 * to utf8 */ 6906 6907 if ((pm_flags & PMf_USE_RE_EVAL) 6908 /* this second condition covers the non-regex literal case, 6909 * i.e. $foo =~ '(?{})'. */ 6910 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL)) 6911 ) 6912 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen); 6913 6914 /* return old regex if pattern hasn't changed */ 6915 /* XXX: note in the below we have to check the flags as well as the 6916 * pattern. 6917 * 6918 * Things get a touch tricky as we have to compare the utf8 flag 6919 * independently from the compile flags. */ 6920 6921 if ( old_re 6922 && !recompile 6923 && !!RX_UTF8(old_re) == !!RExC_utf8 6924 && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) ) 6925 && RX_PRECOMP(old_re) 6926 && RX_PRELEN(old_re) == plen 6927 && memEQ(RX_PRECOMP(old_re), exp, plen) 6928 && !runtime_code /* with runtime code, always recompile */ ) 6929 { 6930 Safefree(pRExC_state->code_blocks); 6931 return old_re; 6932 } 6933 6934 rx_flags = orig_rx_flags; 6935 6936 if (rx_flags & PMf_FOLD) { 6937 RExC_contains_i = 1; 6938 } 6939 if ( initial_charset == REGEX_DEPENDS_CHARSET 6940 && (RExC_utf8 ||RExC_uni_semantics)) 6941 { 6942 6943 /* Set to use unicode semantics if the pattern is in utf8 and has the 6944 * 'depends' charset specified, as it means unicode when utf8 */ 6945 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET); 6946 } 6947 6948 RExC_precomp = exp; 6949 RExC_precomp_adj = 0; 6950 RExC_flags = rx_flags; 6951 RExC_pm_flags = pm_flags; 6952 6953 if (runtime_code) { 6954 assert(TAINTING_get || !TAINT_get); 6955 if (TAINT_get) 6956 Perl_croak(aTHX_ "Eval-group in insecure regular expression"); 6957 6958 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) { 6959 /* whoops, we have a non-utf8 pattern, whilst run-time code 6960 * got compiled as utf8. Try again with a utf8 pattern */ 6961 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen, 6962 pRExC_state->num_code_blocks); 6963 goto redo_first_pass; 6964 } 6965 } 6966 assert(!pRExC_state->runtime_code_qr); 6967 6968 RExC_sawback = 0; 6969 6970 RExC_seen = 0; 6971 RExC_maxlen = 0; 6972 RExC_in_lookbehind = 0; 6973 RExC_seen_zerolen = *exp == '^' ? -1 : 0; 6974 RExC_extralen = 0; 6975 RExC_override_recoding = 0; 6976 #ifdef EBCDIC 6977 RExC_recode_x_to_native = 0; 6978 #endif 6979 RExC_in_multi_char_class = 0; 6980 6981 /* First pass: determine size, legality. */ 6982 RExC_parse = exp; 6983 RExC_start = RExC_adjusted_start = exp; 6984 RExC_end = exp + plen; 6985 RExC_precomp_end = RExC_end; 6986 RExC_naughty = 0; 6987 RExC_npar = 1; 6988 RExC_nestroot = 0; 6989 RExC_size = 0L; 6990 RExC_emit = (regnode *) &RExC_emit_dummy; 6991 RExC_whilem_seen = 0; 6992 RExC_open_parens = NULL; 6993 RExC_close_parens = NULL; 6994 RExC_end_op = NULL; 6995 RExC_paren_names = NULL; 6996 #ifdef DEBUGGING 6997 RExC_paren_name_list = NULL; 6998 #endif 6999 RExC_recurse = NULL; 7000 RExC_study_chunk_recursed = NULL; 7001 RExC_study_chunk_recursed_bytes= 0; 7002 RExC_recurse_count = 0; 7003 pRExC_state->code_index = 0; 7004 7005 /* This NUL is guaranteed because the pattern comes from an SV*, and the sv 7006 * code makes sure the final byte is an uncounted NUL. But should this 7007 * ever not be the case, lots of things could read beyond the end of the 7008 * buffer: loops like 7009 * while(isFOO(*RExC_parse)) RExC_parse++; 7010 * strchr(RExC_parse, "foo"); 7011 * etc. So it is worth noting. */ 7012 assert(*RExC_end == '\0'); 7013 7014 DEBUG_PARSE_r( 7015 Perl_re_printf( aTHX_ "Starting first pass (sizing)\n"); 7016 RExC_lastnum=0; 7017 RExC_lastparse=NULL; 7018 ); 7019 /* reg may croak on us, not giving us a chance to free 7020 pRExC_state->code_blocks. We cannot SAVEFREEPV it now, as we may 7021 need it to survive as long as the regexp (qr/(?{})/). 7022 We must check that code_blocksv is not already set, because we may 7023 have jumped back to restart the sizing pass. */ 7024 if (pRExC_state->code_blocks && !code_blocksv) { 7025 code_blocksv = newSV_type(SVt_PV); 7026 SAVEFREESV(code_blocksv); 7027 SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks); 7028 SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/ 7029 } 7030 if (reg(pRExC_state, 0, &flags,1) == NULL) { 7031 /* It's possible to write a regexp in ascii that represents Unicode 7032 codepoints outside of the byte range, such as via \x{100}. If we 7033 detect such a sequence we have to convert the entire pattern to utf8 7034 and then recompile, as our sizing calculation will have been based 7035 on 1 byte == 1 character, but we will need to use utf8 to encode 7036 at least some part of the pattern, and therefore must convert the whole 7037 thing. 7038 -- dmq */ 7039 if (flags & RESTART_PASS1) { 7040 if (flags & NEED_UTF8) { 7041 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen, 7042 pRExC_state->num_code_blocks); 7043 } 7044 else { 7045 DEBUG_PARSE_r(Perl_re_printf( aTHX_ 7046 "Need to redo pass 1\n")); 7047 } 7048 7049 goto redo_first_pass; 7050 } 7051 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags); 7052 } 7053 if (code_blocksv) 7054 SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */ 7055 7056 DEBUG_PARSE_r({ 7057 Perl_re_printf( aTHX_ 7058 "Required size %"IVdf" nodes\n" 7059 "Starting second pass (creation)\n", 7060 (IV)RExC_size); 7061 RExC_lastnum=0; 7062 RExC_lastparse=NULL; 7063 }); 7064 7065 /* The first pass could have found things that force Unicode semantics */ 7066 if ((RExC_utf8 || RExC_uni_semantics) 7067 && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET) 7068 { 7069 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET); 7070 } 7071 7072 /* Small enough for pointer-storage convention? 7073 If extralen==0, this means that we will not need long jumps. */ 7074 if (RExC_size >= 0x10000L && RExC_extralen) 7075 RExC_size += RExC_extralen; 7076 else 7077 RExC_extralen = 0; 7078 if (RExC_whilem_seen > 15) 7079 RExC_whilem_seen = 15; 7080 7081 /* Allocate space and zero-initialize. Note, the two step process 7082 of zeroing when in debug mode, thus anything assigned has to 7083 happen after that */ 7084 rx = (REGEXP*) newSV_type(SVt_REGEXP); 7085 r = ReANY(rx); 7086 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), 7087 char, regexp_internal); 7088 if ( r == NULL || ri == NULL ) 7089 FAIL("Regexp out of space"); 7090 #ifdef DEBUGGING 7091 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */ 7092 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), 7093 char); 7094 #else 7095 /* bulk initialize base fields with 0. */ 7096 Zero(ri, sizeof(regexp_internal), char); 7097 #endif 7098 7099 /* non-zero initialization begins here */ 7100 RXi_SET( r, ri ); 7101 r->engine= eng; 7102 r->extflags = rx_flags; 7103 RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK; 7104 7105 if (pm_flags & PMf_IS_QR) { 7106 ri->code_blocks = pRExC_state->code_blocks; 7107 ri->num_code_blocks = pRExC_state->num_code_blocks; 7108 } 7109 else 7110 { 7111 int n; 7112 for (n = 0; n < pRExC_state->num_code_blocks; n++) 7113 if (pRExC_state->code_blocks[n].src_regex) 7114 SAVEFREESV(pRExC_state->code_blocks[n].src_regex); 7115 if(pRExC_state->code_blocks) 7116 SAVEFREEPV(pRExC_state->code_blocks); /* often null */ 7117 } 7118 7119 { 7120 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY); 7121 bool has_charset = (get_regex_charset(r->extflags) 7122 != REGEX_DEPENDS_CHARSET); 7123 7124 /* The caret is output if there are any defaults: if not all the STD 7125 * flags are set, or if no character set specifier is needed */ 7126 bool has_default = 7127 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD) 7128 || ! has_charset); 7129 bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN) 7130 == REG_RUN_ON_COMMENT_SEEN); 7131 U8 reganch = (U8)((r->extflags & RXf_PMf_STD_PMMOD) 7132 >> RXf_PMf_STD_PMMOD_SHIFT); 7133 const char *fptr = STD_PAT_MODS; /*"msixn"*/ 7134 char *p; 7135 7136 /* We output all the necessary flags; we never output a minus, as all 7137 * those are defaults, so are 7138 * covered by the caret */ 7139 const STRLEN wraplen = plen + has_p + has_runon 7140 + has_default /* If needs a caret */ 7141 + PL_bitcount[reganch] /* 1 char for each set standard flag */ 7142 7143 /* If needs a character set specifier */ 7144 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0) 7145 + (sizeof("(?:)") - 1); 7146 7147 /* make sure PL_bitcount bounds not exceeded */ 7148 assert(sizeof(STD_PAT_MODS) <= 8); 7149 7150 Newx(p, wraplen + 1, char); /* +1 for the ending NUL */ 7151 r->xpv_len_u.xpvlenu_pv = p; 7152 if (RExC_utf8) 7153 SvFLAGS(rx) |= SVf_UTF8; 7154 *p++='('; *p++='?'; 7155 7156 /* If a default, cover it using the caret */ 7157 if (has_default) { 7158 *p++= DEFAULT_PAT_MOD; 7159 } 7160 if (has_charset) { 7161 STRLEN len; 7162 const char* const name = get_regex_charset_name(r->extflags, &len); 7163 Copy(name, p, len, char); 7164 p += len; 7165 } 7166 if (has_p) 7167 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/ 7168 { 7169 char ch; 7170 while((ch = *fptr++)) { 7171 if(reganch & 1) 7172 *p++ = ch; 7173 reganch >>= 1; 7174 } 7175 } 7176 7177 *p++ = ':'; 7178 Copy(RExC_precomp, p, plen, char); 7179 assert ((RX_WRAPPED(rx) - p) < 16); 7180 r->pre_prefix = p - RX_WRAPPED(rx); 7181 p += plen; 7182 if (has_runon) 7183 *p++ = '\n'; 7184 *p++ = ')'; 7185 *p = 0; 7186 SvCUR_set(rx, p - RX_WRAPPED(rx)); 7187 } 7188 7189 r->intflags = 0; 7190 r->nparens = RExC_npar - 1; /* set early to validate backrefs */ 7191 7192 /* Useful during FAIL. */ 7193 #ifdef RE_TRACK_PATTERN_OFFSETS 7194 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */ 7195 DEBUG_OFFSETS_r(Perl_re_printf( aTHX_ 7196 "%s %"UVuf" bytes for offset annotations.\n", 7197 ri->u.offsets ? "Got" : "Couldn't get", 7198 (UV)((2*RExC_size+1) * sizeof(U32)))); 7199 #endif 7200 SetProgLen(ri,RExC_size); 7201 RExC_rx_sv = rx; 7202 RExC_rx = r; 7203 RExC_rxi = ri; 7204 7205 /* Second pass: emit code. */ 7206 RExC_flags = rx_flags; /* don't let top level (?i) bleed */ 7207 RExC_pm_flags = pm_flags; 7208 RExC_parse = exp; 7209 RExC_end = exp + plen; 7210 RExC_naughty = 0; 7211 RExC_emit_start = ri->program; 7212 RExC_emit = ri->program; 7213 RExC_emit_bound = ri->program + RExC_size + 1; 7214 pRExC_state->code_index = 0; 7215 7216 *((char*) RExC_emit++) = (char) REG_MAGIC; 7217 /* setup various meta data about recursion, this all requires 7218 * RExC_npar to be correctly set, and a bit later on we clear it */ 7219 if (RExC_seen & REG_RECURSE_SEEN) { 7220 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_ 7221 "%*s%*s Setting up open/close parens\n", 7222 22, "| |", (int)(0 * 2 + 1), "")); 7223 7224 /* setup RExC_open_parens, which holds the address of each 7225 * OPEN tag, and to make things simpler for the 0 index 7226 * the start of the program - this is used later for offsets */ 7227 Newxz(RExC_open_parens, RExC_npar,regnode *); 7228 SAVEFREEPV(RExC_open_parens); 7229 RExC_open_parens[0] = RExC_emit; 7230 7231 /* setup RExC_close_parens, which holds the address of each 7232 * CLOSE tag, and to make things simpler for the 0 index 7233 * the end of the program - this is used later for offsets */ 7234 Newxz(RExC_close_parens, RExC_npar,regnode *); 7235 SAVEFREEPV(RExC_close_parens); 7236 /* we dont know where end op starts yet, so we dont 7237 * need to set RExC_close_parens[0] like we do RExC_open_parens[0] above */ 7238 7239 /* Note, RExC_npar is 1 + the number of parens in a pattern. 7240 * So its 1 if there are no parens. */ 7241 RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) + 7242 ((RExC_npar & 0x07) != 0); 7243 Newx(RExC_study_chunk_recursed, 7244 RExC_study_chunk_recursed_bytes * RExC_npar, U8); 7245 SAVEFREEPV(RExC_study_chunk_recursed); 7246 } 7247 RExC_npar = 1; 7248 if (reg(pRExC_state, 0, &flags,1) == NULL) { 7249 ReREFCNT_dec(rx); 7250 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags); 7251 } 7252 DEBUG_OPTIMISE_r( 7253 Perl_re_printf( aTHX_ "Starting post parse optimization\n"); 7254 ); 7255 7256 /* XXXX To minimize changes to RE engine we always allocate 7257 3-units-long substrs field. */ 7258 Newx(r->substrs, 1, struct reg_substr_data); 7259 if (RExC_recurse_count) { 7260 Newxz(RExC_recurse,RExC_recurse_count,regnode *); 7261 SAVEFREEPV(RExC_recurse); 7262 } 7263 7264 reStudy: 7265 r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0; 7266 DEBUG_r( 7267 RExC_study_chunk_recursed_count= 0; 7268 ); 7269 Zero(r->substrs, 1, struct reg_substr_data); 7270 if (RExC_study_chunk_recursed) { 7271 Zero(RExC_study_chunk_recursed, 7272 RExC_study_chunk_recursed_bytes * RExC_npar, U8); 7273 } 7274 7275 7276 #ifdef TRIE_STUDY_OPT 7277 if (!restudied) { 7278 StructCopy(&zero_scan_data, &data, scan_data_t); 7279 copyRExC_state = RExC_state; 7280 } else { 7281 U32 seen=RExC_seen; 7282 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "Restudying\n")); 7283 7284 RExC_state = copyRExC_state; 7285 if (seen & REG_TOP_LEVEL_BRANCHES_SEEN) 7286 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN; 7287 else 7288 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN; 7289 StructCopy(&zero_scan_data, &data, scan_data_t); 7290 } 7291 #else 7292 StructCopy(&zero_scan_data, &data, scan_data_t); 7293 #endif 7294 7295 /* Dig out information for optimizations. */ 7296 r->extflags = RExC_flags; /* was pm_op */ 7297 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */ 7298 7299 if (UTF) 7300 SvUTF8_on(rx); /* Unicode in it? */ 7301 ri->regstclass = NULL; 7302 if (RExC_naughty >= TOO_NAUGHTY) /* Probably an expensive pattern. */ 7303 r->intflags |= PREGf_NAUGHTY; 7304 scan = ri->program + 1; /* First BRANCH. */ 7305 7306 /* testing for BRANCH here tells us whether there is "must appear" 7307 data in the pattern. If there is then we can use it for optimisations */ 7308 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /* Only one top-level choice. 7309 */ 7310 SSize_t fake; 7311 STRLEN longest_float_length, longest_fixed_length; 7312 regnode_ssc ch_class; /* pointed to by data */ 7313 int stclass_flag; 7314 SSize_t last_close = 0; /* pointed to by data */ 7315 regnode *first= scan; 7316 regnode *first_next= regnext(first); 7317 /* 7318 * Skip introductions and multiplicators >= 1 7319 * so that we can extract the 'meat' of the pattern that must 7320 * match in the large if() sequence following. 7321 * NOTE that EXACT is NOT covered here, as it is normally 7322 * picked up by the optimiser separately. 7323 * 7324 * This is unfortunate as the optimiser isnt handling lookahead 7325 * properly currently. 7326 * 7327 */ 7328 while ((OP(first) == OPEN && (sawopen = 1)) || 7329 /* An OR of *one* alternative - should not happen now. */ 7330 (OP(first) == BRANCH && OP(first_next) != BRANCH) || 7331 /* for now we can't handle lookbehind IFMATCH*/ 7332 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) || 7333 (OP(first) == PLUS) || 7334 (OP(first) == MINMOD) || 7335 /* An {n,m} with n>0 */ 7336 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) || 7337 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END )) 7338 { 7339 /* 7340 * the only op that could be a regnode is PLUS, all the rest 7341 * will be regnode_1 or regnode_2. 7342 * 7343 * (yves doesn't think this is true) 7344 */ 7345 if (OP(first) == PLUS) 7346 sawplus = 1; 7347 else { 7348 if (OP(first) == MINMOD) 7349 sawminmod = 1; 7350 first += regarglen[OP(first)]; 7351 } 7352 first = NEXTOPER(first); 7353 first_next= regnext(first); 7354 } 7355 7356 /* Starting-point info. */ 7357 again: 7358 DEBUG_PEEP("first:",first,0); 7359 /* Ignore EXACT as we deal with it later. */ 7360 if (PL_regkind[OP(first)] == EXACT) { 7361 if (OP(first) == EXACT || OP(first) == EXACTL) 7362 NOOP; /* Empty, get anchored substr later. */ 7363 else 7364 ri->regstclass = first; 7365 } 7366 #ifdef TRIE_STCLASS 7367 else if (PL_regkind[OP(first)] == TRIE && 7368 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0) 7369 { 7370 /* this can happen only on restudy */ 7371 ri->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0); 7372 } 7373 #endif 7374 else if (REGNODE_SIMPLE(OP(first))) 7375 ri->regstclass = first; 7376 else if (PL_regkind[OP(first)] == BOUND || 7377 PL_regkind[OP(first)] == NBOUND) 7378 ri->regstclass = first; 7379 else if (PL_regkind[OP(first)] == BOL) { 7380 r->intflags |= (OP(first) == MBOL 7381 ? PREGf_ANCH_MBOL 7382 : PREGf_ANCH_SBOL); 7383 first = NEXTOPER(first); 7384 goto again; 7385 } 7386 else if (OP(first) == GPOS) { 7387 r->intflags |= PREGf_ANCH_GPOS; 7388 first = NEXTOPER(first); 7389 goto again; 7390 } 7391 else if ((!sawopen || !RExC_sawback) && 7392 !sawlookahead && 7393 (OP(first) == STAR && 7394 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) && 7395 !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks) 7396 { 7397 /* turn .* into ^.* with an implied $*=1 */ 7398 const int type = 7399 (OP(NEXTOPER(first)) == REG_ANY) 7400 ? PREGf_ANCH_MBOL 7401 : PREGf_ANCH_SBOL; 7402 r->intflags |= (type | PREGf_IMPLICIT); 7403 first = NEXTOPER(first); 7404 goto again; 7405 } 7406 if (sawplus && !sawminmod && !sawlookahead 7407 && (!sawopen || !RExC_sawback) 7408 && !pRExC_state->num_code_blocks) /* May examine pos and $& */ 7409 /* x+ must match at the 1st pos of run of x's */ 7410 r->intflags |= PREGf_SKIP; 7411 7412 /* Scan is after the zeroth branch, first is atomic matcher. */ 7413 #ifdef TRIE_STUDY_OPT 7414 DEBUG_PARSE_r( 7415 if (!restudied) 7416 Perl_re_printf( aTHX_ "first at %"IVdf"\n", 7417 (IV)(first - scan + 1)) 7418 ); 7419 #else 7420 DEBUG_PARSE_r( 7421 Perl_re_printf( aTHX_ "first at %"IVdf"\n", 7422 (IV)(first - scan + 1)) 7423 ); 7424 #endif 7425 7426 7427 /* 7428 * If there's something expensive in the r.e., find the 7429 * longest literal string that must appear and make it the 7430 * regmust. Resolve ties in favor of later strings, since 7431 * the regstart check works with the beginning of the r.e. 7432 * and avoiding duplication strengthens checking. Not a 7433 * strong reason, but sufficient in the absence of others. 7434 * [Now we resolve ties in favor of the earlier string if 7435 * it happens that c_offset_min has been invalidated, since the 7436 * earlier string may buy us something the later one won't.] 7437 */ 7438 7439 data.longest_fixed = newSVpvs(""); 7440 data.longest_float = newSVpvs(""); 7441 data.last_found = newSVpvs(""); 7442 data.longest = &(data.longest_fixed); 7443 ENTER_with_name("study_chunk"); 7444 SAVEFREESV(data.longest_fixed); 7445 SAVEFREESV(data.longest_float); 7446 SAVEFREESV(data.last_found); 7447 first = scan; 7448 if (!ri->regstclass) { 7449 ssc_init(pRExC_state, &ch_class); 7450 data.start_class = &ch_class; 7451 stclass_flag = SCF_DO_STCLASS_AND; 7452 } else /* XXXX Check for BOUND? */ 7453 stclass_flag = 0; 7454 data.last_closep = &last_close; 7455 7456 DEBUG_RExC_seen(); 7457 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, 7458 scan + RExC_size, /* Up to end */ 7459 &data, -1, 0, NULL, 7460 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag 7461 | (restudied ? SCF_TRIE_DOING_RESTUDY : 0), 7462 0); 7463 7464 7465 CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk")); 7466 7467 7468 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed) 7469 && data.last_start_min == 0 && data.last_end > 0 7470 && !RExC_seen_zerolen 7471 && !(RExC_seen & REG_VERBARG_SEEN) 7472 && !(RExC_seen & REG_GPOS_SEEN) 7473 ){ 7474 r->extflags |= RXf_CHECK_ALL; 7475 } 7476 scan_commit(pRExC_state, &data,&minlen,0); 7477 7478 longest_float_length = CHR_SVLEN(data.longest_float); 7479 7480 if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */ 7481 && data.offset_fixed == data.offset_float_min 7482 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))) 7483 && S_setup_longest (aTHX_ pRExC_state, 7484 data.longest_float, 7485 &(r->float_utf8), 7486 &(r->float_substr), 7487 &(r->float_end_shift), 7488 data.lookbehind_float, 7489 data.offset_float_min, 7490 data.minlen_float, 7491 longest_float_length, 7492 cBOOL(data.flags & SF_FL_BEFORE_EOL), 7493 cBOOL(data.flags & SF_FL_BEFORE_MEOL))) 7494 { 7495 r->float_min_offset = data.offset_float_min - data.lookbehind_float; 7496 r->float_max_offset = data.offset_float_max; 7497 if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */ 7498 r->float_max_offset -= data.lookbehind_float; 7499 SvREFCNT_inc_simple_void_NN(data.longest_float); 7500 } 7501 else { 7502 r->float_substr = r->float_utf8 = NULL; 7503 longest_float_length = 0; 7504 } 7505 7506 longest_fixed_length = CHR_SVLEN(data.longest_fixed); 7507 7508 if (S_setup_longest (aTHX_ pRExC_state, 7509 data.longest_fixed, 7510 &(r->anchored_utf8), 7511 &(r->anchored_substr), 7512 &(r->anchored_end_shift), 7513 data.lookbehind_fixed, 7514 data.offset_fixed, 7515 data.minlen_fixed, 7516 longest_fixed_length, 7517 cBOOL(data.flags & SF_FIX_BEFORE_EOL), 7518 cBOOL(data.flags & SF_FIX_BEFORE_MEOL))) 7519 { 7520 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed; 7521 SvREFCNT_inc_simple_void_NN(data.longest_fixed); 7522 } 7523 else { 7524 r->anchored_substr = r->anchored_utf8 = NULL; 7525 longest_fixed_length = 0; 7526 } 7527 LEAVE_with_name("study_chunk"); 7528 7529 if (ri->regstclass 7530 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY)) 7531 ri->regstclass = NULL; 7532 7533 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset) 7534 && stclass_flag 7535 && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING) 7536 && is_ssc_worth_it(pRExC_state, data.start_class)) 7537 { 7538 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f")); 7539 7540 ssc_finalize(pRExC_state, data.start_class); 7541 7542 Newx(RExC_rxi->data->data[n], 1, regnode_ssc); 7543 StructCopy(data.start_class, 7544 (regnode_ssc*)RExC_rxi->data->data[n], 7545 regnode_ssc); 7546 ri->regstclass = (regnode*)RExC_rxi->data->data[n]; 7547 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */ 7548 DEBUG_COMPILE_r({ SV *sv = sv_newmortal(); 7549 regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state); 7550 Perl_re_printf( aTHX_ 7551 "synthetic stclass \"%s\".\n", 7552 SvPVX_const(sv));}); 7553 data.start_class = NULL; 7554 } 7555 7556 /* A temporary algorithm prefers floated substr to fixed one to dig 7557 * more info. */ 7558 if (longest_fixed_length > longest_float_length) { 7559 r->substrs->check_ix = 0; 7560 r->check_end_shift = r->anchored_end_shift; 7561 r->check_substr = r->anchored_substr; 7562 r->check_utf8 = r->anchored_utf8; 7563 r->check_offset_min = r->check_offset_max = r->anchored_offset; 7564 if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS)) 7565 r->intflags |= PREGf_NOSCAN; 7566 } 7567 else { 7568 r->substrs->check_ix = 1; 7569 r->check_end_shift = r->float_end_shift; 7570 r->check_substr = r->float_substr; 7571 r->check_utf8 = r->float_utf8; 7572 r->check_offset_min = r->float_min_offset; 7573 r->check_offset_max = r->float_max_offset; 7574 } 7575 if ((r->check_substr || r->check_utf8) ) { 7576 r->extflags |= RXf_USE_INTUIT; 7577 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8)) 7578 r->extflags |= RXf_INTUIT_TAIL; 7579 } 7580 r->substrs->data[0].max_offset = r->substrs->data[0].min_offset; 7581 7582 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere) 7583 if ( (STRLEN)minlen < longest_float_length ) 7584 minlen= longest_float_length; 7585 if ( (STRLEN)minlen < longest_fixed_length ) 7586 minlen= longest_fixed_length; 7587 */ 7588 } 7589 else { 7590 /* Several toplevels. Best we can is to set minlen. */ 7591 SSize_t fake; 7592 regnode_ssc ch_class; 7593 SSize_t last_close = 0; 7594 7595 DEBUG_PARSE_r(Perl_re_printf( aTHX_ "\nMulti Top Level\n")); 7596 7597 scan = ri->program + 1; 7598 ssc_init(pRExC_state, &ch_class); 7599 data.start_class = &ch_class; 7600 data.last_closep = &last_close; 7601 7602 DEBUG_RExC_seen(); 7603 minlen = study_chunk(pRExC_state, 7604 &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL, 7605 SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied 7606 ? SCF_TRIE_DOING_RESTUDY 7607 : 0), 7608 0); 7609 7610 CHECK_RESTUDY_GOTO_butfirst(NOOP); 7611 7612 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8 7613 = r->float_substr = r->float_utf8 = NULL; 7614 7615 if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING) 7616 && is_ssc_worth_it(pRExC_state, data.start_class)) 7617 { 7618 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f")); 7619 7620 ssc_finalize(pRExC_state, data.start_class); 7621 7622 Newx(RExC_rxi->data->data[n], 1, regnode_ssc); 7623 StructCopy(data.start_class, 7624 (regnode_ssc*)RExC_rxi->data->data[n], 7625 regnode_ssc); 7626 ri->regstclass = (regnode*)RExC_rxi->data->data[n]; 7627 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */ 7628 DEBUG_COMPILE_r({ SV* sv = sv_newmortal(); 7629 regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state); 7630 Perl_re_printf( aTHX_ 7631 "synthetic stclass \"%s\".\n", 7632 SvPVX_const(sv));}); 7633 data.start_class = NULL; 7634 } 7635 } 7636 7637 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) { 7638 r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN; 7639 r->maxlen = REG_INFTY; 7640 } 7641 else { 7642 r->maxlen = RExC_maxlen; 7643 } 7644 7645 /* Guard against an embedded (?=) or (?<=) with a longer minlen than 7646 the "real" pattern. */ 7647 DEBUG_OPTIMISE_r({ 7648 Perl_re_printf( aTHX_ "minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%"IVdf"\n", 7649 (IV)minlen, (IV)r->minlen, (IV)RExC_maxlen); 7650 }); 7651 r->minlenret = minlen; 7652 if (r->minlen < minlen) 7653 r->minlen = minlen; 7654 7655 if (RExC_seen & REG_RECURSE_SEEN ) { 7656 r->intflags |= PREGf_RECURSE_SEEN; 7657 Newxz(r->recurse_locinput, r->nparens + 1, char *); 7658 } 7659 if (RExC_seen & REG_GPOS_SEEN) 7660 r->intflags |= PREGf_GPOS_SEEN; 7661 if (RExC_seen & REG_LOOKBEHIND_SEEN) 7662 r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the 7663 lookbehind */ 7664 if (pRExC_state->num_code_blocks) 7665 r->extflags |= RXf_EVAL_SEEN; 7666 if (RExC_seen & REG_VERBARG_SEEN) 7667 { 7668 r->intflags |= PREGf_VERBARG_SEEN; 7669 r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */ 7670 } 7671 if (RExC_seen & REG_CUTGROUP_SEEN) 7672 r->intflags |= PREGf_CUTGROUP_SEEN; 7673 if (pm_flags & PMf_USE_RE_EVAL) 7674 r->intflags |= PREGf_USE_RE_EVAL; 7675 if (RExC_paren_names) 7676 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names)); 7677 else 7678 RXp_PAREN_NAMES(r) = NULL; 7679 7680 /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED 7681 * so it can be used in pp.c */ 7682 if (r->intflags & PREGf_ANCH) 7683 r->extflags |= RXf_IS_ANCHORED; 7684 7685 7686 { 7687 /* this is used to identify "special" patterns that might result 7688 * in Perl NOT calling the regex engine and instead doing the match "itself", 7689 * particularly special cases in split//. By having the regex compiler 7690 * do this pattern matching at a regop level (instead of by inspecting the pattern) 7691 * we avoid weird issues with equivalent patterns resulting in different behavior, 7692 * AND we allow non Perl engines to get the same optimizations by the setting the 7693 * flags appropriately - Yves */ 7694 regnode *first = ri->program + 1; 7695 U8 fop = OP(first); 7696 regnode *next = regnext(first); 7697 U8 nop = OP(next); 7698 7699 if (PL_regkind[fop] == NOTHING && nop == END) 7700 r->extflags |= RXf_NULL; 7701 else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END) 7702 /* when fop is SBOL first->flags will be true only when it was 7703 * produced by parsing /\A/, and not when parsing /^/. This is 7704 * very important for the split code as there we want to 7705 * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m. 7706 * See rt #122761 for more details. -- Yves */ 7707 r->extflags |= RXf_START_ONLY; 7708 else if (fop == PLUS 7709 && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE 7710 && nop == END) 7711 r->extflags |= RXf_WHITE; 7712 else if ( r->extflags & RXf_SPLIT 7713 && (fop == EXACT || fop == EXACTL) 7714 && STR_LEN(first) == 1 7715 && *(STRING(first)) == ' ' 7716 && nop == END ) 7717 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); 7718 7719 } 7720 7721 if (RExC_contains_locale) { 7722 RXp_EXTFLAGS(r) |= RXf_TAINTED; 7723 } 7724 7725 #ifdef DEBUGGING 7726 if (RExC_paren_names) { 7727 ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a")); 7728 ri->data->data[ri->name_list_idx] 7729 = (void*)SvREFCNT_inc(RExC_paren_name_list); 7730 } else 7731 #endif 7732 ri->name_list_idx = 0; 7733 7734 while ( RExC_recurse_count > 0 ) { 7735 const regnode *scan = RExC_recurse[ --RExC_recurse_count ]; 7736 ARG2L_SET( scan, RExC_open_parens[ARG(scan)] - scan ); 7737 } 7738 7739 Newxz(r->offs, RExC_npar, regexp_paren_pair); 7740 /* assume we don't need to swap parens around before we match */ 7741 DEBUG_TEST_r({ 7742 Perl_re_printf( aTHX_ "study_chunk_recursed_count: %lu\n", 7743 (unsigned long)RExC_study_chunk_recursed_count); 7744 }); 7745 DEBUG_DUMP_r({ 7746 DEBUG_RExC_seen(); 7747 Perl_re_printf( aTHX_ "Final program:\n"); 7748 regdump(r); 7749 }); 7750 #ifdef RE_TRACK_PATTERN_OFFSETS 7751 DEBUG_OFFSETS_r(if (ri->u.offsets) { 7752 const STRLEN len = ri->u.offsets[0]; 7753 STRLEN i; 7754 GET_RE_DEBUG_FLAGS_DECL; 7755 Perl_re_printf( aTHX_ 7756 "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]); 7757 for (i = 1; i <= len; i++) { 7758 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2]) 7759 Perl_re_printf( aTHX_ "%"UVuf":%"UVuf"[%"UVuf"] ", 7760 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]); 7761 } 7762 Perl_re_printf( aTHX_ "\n"); 7763 }); 7764 #endif 7765 7766 #ifdef USE_ITHREADS 7767 /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated 7768 * by setting the regexp SV to readonly-only instead. If the 7769 * pattern's been recompiled, the USEDness should remain. */ 7770 if (old_re && SvREADONLY(old_re)) 7771 SvREADONLY_on(rx); 7772 #endif 7773 return rx; 7774 } 7775 7776 7777 SV* 7778 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value, 7779 const U32 flags) 7780 { 7781 PERL_ARGS_ASSERT_REG_NAMED_BUFF; 7782 7783 PERL_UNUSED_ARG(value); 7784 7785 if (flags & RXapif_FETCH) { 7786 return reg_named_buff_fetch(rx, key, flags); 7787 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) { 7788 Perl_croak_no_modify(); 7789 return NULL; 7790 } else if (flags & RXapif_EXISTS) { 7791 return reg_named_buff_exists(rx, key, flags) 7792 ? &PL_sv_yes 7793 : &PL_sv_no; 7794 } else if (flags & RXapif_REGNAMES) { 7795 return reg_named_buff_all(rx, flags); 7796 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) { 7797 return reg_named_buff_scalar(rx, flags); 7798 } else { 7799 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags); 7800 return NULL; 7801 } 7802 } 7803 7804 SV* 7805 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey, 7806 const U32 flags) 7807 { 7808 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER; 7809 PERL_UNUSED_ARG(lastkey); 7810 7811 if (flags & RXapif_FIRSTKEY) 7812 return reg_named_buff_firstkey(rx, flags); 7813 else if (flags & RXapif_NEXTKEY) 7814 return reg_named_buff_nextkey(rx, flags); 7815 else { 7816 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", 7817 (int)flags); 7818 return NULL; 7819 } 7820 } 7821 7822 SV* 7823 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv, 7824 const U32 flags) 7825 { 7826 AV *retarray = NULL; 7827 SV *ret; 7828 struct regexp *const rx = ReANY(r); 7829 7830 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH; 7831 7832 if (flags & RXapif_ALL) 7833 retarray=newAV(); 7834 7835 if (rx && RXp_PAREN_NAMES(rx)) { 7836 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 ); 7837 if (he_str) { 7838 IV i; 7839 SV* sv_dat=HeVAL(he_str); 7840 I32 *nums=(I32*)SvPVX(sv_dat); 7841 for ( i=0; i<SvIVX(sv_dat); i++ ) { 7842 if ((I32)(rx->nparens) >= nums[i] 7843 && rx->offs[nums[i]].start != -1 7844 && rx->offs[nums[i]].end != -1) 7845 { 7846 ret = newSVpvs(""); 7847 CALLREG_NUMBUF_FETCH(r,nums[i],ret); 7848 if (!retarray) 7849 return ret; 7850 } else { 7851 if (retarray) 7852 ret = newSVsv(&PL_sv_undef); 7853 } 7854 if (retarray) 7855 av_push(retarray, ret); 7856 } 7857 if (retarray) 7858 return newRV_noinc(MUTABLE_SV(retarray)); 7859 } 7860 } 7861 return NULL; 7862 } 7863 7864 bool 7865 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key, 7866 const U32 flags) 7867 { 7868 struct regexp *const rx = ReANY(r); 7869 7870 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS; 7871 7872 if (rx && RXp_PAREN_NAMES(rx)) { 7873 if (flags & RXapif_ALL) { 7874 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0); 7875 } else { 7876 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags); 7877 if (sv) { 7878 SvREFCNT_dec_NN(sv); 7879 return TRUE; 7880 } else { 7881 return FALSE; 7882 } 7883 } 7884 } else { 7885 return FALSE; 7886 } 7887 } 7888 7889 SV* 7890 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags) 7891 { 7892 struct regexp *const rx = ReANY(r); 7893 7894 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY; 7895 7896 if ( rx && RXp_PAREN_NAMES(rx) ) { 7897 (void)hv_iterinit(RXp_PAREN_NAMES(rx)); 7898 7899 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY); 7900 } else { 7901 return FALSE; 7902 } 7903 } 7904 7905 SV* 7906 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags) 7907 { 7908 struct regexp *const rx = ReANY(r); 7909 GET_RE_DEBUG_FLAGS_DECL; 7910 7911 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY; 7912 7913 if (rx && RXp_PAREN_NAMES(rx)) { 7914 HV *hv = RXp_PAREN_NAMES(rx); 7915 HE *temphe; 7916 while ( (temphe = hv_iternext_flags(hv,0)) ) { 7917 IV i; 7918 IV parno = 0; 7919 SV* sv_dat = HeVAL(temphe); 7920 I32 *nums = (I32*)SvPVX(sv_dat); 7921 for ( i = 0; i < SvIVX(sv_dat); i++ ) { 7922 if ((I32)(rx->lastparen) >= nums[i] && 7923 rx->offs[nums[i]].start != -1 && 7924 rx->offs[nums[i]].end != -1) 7925 { 7926 parno = nums[i]; 7927 break; 7928 } 7929 } 7930 if (parno || flags & RXapif_ALL) { 7931 return newSVhek(HeKEY_hek(temphe)); 7932 } 7933 } 7934 } 7935 return NULL; 7936 } 7937 7938 SV* 7939 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags) 7940 { 7941 SV *ret; 7942 AV *av; 7943 SSize_t length; 7944 struct regexp *const rx = ReANY(r); 7945 7946 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR; 7947 7948 if (rx && RXp_PAREN_NAMES(rx)) { 7949 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) { 7950 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx))); 7951 } else if (flags & RXapif_ONE) { 7952 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES)); 7953 av = MUTABLE_AV(SvRV(ret)); 7954 length = av_tindex(av); 7955 SvREFCNT_dec_NN(ret); 7956 return newSViv(length + 1); 7957 } else { 7958 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", 7959 (int)flags); 7960 return NULL; 7961 } 7962 } 7963 return &PL_sv_undef; 7964 } 7965 7966 SV* 7967 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags) 7968 { 7969 struct regexp *const rx = ReANY(r); 7970 AV *av = newAV(); 7971 7972 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL; 7973 7974 if (rx && RXp_PAREN_NAMES(rx)) { 7975 HV *hv= RXp_PAREN_NAMES(rx); 7976 HE *temphe; 7977 (void)hv_iterinit(hv); 7978 while ( (temphe = hv_iternext_flags(hv,0)) ) { 7979 IV i; 7980 IV parno = 0; 7981 SV* sv_dat = HeVAL(temphe); 7982 I32 *nums = (I32*)SvPVX(sv_dat); 7983 for ( i = 0; i < SvIVX(sv_dat); i++ ) { 7984 if ((I32)(rx->lastparen) >= nums[i] && 7985 rx->offs[nums[i]].start != -1 && 7986 rx->offs[nums[i]].end != -1) 7987 { 7988 parno = nums[i]; 7989 break; 7990 } 7991 } 7992 if (parno || flags & RXapif_ALL) { 7993 av_push(av, newSVhek(HeKEY_hek(temphe))); 7994 } 7995 } 7996 } 7997 7998 return newRV_noinc(MUTABLE_SV(av)); 7999 } 8000 8001 void 8002 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, 8003 SV * const sv) 8004 { 8005 struct regexp *const rx = ReANY(r); 8006 char *s = NULL; 8007 SSize_t i = 0; 8008 SSize_t s1, t1; 8009 I32 n = paren; 8010 8011 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH; 8012 8013 if ( n == RX_BUFF_IDX_CARET_PREMATCH 8014 || n == RX_BUFF_IDX_CARET_FULLMATCH 8015 || n == RX_BUFF_IDX_CARET_POSTMATCH 8016 ) 8017 { 8018 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY); 8019 if (!keepcopy) { 8020 /* on something like 8021 * $r = qr/.../; 8022 * /$qr/p; 8023 * the KEEPCOPY is set on the PMOP rather than the regex */ 8024 if (PL_curpm && r == PM_GETRE(PL_curpm)) 8025 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY); 8026 } 8027 if (!keepcopy) 8028 goto ret_undef; 8029 } 8030 8031 if (!rx->subbeg) 8032 goto ret_undef; 8033 8034 if (n == RX_BUFF_IDX_CARET_FULLMATCH) 8035 /* no need to distinguish between them any more */ 8036 n = RX_BUFF_IDX_FULLMATCH; 8037 8038 if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH) 8039 && rx->offs[0].start != -1) 8040 { 8041 /* $`, ${^PREMATCH} */ 8042 i = rx->offs[0].start; 8043 s = rx->subbeg; 8044 } 8045 else 8046 if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH) 8047 && rx->offs[0].end != -1) 8048 { 8049 /* $', ${^POSTMATCH} */ 8050 s = rx->subbeg - rx->suboffset + rx->offs[0].end; 8051 i = rx->sublen + rx->suboffset - rx->offs[0].end; 8052 } 8053 else 8054 if ( 0 <= n && n <= (I32)rx->nparens && 8055 (s1 = rx->offs[n].start) != -1 && 8056 (t1 = rx->offs[n].end) != -1) 8057 { 8058 /* $&, ${^MATCH}, $1 ... */ 8059 i = t1 - s1; 8060 s = rx->subbeg + s1 - rx->suboffset; 8061 } else { 8062 goto ret_undef; 8063 } 8064 8065 assert(s >= rx->subbeg); 8066 assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) ); 8067 if (i >= 0) { 8068 #ifdef NO_TAINT_SUPPORT 8069 sv_setpvn(sv, s, i); 8070 #else 8071 const int oldtainted = TAINT_get; 8072 TAINT_NOT; 8073 sv_setpvn(sv, s, i); 8074 TAINT_set(oldtainted); 8075 #endif 8076 if (RXp_MATCH_UTF8(rx)) 8077 SvUTF8_on(sv); 8078 else 8079 SvUTF8_off(sv); 8080 if (TAINTING_get) { 8081 if (RXp_MATCH_TAINTED(rx)) { 8082 if (SvTYPE(sv) >= SVt_PVMG) { 8083 MAGIC* const mg = SvMAGIC(sv); 8084 MAGIC* mgt; 8085 TAINT; 8086 SvMAGIC_set(sv, mg->mg_moremagic); 8087 SvTAINT(sv); 8088 if ((mgt = SvMAGIC(sv))) { 8089 mg->mg_moremagic = mgt; 8090 SvMAGIC_set(sv, mg); 8091 } 8092 } else { 8093 TAINT; 8094 SvTAINT(sv); 8095 } 8096 } else 8097 SvTAINTED_off(sv); 8098 } 8099 } else { 8100 ret_undef: 8101 sv_setsv(sv,&PL_sv_undef); 8102 return; 8103 } 8104 } 8105 8106 void 8107 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren, 8108 SV const * const value) 8109 { 8110 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE; 8111 8112 PERL_UNUSED_ARG(rx); 8113 PERL_UNUSED_ARG(paren); 8114 PERL_UNUSED_ARG(value); 8115 8116 if (!PL_localizing) 8117 Perl_croak_no_modify(); 8118 } 8119 8120 I32 8121 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, 8122 const I32 paren) 8123 { 8124 struct regexp *const rx = ReANY(r); 8125 I32 i; 8126 I32 s1, t1; 8127 8128 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH; 8129 8130 if ( paren == RX_BUFF_IDX_CARET_PREMATCH 8131 || paren == RX_BUFF_IDX_CARET_FULLMATCH 8132 || paren == RX_BUFF_IDX_CARET_POSTMATCH 8133 ) 8134 { 8135 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY); 8136 if (!keepcopy) { 8137 /* on something like 8138 * $r = qr/.../; 8139 * /$qr/p; 8140 * the KEEPCOPY is set on the PMOP rather than the regex */ 8141 if (PL_curpm && r == PM_GETRE(PL_curpm)) 8142 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY); 8143 } 8144 if (!keepcopy) 8145 goto warn_undef; 8146 } 8147 8148 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */ 8149 switch (paren) { 8150 case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */ 8151 case RX_BUFF_IDX_PREMATCH: /* $` */ 8152 if (rx->offs[0].start != -1) { 8153 i = rx->offs[0].start; 8154 if (i > 0) { 8155 s1 = 0; 8156 t1 = i; 8157 goto getlen; 8158 } 8159 } 8160 return 0; 8161 8162 case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */ 8163 case RX_BUFF_IDX_POSTMATCH: /* $' */ 8164 if (rx->offs[0].end != -1) { 8165 i = rx->sublen - rx->offs[0].end; 8166 if (i > 0) { 8167 s1 = rx->offs[0].end; 8168 t1 = rx->sublen; 8169 goto getlen; 8170 } 8171 } 8172 return 0; 8173 8174 default: /* $& / ${^MATCH}, $1, $2, ... */ 8175 if (paren <= (I32)rx->nparens && 8176 (s1 = rx->offs[paren].start) != -1 && 8177 (t1 = rx->offs[paren].end) != -1) 8178 { 8179 i = t1 - s1; 8180 goto getlen; 8181 } else { 8182 warn_undef: 8183 if (ckWARN(WARN_UNINITIALIZED)) 8184 report_uninit((const SV *)sv); 8185 return 0; 8186 } 8187 } 8188 getlen: 8189 if (i > 0 && RXp_MATCH_UTF8(rx)) { 8190 const char * const s = rx->subbeg - rx->suboffset + s1; 8191 const U8 *ep; 8192 STRLEN el; 8193 8194 i = t1 - s1; 8195 if (is_utf8_string_loclen((U8*)s, i, &ep, &el)) 8196 i = el; 8197 } 8198 return i; 8199 } 8200 8201 SV* 8202 Perl_reg_qr_package(pTHX_ REGEXP * const rx) 8203 { 8204 PERL_ARGS_ASSERT_REG_QR_PACKAGE; 8205 PERL_UNUSED_ARG(rx); 8206 if (0) 8207 return NULL; 8208 else 8209 return newSVpvs("Regexp"); 8210 } 8211 8212 /* Scans the name of a named buffer from the pattern. 8213 * If flags is REG_RSN_RETURN_NULL returns null. 8214 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name 8215 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding 8216 * to the parsed name as looked up in the RExC_paren_names hash. 8217 * If there is an error throws a vFAIL().. type exception. 8218 */ 8219 8220 #define REG_RSN_RETURN_NULL 0 8221 #define REG_RSN_RETURN_NAME 1 8222 #define REG_RSN_RETURN_DATA 2 8223 8224 STATIC SV* 8225 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) 8226 { 8227 char *name_start = RExC_parse; 8228 8229 PERL_ARGS_ASSERT_REG_SCAN_NAME; 8230 8231 assert (RExC_parse <= RExC_end); 8232 if (RExC_parse == RExC_end) NOOP; 8233 else if (isIDFIRST_lazy_if(RExC_parse, UTF)) { 8234 /* Note that the code here assumes well-formed UTF-8. Skip IDFIRST by 8235 * using do...while */ 8236 if (UTF) 8237 do { 8238 RExC_parse += UTF8SKIP(RExC_parse); 8239 } while (isWORDCHAR_utf8((U8*)RExC_parse)); 8240 else 8241 do { 8242 RExC_parse++; 8243 } while (isWORDCHAR(*RExC_parse)); 8244 } else { 8245 RExC_parse++; /* so the <- from the vFAIL is after the offending 8246 character */ 8247 vFAIL("Group name must start with a non-digit word character"); 8248 } 8249 if ( flags ) { 8250 SV* sv_name 8251 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start), 8252 SVs_TEMP | (UTF ? SVf_UTF8 : 0)); 8253 if ( flags == REG_RSN_RETURN_NAME) 8254 return sv_name; 8255 else if (flags==REG_RSN_RETURN_DATA) { 8256 HE *he_str = NULL; 8257 SV *sv_dat = NULL; 8258 if ( ! sv_name ) /* should not happen*/ 8259 Perl_croak(aTHX_ "panic: no svname in reg_scan_name"); 8260 if (RExC_paren_names) 8261 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 ); 8262 if ( he_str ) 8263 sv_dat = HeVAL(he_str); 8264 if ( ! sv_dat ) 8265 vFAIL("Reference to nonexistent named group"); 8266 return sv_dat; 8267 } 8268 else { 8269 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name", 8270 (unsigned long) flags); 8271 } 8272 NOT_REACHED; /* NOTREACHED */ 8273 } 8274 return NULL; 8275 } 8276 8277 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \ 8278 int num; \ 8279 if (RExC_lastparse!=RExC_parse) { \ 8280 Perl_re_printf( aTHX_ "%s", \ 8281 Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse, \ 8282 RExC_end - RExC_parse, 16, \ 8283 "", "", \ 8284 PERL_PV_ESCAPE_UNI_DETECT | \ 8285 PERL_PV_PRETTY_ELLIPSES | \ 8286 PERL_PV_PRETTY_LTGT | \ 8287 PERL_PV_ESCAPE_RE | \ 8288 PERL_PV_PRETTY_EXACTSIZE \ 8289 ) \ 8290 ); \ 8291 } else \ 8292 Perl_re_printf( aTHX_ "%16s",""); \ 8293 \ 8294 if (SIZE_ONLY) \ 8295 num = RExC_size + 1; \ 8296 else \ 8297 num=REG_NODE_NUM(RExC_emit); \ 8298 if (RExC_lastnum!=num) \ 8299 Perl_re_printf( aTHX_ "|%4d",num); \ 8300 else \ 8301 Perl_re_printf( aTHX_ "|%4s",""); \ 8302 Perl_re_printf( aTHX_ "|%*s%-4s", \ 8303 (int)((depth*2)), "", \ 8304 (funcname) \ 8305 ); \ 8306 RExC_lastnum=num; \ 8307 RExC_lastparse=RExC_parse; \ 8308 }) 8309 8310 8311 8312 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \ 8313 DEBUG_PARSE_MSG((funcname)); \ 8314 Perl_re_printf( aTHX_ "%4s","\n"); \ 8315 }) 8316 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({\ 8317 DEBUG_PARSE_MSG((funcname)); \ 8318 Perl_re_printf( aTHX_ fmt "\n",args); \ 8319 }) 8320 8321 /* This section of code defines the inversion list object and its methods. The 8322 * interfaces are highly subject to change, so as much as possible is static to 8323 * this file. An inversion list is here implemented as a malloc'd C UV array 8324 * as an SVt_INVLIST scalar. 8325 * 8326 * An inversion list for Unicode is an array of code points, sorted by ordinal 8327 * number. The zeroth element is the first code point in the list. The 1th 8328 * element is the first element beyond that not in the list. In other words, 8329 * the first range is 8330 * invlist[0]..(invlist[1]-1) 8331 * The other ranges follow. Thus every element whose index is divisible by two 8332 * marks the beginning of a range that is in the list, and every element not 8333 * divisible by two marks the beginning of a range not in the list. A single 8334 * element inversion list that contains the single code point N generally 8335 * consists of two elements 8336 * invlist[0] == N 8337 * invlist[1] == N+1 8338 * (The exception is when N is the highest representable value on the 8339 * machine, in which case the list containing just it would be a single 8340 * element, itself. By extension, if the last range in the list extends to 8341 * infinity, then the first element of that range will be in the inversion list 8342 * at a position that is divisible by two, and is the final element in the 8343 * list.) 8344 * Taking the complement (inverting) an inversion list is quite simple, if the 8345 * first element is 0, remove it; otherwise add a 0 element at the beginning. 8346 * This implementation reserves an element at the beginning of each inversion 8347 * list to always contain 0; there is an additional flag in the header which 8348 * indicates if the list begins at the 0, or is offset to begin at the next 8349 * element. 8350 * 8351 * More about inversion lists can be found in "Unicode Demystified" 8352 * Chapter 13 by Richard Gillam, published by Addison-Wesley. 8353 * More will be coming when functionality is added later. 8354 * 8355 * The inversion list data structure is currently implemented as an SV pointing 8356 * to an array of UVs that the SV thinks are bytes. This allows us to have an 8357 * array of UV whose memory management is automatically handled by the existing 8358 * facilities for SV's. 8359 * 8360 * Some of the methods should always be private to the implementation, and some 8361 * should eventually be made public */ 8362 8363 /* The header definitions are in F<invlist_inline.h> */ 8364 8365 PERL_STATIC_INLINE UV* 8366 S__invlist_array_init(SV* const invlist, const bool will_have_0) 8367 { 8368 /* Returns a pointer to the first element in the inversion list's array. 8369 * This is called upon initialization of an inversion list. Where the 8370 * array begins depends on whether the list has the code point U+0000 in it 8371 * or not. The other parameter tells it whether the code that follows this 8372 * call is about to put a 0 in the inversion list or not. The first 8373 * element is either the element reserved for 0, if TRUE, or the element 8374 * after it, if FALSE */ 8375 8376 bool* offset = get_invlist_offset_addr(invlist); 8377 UV* zero_addr = (UV *) SvPVX(invlist); 8378 8379 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT; 8380 8381 /* Must be empty */ 8382 assert(! _invlist_len(invlist)); 8383 8384 *zero_addr = 0; 8385 8386 /* 1^1 = 0; 1^0 = 1 */ 8387 *offset = 1 ^ will_have_0; 8388 return zero_addr + *offset; 8389 } 8390 8391 PERL_STATIC_INLINE void 8392 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset) 8393 { 8394 /* Sets the current number of elements stored in the inversion list. 8395 * Updates SvCUR correspondingly */ 8396 PERL_UNUSED_CONTEXT; 8397 PERL_ARGS_ASSERT_INVLIST_SET_LEN; 8398 8399 assert(SvTYPE(invlist) == SVt_INVLIST); 8400 8401 SvCUR_set(invlist, 8402 (len == 0) 8403 ? 0 8404 : TO_INTERNAL_SIZE(len + offset)); 8405 assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist)); 8406 } 8407 8408 #ifndef PERL_IN_XSUB_RE 8409 8410 STATIC void 8411 S_invlist_replace_list_destroys_src(pTHX_ SV * dest, SV * src) 8412 { 8413 /* Replaces the inversion list in 'src' with the one in 'dest'. It steals 8414 * the list from 'src', so 'src' is made to have a NULL list. This is 8415 * similar to what SvSetMagicSV() would do, if it were implemented on 8416 * inversion lists, though this routine avoids a copy */ 8417 8418 const UV src_len = _invlist_len(src); 8419 const bool src_offset = *get_invlist_offset_addr(src); 8420 const STRLEN src_byte_len = SvLEN(src); 8421 char * array = SvPVX(src); 8422 8423 const int oldtainted = TAINT_get; 8424 8425 PERL_ARGS_ASSERT_INVLIST_REPLACE_LIST_DESTROYS_SRC; 8426 8427 assert(SvTYPE(src) == SVt_INVLIST); 8428 assert(SvTYPE(dest) == SVt_INVLIST); 8429 assert(! invlist_is_iterating(src)); 8430 assert(SvCUR(src) == 0 || SvCUR(src) < SvLEN(src)); 8431 8432 /* Make sure it ends in the right place with a NUL, as our inversion list 8433 * manipulations aren't careful to keep this true, but sv_usepvn_flags() 8434 * asserts it */ 8435 array[src_byte_len - 1] = '\0'; 8436 8437 TAINT_NOT; /* Otherwise it breaks */ 8438 sv_usepvn_flags(dest, 8439 (char *) array, 8440 src_byte_len - 1, 8441 8442 /* This flag is documented to cause a copy to be avoided */ 8443 SV_HAS_TRAILING_NUL); 8444 TAINT_set(oldtainted); 8445 SvPV_set(src, 0); 8446 SvLEN_set(src, 0); 8447 SvCUR_set(src, 0); 8448 8449 /* Finish up copying over the other fields in an inversion list */ 8450 *get_invlist_offset_addr(dest) = src_offset; 8451 invlist_set_len(dest, src_len, src_offset); 8452 *get_invlist_previous_index_addr(dest) = 0; 8453 invlist_iterfinish(dest); 8454 } 8455 8456 PERL_STATIC_INLINE IV* 8457 S_get_invlist_previous_index_addr(SV* invlist) 8458 { 8459 /* Return the address of the IV that is reserved to hold the cached index 8460 * */ 8461 PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR; 8462 8463 assert(SvTYPE(invlist) == SVt_INVLIST); 8464 8465 return &(((XINVLIST*) SvANY(invlist))->prev_index); 8466 } 8467 8468 PERL_STATIC_INLINE IV 8469 S_invlist_previous_index(SV* const invlist) 8470 { 8471 /* Returns cached index of previous search */ 8472 8473 PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX; 8474 8475 return *get_invlist_previous_index_addr(invlist); 8476 } 8477 8478 PERL_STATIC_INLINE void 8479 S_invlist_set_previous_index(SV* const invlist, const IV index) 8480 { 8481 /* Caches <index> for later retrieval */ 8482 8483 PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX; 8484 8485 assert(index == 0 || index < (int) _invlist_len(invlist)); 8486 8487 *get_invlist_previous_index_addr(invlist) = index; 8488 } 8489 8490 PERL_STATIC_INLINE void 8491 S_invlist_trim(SV* invlist) 8492 { 8493 /* Free the not currently-being-used space in an inversion list */ 8494 8495 /* But don't free up the space needed for the 0 UV that is always at the 8496 * beginning of the list, nor the trailing NUL */ 8497 const UV min_size = TO_INTERNAL_SIZE(1) + 1; 8498 8499 PERL_ARGS_ASSERT_INVLIST_TRIM; 8500 8501 assert(SvTYPE(invlist) == SVt_INVLIST); 8502 8503 SvPV_renew(invlist, MAX(min_size, SvCUR(invlist) + 1)); 8504 } 8505 8506 PERL_STATIC_INLINE void 8507 S_invlist_clear(pTHX_ SV* invlist) /* Empty the inversion list */ 8508 { 8509 PERL_ARGS_ASSERT_INVLIST_CLEAR; 8510 8511 assert(SvTYPE(invlist) == SVt_INVLIST); 8512 8513 invlist_set_len(invlist, 0, 0); 8514 invlist_trim(invlist); 8515 } 8516 8517 #endif /* ifndef PERL_IN_XSUB_RE */ 8518 8519 PERL_STATIC_INLINE bool 8520 S_invlist_is_iterating(SV* const invlist) 8521 { 8522 PERL_ARGS_ASSERT_INVLIST_IS_ITERATING; 8523 8524 return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX; 8525 } 8526 8527 PERL_STATIC_INLINE UV 8528 S_invlist_max(SV* const invlist) 8529 { 8530 /* Returns the maximum number of elements storable in the inversion list's 8531 * array, without having to realloc() */ 8532 8533 PERL_ARGS_ASSERT_INVLIST_MAX; 8534 8535 assert(SvTYPE(invlist) == SVt_INVLIST); 8536 8537 /* Assumes worst case, in which the 0 element is not counted in the 8538 * inversion list, so subtracts 1 for that */ 8539 return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */ 8540 ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1 8541 : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1; 8542 } 8543 8544 #ifndef PERL_IN_XSUB_RE 8545 SV* 8546 Perl__new_invlist(pTHX_ IV initial_size) 8547 { 8548 8549 /* Return a pointer to a newly constructed inversion list, with enough 8550 * space to store 'initial_size' elements. If that number is negative, a 8551 * system default is used instead */ 8552 8553 SV* new_list; 8554 8555 if (initial_size < 0) { 8556 initial_size = 10; 8557 } 8558 8559 /* Allocate the initial space */ 8560 new_list = newSV_type(SVt_INVLIST); 8561 8562 /* First 1 is in case the zero element isn't in the list; second 1 is for 8563 * trailing NUL */ 8564 SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1); 8565 invlist_set_len(new_list, 0, 0); 8566 8567 /* Force iterinit() to be used to get iteration to work */ 8568 *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX; 8569 8570 *get_invlist_previous_index_addr(new_list) = 0; 8571 8572 return new_list; 8573 } 8574 8575 SV* 8576 Perl__new_invlist_C_array(pTHX_ const UV* const list) 8577 { 8578 /* Return a pointer to a newly constructed inversion list, initialized to 8579 * point to <list>, which has to be in the exact correct inversion list 8580 * form, including internal fields. Thus this is a dangerous routine that 8581 * should not be used in the wrong hands. The passed in 'list' contains 8582 * several header fields at the beginning that are not part of the 8583 * inversion list body proper */ 8584 8585 const STRLEN length = (STRLEN) list[0]; 8586 const UV version_id = list[1]; 8587 const bool offset = cBOOL(list[2]); 8588 #define HEADER_LENGTH 3 8589 /* If any of the above changes in any way, you must change HEADER_LENGTH 8590 * (if appropriate) and regenerate INVLIST_VERSION_ID by running 8591 * perl -E 'say int(rand 2**31-1)' 8592 */ 8593 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and 8594 data structure type, so that one being 8595 passed in can be validated to be an 8596 inversion list of the correct vintage. 8597 */ 8598 8599 SV* invlist = newSV_type(SVt_INVLIST); 8600 8601 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY; 8602 8603 if (version_id != INVLIST_VERSION_ID) { 8604 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list"); 8605 } 8606 8607 /* The generated array passed in includes header elements that aren't part 8608 * of the list proper, so start it just after them */ 8609 SvPV_set(invlist, (char *) (list + HEADER_LENGTH)); 8610 8611 SvLEN_set(invlist, 0); /* Means we own the contents, and the system 8612 shouldn't touch it */ 8613 8614 *(get_invlist_offset_addr(invlist)) = offset; 8615 8616 /* The 'length' passed to us is the physical number of elements in the 8617 * inversion list. But if there is an offset the logical number is one 8618 * less than that */ 8619 invlist_set_len(invlist, length - offset, offset); 8620 8621 invlist_set_previous_index(invlist, 0); 8622 8623 /* Initialize the iteration pointer. */ 8624 invlist_iterfinish(invlist); 8625 8626 SvREADONLY_on(invlist); 8627 8628 return invlist; 8629 } 8630 #endif /* ifndef PERL_IN_XSUB_RE */ 8631 8632 STATIC void 8633 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max) 8634 { 8635 /* Grow the maximum size of an inversion list */ 8636 8637 PERL_ARGS_ASSERT_INVLIST_EXTEND; 8638 8639 assert(SvTYPE(invlist) == SVt_INVLIST); 8640 8641 /* Add one to account for the zero element at the beginning which may not 8642 * be counted by the calling parameters */ 8643 SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1)); 8644 } 8645 8646 STATIC void 8647 S__append_range_to_invlist(pTHX_ SV* const invlist, 8648 const UV start, const UV end) 8649 { 8650 /* Subject to change or removal. Append the range from 'start' to 'end' at 8651 * the end of the inversion list. The range must be above any existing 8652 * ones. */ 8653 8654 UV* array; 8655 UV max = invlist_max(invlist); 8656 UV len = _invlist_len(invlist); 8657 bool offset; 8658 8659 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST; 8660 8661 if (len == 0) { /* Empty lists must be initialized */ 8662 offset = start != 0; 8663 array = _invlist_array_init(invlist, ! offset); 8664 } 8665 else { 8666 /* Here, the existing list is non-empty. The current max entry in the 8667 * list is generally the first value not in the set, except when the 8668 * set extends to the end of permissible values, in which case it is 8669 * the first entry in that final set, and so this call is an attempt to 8670 * append out-of-order */ 8671 8672 UV final_element = len - 1; 8673 array = invlist_array(invlist); 8674 if (array[final_element] > start 8675 || ELEMENT_RANGE_MATCHES_INVLIST(final_element)) 8676 { 8677 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c", 8678 array[final_element], start, 8679 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f'); 8680 } 8681 8682 /* Here, it is a legal append. If the new range begins with the first 8683 * value not in the set, it is extending the set, so the new first 8684 * value not in the set is one greater than the newly extended range. 8685 * */ 8686 offset = *get_invlist_offset_addr(invlist); 8687 if (array[final_element] == start) { 8688 if (end != UV_MAX) { 8689 array[final_element] = end + 1; 8690 } 8691 else { 8692 /* But if the end is the maximum representable on the machine, 8693 * just let the range that this would extend to have no end */ 8694 invlist_set_len(invlist, len - 1, offset); 8695 } 8696 return; 8697 } 8698 } 8699 8700 /* Here the new range doesn't extend any existing set. Add it */ 8701 8702 len += 2; /* Includes an element each for the start and end of range */ 8703 8704 /* If wll overflow the existing space, extend, which may cause the array to 8705 * be moved */ 8706 if (max < len) { 8707 invlist_extend(invlist, len); 8708 8709 /* Have to set len here to avoid assert failure in invlist_array() */ 8710 invlist_set_len(invlist, len, offset); 8711 8712 array = invlist_array(invlist); 8713 } 8714 else { 8715 invlist_set_len(invlist, len, offset); 8716 } 8717 8718 /* The next item on the list starts the range, the one after that is 8719 * one past the new range. */ 8720 array[len - 2] = start; 8721 if (end != UV_MAX) { 8722 array[len - 1] = end + 1; 8723 } 8724 else { 8725 /* But if the end is the maximum representable on the machine, just let 8726 * the range have no end */ 8727 invlist_set_len(invlist, len - 1, offset); 8728 } 8729 } 8730 8731 #ifndef PERL_IN_XSUB_RE 8732 8733 IV 8734 Perl__invlist_search(SV* const invlist, const UV cp) 8735 { 8736 /* Searches the inversion list for the entry that contains the input code 8737 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the 8738 * return value is the index into the list's array of the range that 8739 * contains <cp>, that is, 'i' such that 8740 * array[i] <= cp < array[i+1] 8741 */ 8742 8743 IV low = 0; 8744 IV mid; 8745 IV high = _invlist_len(invlist); 8746 const IV highest_element = high - 1; 8747 const UV* array; 8748 8749 PERL_ARGS_ASSERT__INVLIST_SEARCH; 8750 8751 /* If list is empty, return failure. */ 8752 if (high == 0) { 8753 return -1; 8754 } 8755 8756 /* (We can't get the array unless we know the list is non-empty) */ 8757 array = invlist_array(invlist); 8758 8759 mid = invlist_previous_index(invlist); 8760 assert(mid >=0); 8761 if (mid > highest_element) { 8762 mid = highest_element; 8763 } 8764 8765 /* <mid> contains the cache of the result of the previous call to this 8766 * function (0 the first time). See if this call is for the same result, 8767 * or if it is for mid-1. This is under the theory that calls to this 8768 * function will often be for related code points that are near each other. 8769 * And benchmarks show that caching gives better results. We also test 8770 * here if the code point is within the bounds of the list. These tests 8771 * replace others that would have had to be made anyway to make sure that 8772 * the array bounds were not exceeded, and these give us extra information 8773 * at the same time */ 8774 if (cp >= array[mid]) { 8775 if (cp >= array[highest_element]) { 8776 return highest_element; 8777 } 8778 8779 /* Here, array[mid] <= cp < array[highest_element]. This means that 8780 * the final element is not the answer, so can exclude it; it also 8781 * means that <mid> is not the final element, so can refer to 'mid + 1' 8782 * safely */ 8783 if (cp < array[mid + 1]) { 8784 return mid; 8785 } 8786 high--; 8787 low = mid + 1; 8788 } 8789 else { /* cp < aray[mid] */ 8790 if (cp < array[0]) { /* Fail if outside the array */ 8791 return -1; 8792 } 8793 high = mid; 8794 if (cp >= array[mid - 1]) { 8795 goto found_entry; 8796 } 8797 } 8798 8799 /* Binary search. What we are looking for is <i> such that 8800 * array[i] <= cp < array[i+1] 8801 * The loop below converges on the i+1. Note that there may not be an 8802 * (i+1)th element in the array, and things work nonetheless */ 8803 while (low < high) { 8804 mid = (low + high) / 2; 8805 assert(mid <= highest_element); 8806 if (array[mid] <= cp) { /* cp >= array[mid] */ 8807 low = mid + 1; 8808 8809 /* We could do this extra test to exit the loop early. 8810 if (cp < array[low]) { 8811 return mid; 8812 } 8813 */ 8814 } 8815 else { /* cp < array[mid] */ 8816 high = mid; 8817 } 8818 } 8819 8820 found_entry: 8821 high--; 8822 invlist_set_previous_index(invlist, high); 8823 return high; 8824 } 8825 8826 void 8827 Perl__invlist_populate_swatch(SV* const invlist, 8828 const UV start, const UV end, U8* swatch) 8829 { 8830 /* populates a swatch of a swash the same way swatch_get() does in utf8.c, 8831 * but is used when the swash has an inversion list. This makes this much 8832 * faster, as it uses a binary search instead of a linear one. This is 8833 * intimately tied to that function, and perhaps should be in utf8.c, 8834 * except it is intimately tied to inversion lists as well. It assumes 8835 * that <swatch> is all 0's on input */ 8836 8837 UV current = start; 8838 const IV len = _invlist_len(invlist); 8839 IV i; 8840 const UV * array; 8841 8842 PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH; 8843 8844 if (len == 0) { /* Empty inversion list */ 8845 return; 8846 } 8847 8848 array = invlist_array(invlist); 8849 8850 /* Find which element it is */ 8851 i = _invlist_search(invlist, start); 8852 8853 /* We populate from <start> to <end> */ 8854 while (current < end) { 8855 UV upper; 8856 8857 /* The inversion list gives the results for every possible code point 8858 * after the first one in the list. Only those ranges whose index is 8859 * even are ones that the inversion list matches. For the odd ones, 8860 * and if the initial code point is not in the list, we have to skip 8861 * forward to the next element */ 8862 if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) { 8863 i++; 8864 if (i >= len) { /* Finished if beyond the end of the array */ 8865 return; 8866 } 8867 current = array[i]; 8868 if (current >= end) { /* Finished if beyond the end of what we 8869 are populating */ 8870 if (LIKELY(end < UV_MAX)) { 8871 return; 8872 } 8873 8874 /* We get here when the upper bound is the maximum 8875 * representable on the machine, and we are looking for just 8876 * that code point. Have to special case it */ 8877 i = len; 8878 goto join_end_of_list; 8879 } 8880 } 8881 assert(current >= start); 8882 8883 /* The current range ends one below the next one, except don't go past 8884 * <end> */ 8885 i++; 8886 upper = (i < len && array[i] < end) ? array[i] : end; 8887 8888 /* Here we are in a range that matches. Populate a bit in the 3-bit U8 8889 * for each code point in it */ 8890 for (; current < upper; current++) { 8891 const STRLEN offset = (STRLEN)(current - start); 8892 swatch[offset >> 3] |= 1 << (offset & 7); 8893 } 8894 8895 join_end_of_list: 8896 8897 /* Quit if at the end of the list */ 8898 if (i >= len) { 8899 8900 /* But first, have to deal with the highest possible code point on 8901 * the platform. The previous code assumes that <end> is one 8902 * beyond where we want to populate, but that is impossible at the 8903 * platform's infinity, so have to handle it specially */ 8904 if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1))) 8905 { 8906 const STRLEN offset = (STRLEN)(end - start); 8907 swatch[offset >> 3] |= 1 << (offset & 7); 8908 } 8909 return; 8910 } 8911 8912 /* Advance to the next range, which will be for code points not in the 8913 * inversion list */ 8914 current = array[i]; 8915 } 8916 8917 return; 8918 } 8919 8920 void 8921 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, 8922 const bool complement_b, SV** output) 8923 { 8924 /* Take the union of two inversion lists and point <output> to it. *output 8925 * SHOULD BE DEFINED upon input, and if it points to one of the two lists, 8926 * the reference count to that list will be decremented if not already a 8927 * temporary (mortal); otherwise just its contents will be modified to be 8928 * the union. The first list, <a>, may be NULL, in which case a copy of 8929 * the second list is returned. If <complement_b> is TRUE, the union is 8930 * taken of the complement (inversion) of <b> instead of b itself. 8931 * 8932 * The basis for this comes from "Unicode Demystified" Chapter 13 by 8933 * Richard Gillam, published by Addison-Wesley, and explained at some 8934 * length there. The preface says to incorporate its examples into your 8935 * code at your own risk. 8936 * 8937 * The algorithm is like a merge sort. 8938 * 8939 * XXX A potential performance improvement is to keep track as we go along 8940 * if only one of the inputs contributes to the result, meaning the other 8941 * is a subset of that one. In that case, we can skip the final copy and 8942 * return the larger of the input lists, but then outside code might need 8943 * to keep track of whether to free the input list or not */ 8944 8945 const UV* array_a; /* a's array */ 8946 const UV* array_b; 8947 UV len_a; /* length of a's array */ 8948 UV len_b; 8949 8950 SV* u; /* the resulting union */ 8951 UV* array_u; 8952 UV len_u = 0; 8953 8954 UV i_a = 0; /* current index into a's array */ 8955 UV i_b = 0; 8956 UV i_u = 0; 8957 8958 /* running count, as explained in the algorithm source book; items are 8959 * stopped accumulating and are output when the count changes to/from 0. 8960 * The count is incremented when we start a range that's in the set, and 8961 * decremented when we start a range that's not in the set. So its range 8962 * is 0 to 2. Only when the count is zero is something not in the set. 8963 */ 8964 UV count = 0; 8965 8966 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND; 8967 assert(a != b); 8968 8969 len_b = _invlist_len(b); 8970 if (len_b == 0) { 8971 8972 /* Here, 'b' is empty. If the output is the complement of 'b', the 8973 * union is all possible code points, and we need not even look at 'a'. 8974 * It's easiest to create a new inversion list that matches everything. 8975 * */ 8976 if (complement_b) { 8977 SV* everything = _new_invlist(1); 8978 _append_range_to_invlist(everything, 0, UV_MAX); 8979 8980 /* If the output didn't exist, just point it at the new list */ 8981 if (*output == NULL) { 8982 *output = everything; 8983 return; 8984 } 8985 8986 /* Otherwise, replace its contents with the new list */ 8987 invlist_replace_list_destroys_src(*output, everything); 8988 SvREFCNT_dec_NN(everything); 8989 return; 8990 } 8991 8992 /* Here, we don't want the complement of 'b', and since it is empty, 8993 * the union will come entirely from 'a'. If 'a' is NULL or empty, the 8994 * output will be empty */ 8995 8996 if (a == NULL) { 8997 *output = _new_invlist(0); 8998 return; 8999 } 9000 9001 if (_invlist_len(a) == 0) { 9002 invlist_clear(*output); 9003 return; 9004 } 9005 9006 /* Here, 'a' is not empty, and entirely determines the union. If the 9007 * output is not to overwrite 'b', we can just return 'a'. */ 9008 if (*output != b) { 9009 9010 /* If the output is to overwrite 'a', we have a no-op, as it's 9011 * already in 'a' */ 9012 if (*output == a) { 9013 return; 9014 } 9015 9016 /* But otherwise we have to copy 'a' to the output */ 9017 *output = invlist_clone(a); 9018 return; 9019 } 9020 9021 /* Here, 'b' is to be overwritten by the output, which will be 'a' */ 9022 u = invlist_clone(a); 9023 invlist_replace_list_destroys_src(*output, u); 9024 SvREFCNT_dec_NN(u); 9025 9026 return; 9027 } 9028 9029 if (a == NULL || ((len_a = _invlist_len(a)) == 0)) { 9030 9031 /* Here, 'a' is empty (and b is not). That means the union will come 9032 * entirely from 'b'. If the output is not to overwrite 'a', we can 9033 * just return what's in 'b'. */ 9034 if (*output != a) { 9035 9036 /* If the output is to overwrite 'b', it's already in 'b', but 9037 * otherwise we have to copy 'b' to the output */ 9038 if (*output != b) { 9039 *output = invlist_clone(b); 9040 } 9041 9042 /* And if the output is to be the inversion of 'b', do that */ 9043 if (complement_b) { 9044 _invlist_invert(*output); 9045 } 9046 9047 return; 9048 } 9049 9050 /* Here, 'a', which is empty or even NULL, is to be overwritten by the 9051 * output, which will either be 'b' or the complement of 'b' */ 9052 9053 if (a == NULL) { 9054 *output = invlist_clone(b); 9055 } 9056 else { 9057 u = invlist_clone(b); 9058 invlist_replace_list_destroys_src(*output, u); 9059 SvREFCNT_dec_NN(u); 9060 } 9061 9062 if (complement_b) { 9063 _invlist_invert(*output); 9064 } 9065 9066 return; 9067 } 9068 9069 /* Here both lists exist and are non-empty */ 9070 array_a = invlist_array(a); 9071 array_b = invlist_array(b); 9072 9073 /* If are to take the union of 'a' with the complement of b, set it 9074 * up so are looking at b's complement. */ 9075 if (complement_b) { 9076 9077 /* To complement, we invert: if the first element is 0, remove it. To 9078 * do this, we just pretend the array starts one later */ 9079 if (array_b[0] == 0) { 9080 array_b++; 9081 len_b--; 9082 } 9083 else { 9084 9085 /* But if the first element is not zero, we pretend the list starts 9086 * at the 0 that is always stored immediately before the array. */ 9087 array_b--; 9088 len_b++; 9089 } 9090 } 9091 9092 /* Size the union for the worst case: that the sets are completely 9093 * disjoint */ 9094 u = _new_invlist(len_a + len_b); 9095 9096 /* Will contain U+0000 if either component does */ 9097 array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0) 9098 || (len_b > 0 && array_b[0] == 0)); 9099 9100 /* Go through each list item by item, stopping when exhausted one of 9101 * them */ 9102 while (i_a < len_a && i_b < len_b) { 9103 UV cp; /* The element to potentially add to the union's array */ 9104 bool cp_in_set; /* is it in the the input list's set or not */ 9105 9106 /* We need to take one or the other of the two inputs for the union. 9107 * Since we are merging two sorted lists, we take the smaller of the 9108 * next items. In case of a tie, we take the one that is in its set 9109 * first. If we took one not in the set first, it would decrement the 9110 * count, possibly to 0 which would cause it to be output as ending the 9111 * range, and the next time through we would take the same number, and 9112 * output it again as beginning the next range. By doing it the 9113 * opposite way, there is no possibility that the count will be 9114 * momentarily decremented to 0, and thus the two adjoining ranges will 9115 * be seamlessly merged. (In a tie and both are in the set or both not 9116 * in the set, it doesn't matter which we take first.) */ 9117 if (array_a[i_a] < array_b[i_b] 9118 || (array_a[i_a] == array_b[i_b] 9119 && ELEMENT_RANGE_MATCHES_INVLIST(i_a))) 9120 { 9121 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a); 9122 cp= array_a[i_a++]; 9123 } 9124 else { 9125 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b); 9126 cp = array_b[i_b++]; 9127 } 9128 9129 /* Here, have chosen which of the two inputs to look at. Only output 9130 * if the running count changes to/from 0, which marks the 9131 * beginning/end of a range that's in the set */ 9132 if (cp_in_set) { 9133 if (count == 0) { 9134 array_u[i_u++] = cp; 9135 } 9136 count++; 9137 } 9138 else { 9139 count--; 9140 if (count == 0) { 9141 array_u[i_u++] = cp; 9142 } 9143 } 9144 } 9145 9146 /* Here, we are finished going through at least one of the lists, which 9147 * means there is something remaining in at most one. We check if the list 9148 * that hasn't been exhausted is positioned such that we are in the middle 9149 * of a range in its set or not. (i_a and i_b point to the element beyond 9150 * the one we care about.) If in the set, we decrement 'count'; if 0, there 9151 * is potentially more to output. 9152 * There are four cases: 9153 * 1) Both weren't in their sets, count is 0, and remains 0. What's left 9154 * in the union is entirely from the non-exhausted set. 9155 * 2) Both were in their sets, count is 2. Nothing further should 9156 * be output, as everything that remains will be in the exhausted 9157 * list's set, hence in the union; decrementing to 1 but not 0 insures 9158 * that 9159 * 3) the exhausted was in its set, non-exhausted isn't, count is 1. 9160 * Nothing further should be output because the union includes 9161 * everything from the exhausted set. Not decrementing ensures that. 9162 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1; 9163 * decrementing to 0 insures that we look at the remainder of the 9164 * non-exhausted set */ 9165 if ( (i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a)) 9166 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b))) 9167 { 9168 count--; 9169 } 9170 9171 /* The final length is what we've output so far, plus what else is about to 9172 * be output. (If 'count' is non-zero, then the input list we exhausted 9173 * has everything remaining up to the machine's limit in its set, and hence 9174 * in the union, so there will be no further output. */ 9175 len_u = i_u; 9176 if (count == 0) { 9177 /* At most one of the subexpressions will be non-zero */ 9178 len_u += (len_a - i_a) + (len_b - i_b); 9179 } 9180 9181 /* Set the result to the final length, which can change the pointer to 9182 * array_u, so re-find it. (Note that it is unlikely that this will 9183 * change, as we are shrinking the space, not enlarging it) */ 9184 if (len_u != _invlist_len(u)) { 9185 invlist_set_len(u, len_u, *get_invlist_offset_addr(u)); 9186 invlist_trim(u); 9187 array_u = invlist_array(u); 9188 } 9189 9190 /* When 'count' is 0, the list that was exhausted (if one was shorter than 9191 * the other) ended with everything above it not in its set. That means 9192 * that the remaining part of the union is precisely the same as the 9193 * non-exhausted list, so can just copy it unchanged. (If both lists were 9194 * exhausted at the same time, then the operations below will be both 0.) 9195 */ 9196 if (count == 0) { 9197 IV copy_count; /* At most one will have a non-zero copy count */ 9198 if ((copy_count = len_a - i_a) > 0) { 9199 Copy(array_a + i_a, array_u + i_u, copy_count, UV); 9200 } 9201 else if ((copy_count = len_b - i_b) > 0) { 9202 Copy(array_b + i_b, array_u + i_u, copy_count, UV); 9203 } 9204 } 9205 9206 /* If the output is not to overwrite either of the inputs, just return the 9207 * calculated union */ 9208 if (a != *output && b != *output) { 9209 *output = u; 9210 } 9211 else { 9212 /* Here, the output is to be the same as one of the input scalars, 9213 * hence replacing it. The simple thing to do is to free the input 9214 * scalar, making it instead be the output one. But experience has 9215 * shown [perl #127392] that if the input is a mortal, we can get a 9216 * huge build-up of these during regex compilation before they get 9217 * freed. So for that case, replace just the input's interior with 9218 * the output's, and then free the output */ 9219 9220 assert(! invlist_is_iterating(*output)); 9221 9222 if (! SvTEMP(*output)) { 9223 SvREFCNT_dec_NN(*output); 9224 *output = u; 9225 } 9226 else { 9227 invlist_replace_list_destroys_src(*output, u); 9228 SvREFCNT_dec_NN(u); 9229 } 9230 } 9231 9232 return; 9233 } 9234 9235 void 9236 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, 9237 const bool complement_b, SV** i) 9238 { 9239 /* Take the intersection of two inversion lists and point <i> to it. *i 9240 * SHOULD BE DEFINED upon input, and if it points to one of the two lists, 9241 * the reference count to that list will be decremented if not already a 9242 * temporary (mortal); otherwise just its contents will be modified to be 9243 * the intersection. The first list, <a>, may be NULL, in which case an 9244 * empty list is returned. If <complement_b> is TRUE, the result will be 9245 * the intersection of <a> and the complement (or inversion) of <b> instead 9246 * of <b> directly. 9247 * 9248 * The basis for this comes from "Unicode Demystified" Chapter 13 by 9249 * Richard Gillam, published by Addison-Wesley, and explained at some 9250 * length there. The preface says to incorporate its examples into your 9251 * code at your own risk. In fact, it had bugs 9252 * 9253 * The algorithm is like a merge sort, and is essentially the same as the 9254 * union above 9255 */ 9256 9257 const UV* array_a; /* a's array */ 9258 const UV* array_b; 9259 UV len_a; /* length of a's array */ 9260 UV len_b; 9261 9262 SV* r; /* the resulting intersection */ 9263 UV* array_r; 9264 UV len_r = 0; 9265 9266 UV i_a = 0; /* current index into a's array */ 9267 UV i_b = 0; 9268 UV i_r = 0; 9269 9270 /* running count, as explained in the algorithm source book; items are 9271 * stopped accumulating and are output when the count changes to/from 2. 9272 * The count is incremented when we start a range that's in the set, and 9273 * decremented when we start a range that's not in the set. So its range 9274 * is 0 to 2. Only when the count is 2 is something in the intersection. 9275 */ 9276 UV count = 0; 9277 9278 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND; 9279 assert(a != b); 9280 9281 /* Special case if either one is empty */ 9282 len_a = (a == NULL) ? 0 : _invlist_len(a); 9283 if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) { 9284 if (len_a != 0 && complement_b) { 9285 9286 /* Here, 'a' is not empty, therefore from the enclosing 'if', 'b' 9287 * must be empty. Here, also we are using 'b's complement, which 9288 * hence must be every possible code point. Thus the intersection 9289 * is simply 'a'. */ 9290 9291 if (*i == a) { /* No-op */ 9292 return; 9293 } 9294 9295 /* If not overwriting either input, just make a copy of 'a' */ 9296 if (*i != b) { 9297 *i = invlist_clone(a); 9298 return; 9299 } 9300 9301 /* Here we are overwriting 'b' with 'a's contents */ 9302 r = invlist_clone(a); 9303 invlist_replace_list_destroys_src(*i, r); 9304 SvREFCNT_dec_NN(r); 9305 return; 9306 } 9307 9308 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The 9309 * intersection must be empty */ 9310 if (*i == NULL) { 9311 *i = _new_invlist(0); 9312 return; 9313 } 9314 9315 invlist_clear(*i); 9316 return; 9317 } 9318 9319 /* Here both lists exist and are non-empty */ 9320 array_a = invlist_array(a); 9321 array_b = invlist_array(b); 9322 9323 /* If are to take the intersection of 'a' with the complement of b, set it 9324 * up so are looking at b's complement. */ 9325 if (complement_b) { 9326 9327 /* To complement, we invert: if the first element is 0, remove it. To 9328 * do this, we just pretend the array starts one later */ 9329 if (array_b[0] == 0) { 9330 array_b++; 9331 len_b--; 9332 } 9333 else { 9334 9335 /* But if the first element is not zero, we pretend the list starts 9336 * at the 0 that is always stored immediately before the array. */ 9337 array_b--; 9338 len_b++; 9339 } 9340 } 9341 9342 /* Size the intersection for the worst case: that the intersection ends up 9343 * fragmenting everything to be completely disjoint */ 9344 r= _new_invlist(len_a + len_b); 9345 9346 /* Will contain U+0000 iff both components do */ 9347 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0 9348 && len_b > 0 && array_b[0] == 0); 9349 9350 /* Go through each list item by item, stopping when exhausted one of 9351 * them */ 9352 while (i_a < len_a && i_b < len_b) { 9353 UV cp; /* The element to potentially add to the intersection's 9354 array */ 9355 bool cp_in_set; /* Is it in the input list's set or not */ 9356 9357 /* We need to take one or the other of the two inputs for the 9358 * intersection. Since we are merging two sorted lists, we take the 9359 * smaller of the next items. In case of a tie, we take the one that 9360 * is not in its set first (a difference from the union algorithm). If 9361 * we took one in the set first, it would increment the count, possibly 9362 * to 2 which would cause it to be output as starting a range in the 9363 * intersection, and the next time through we would take that same 9364 * number, and output it again as ending the set. By doing it the 9365 * opposite of this, there is no possibility that the count will be 9366 * momentarily incremented to 2. (In a tie and both are in the set or 9367 * both not in the set, it doesn't matter which we take first.) */ 9368 if (array_a[i_a] < array_b[i_b] 9369 || (array_a[i_a] == array_b[i_b] 9370 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a))) 9371 { 9372 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a); 9373 cp= array_a[i_a++]; 9374 } 9375 else { 9376 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b); 9377 cp= array_b[i_b++]; 9378 } 9379 9380 /* Here, have chosen which of the two inputs to look at. Only output 9381 * if the running count changes to/from 2, which marks the 9382 * beginning/end of a range that's in the intersection */ 9383 if (cp_in_set) { 9384 count++; 9385 if (count == 2) { 9386 array_r[i_r++] = cp; 9387 } 9388 } 9389 else { 9390 if (count == 2) { 9391 array_r[i_r++] = cp; 9392 } 9393 count--; 9394 } 9395 } 9396 9397 /* Here, we are finished going through at least one of the lists, which 9398 * means there is something remaining in at most one. We check if the list 9399 * that has been exhausted is positioned such that we are in the middle 9400 * of a range in its set or not. (i_a and i_b point to elements 1 beyond 9401 * the ones we care about.) There are four cases: 9402 * 1) Both weren't in their sets, count is 0, and remains 0. There's 9403 * nothing left in the intersection. 9404 * 2) Both were in their sets, count is 2 and perhaps is incremented to 9405 * above 2. What should be output is exactly that which is in the 9406 * non-exhausted set, as everything it has is also in the intersection 9407 * set, and everything it doesn't have can't be in the intersection 9408 * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and 9409 * gets incremented to 2. Like the previous case, the intersection is 9410 * everything that remains in the non-exhausted set. 9411 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and 9412 * remains 1. And the intersection has nothing more. */ 9413 if ( (i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a)) 9414 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b))) 9415 { 9416 count++; 9417 } 9418 9419 /* The final length is what we've output so far plus what else is in the 9420 * intersection. At most one of the subexpressions below will be non-zero 9421 * */ 9422 len_r = i_r; 9423 if (count >= 2) { 9424 len_r += (len_a - i_a) + (len_b - i_b); 9425 } 9426 9427 /* Set the result to the final length, which can change the pointer to 9428 * array_r, so re-find it. (Note that it is unlikely that this will 9429 * change, as we are shrinking the space, not enlarging it) */ 9430 if (len_r != _invlist_len(r)) { 9431 invlist_set_len(r, len_r, *get_invlist_offset_addr(r)); 9432 invlist_trim(r); 9433 array_r = invlist_array(r); 9434 } 9435 9436 /* Finish outputting any remaining */ 9437 if (count >= 2) { /* At most one will have a non-zero copy count */ 9438 IV copy_count; 9439 if ((copy_count = len_a - i_a) > 0) { 9440 Copy(array_a + i_a, array_r + i_r, copy_count, UV); 9441 } 9442 else if ((copy_count = len_b - i_b) > 0) { 9443 Copy(array_b + i_b, array_r + i_r, copy_count, UV); 9444 } 9445 } 9446 9447 /* If the output is not to overwrite either of the inputs, just return the 9448 * calculated intersection */ 9449 if (a != *i && b != *i) { 9450 *i = r; 9451 } 9452 else { 9453 /* Here, the output is to be the same as one of the input scalars, 9454 * hence replacing it. The simple thing to do is to free the input 9455 * scalar, making it instead be the output one. But experience has 9456 * shown [perl #127392] that if the input is a mortal, we can get a 9457 * huge build-up of these during regex compilation before they get 9458 * freed. So for that case, replace just the input's interior with 9459 * the output's, and then free the output. A short-cut in this case 9460 * is if the output is empty, we can just set the input to be empty */ 9461 9462 assert(! invlist_is_iterating(*i)); 9463 9464 if (! SvTEMP(*i)) { 9465 SvREFCNT_dec_NN(*i); 9466 *i = r; 9467 } 9468 else { 9469 if (len_r) { 9470 invlist_replace_list_destroys_src(*i, r); 9471 } 9472 else { 9473 invlist_clear(*i); 9474 } 9475 SvREFCNT_dec_NN(r); 9476 } 9477 } 9478 9479 return; 9480 } 9481 9482 SV* 9483 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end) 9484 { 9485 /* Add the range from 'start' to 'end' inclusive to the inversion list's 9486 * set. A pointer to the inversion list is returned. This may actually be 9487 * a new list, in which case the passed in one has been destroyed. The 9488 * passed-in inversion list can be NULL, in which case a new one is created 9489 * with just the one range in it */ 9490 9491 SV* range_invlist; 9492 UV len; 9493 9494 if (invlist == NULL) { 9495 invlist = _new_invlist(2); 9496 len = 0; 9497 } 9498 else { 9499 len = _invlist_len(invlist); 9500 } 9501 9502 /* If comes after the final entry actually in the list, can just append it 9503 * to the end, */ 9504 if (len == 0 9505 || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1) 9506 && start >= invlist_array(invlist)[len - 1])) 9507 { 9508 _append_range_to_invlist(invlist, start, end); 9509 return invlist; 9510 } 9511 9512 /* Here, can't just append things, create and return a new inversion list 9513 * which is the union of this range and the existing inversion list. (If 9514 * the new range is well-behaved wrt to the old one, we could just insert 9515 * it, doing a Move() down on the tail of the old one (potentially growing 9516 * it first). But to determine that means we would have the extra 9517 * (possibly throw-away) work of first finding where the new one goes and 9518 * whether it disrupts (splits) an existing range, so it doesn't appear to 9519 * me (khw) that it's worth it) */ 9520 range_invlist = _new_invlist(2); 9521 _append_range_to_invlist(range_invlist, start, end); 9522 9523 _invlist_union(invlist, range_invlist, &invlist); 9524 9525 /* The temporary can be freed */ 9526 SvREFCNT_dec_NN(range_invlist); 9527 9528 return invlist; 9529 } 9530 9531 SV* 9532 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0, 9533 UV** other_elements_ptr) 9534 { 9535 /* Create and return an inversion list whose contents are to be populated 9536 * by the caller. The caller gives the number of elements (in 'size') and 9537 * the very first element ('element0'). This function will set 9538 * '*other_elements_ptr' to an array of UVs, where the remaining elements 9539 * are to be placed. 9540 * 9541 * Obviously there is some trust involved that the caller will properly 9542 * fill in the other elements of the array. 9543 * 9544 * (The first element needs to be passed in, as the underlying code does 9545 * things differently depending on whether it is zero or non-zero) */ 9546 9547 SV* invlist = _new_invlist(size); 9548 bool offset; 9549 9550 PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST; 9551 9552 _append_range_to_invlist(invlist, element0, element0); 9553 offset = *get_invlist_offset_addr(invlist); 9554 9555 invlist_set_len(invlist, size, offset); 9556 *other_elements_ptr = invlist_array(invlist) + 1; 9557 return invlist; 9558 } 9559 9560 #endif 9561 9562 PERL_STATIC_INLINE SV* 9563 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) { 9564 return _add_range_to_invlist(invlist, cp, cp); 9565 } 9566 9567 #ifndef PERL_IN_XSUB_RE 9568 void 9569 Perl__invlist_invert(pTHX_ SV* const invlist) 9570 { 9571 /* Complement the input inversion list. This adds a 0 if the list didn't 9572 * have a zero; removes it otherwise. As described above, the data 9573 * structure is set up so that this is very efficient */ 9574 9575 PERL_ARGS_ASSERT__INVLIST_INVERT; 9576 9577 assert(! invlist_is_iterating(invlist)); 9578 9579 /* The inverse of matching nothing is matching everything */ 9580 if (_invlist_len(invlist) == 0) { 9581 _append_range_to_invlist(invlist, 0, UV_MAX); 9582 return; 9583 } 9584 9585 *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist); 9586 } 9587 9588 #endif 9589 9590 PERL_STATIC_INLINE SV* 9591 S_invlist_clone(pTHX_ SV* const invlist) 9592 { 9593 9594 /* Return a new inversion list that is a copy of the input one, which is 9595 * unchanged. The new list will not be mortal even if the old one was. */ 9596 9597 /* Need to allocate extra space to accommodate Perl's addition of a 9598 * trailing NUL to SvPV's, since it thinks they are always strings */ 9599 SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1); 9600 STRLEN physical_length = SvCUR(invlist); 9601 bool offset = *(get_invlist_offset_addr(invlist)); 9602 9603 PERL_ARGS_ASSERT_INVLIST_CLONE; 9604 9605 *(get_invlist_offset_addr(new_invlist)) = offset; 9606 invlist_set_len(new_invlist, _invlist_len(invlist), offset); 9607 Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char); 9608 9609 return new_invlist; 9610 } 9611 9612 PERL_STATIC_INLINE STRLEN* 9613 S_get_invlist_iter_addr(SV* invlist) 9614 { 9615 /* Return the address of the UV that contains the current iteration 9616 * position */ 9617 9618 PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR; 9619 9620 assert(SvTYPE(invlist) == SVt_INVLIST); 9621 9622 return &(((XINVLIST*) SvANY(invlist))->iterator); 9623 } 9624 9625 PERL_STATIC_INLINE void 9626 S_invlist_iterinit(SV* invlist) /* Initialize iterator for invlist */ 9627 { 9628 PERL_ARGS_ASSERT_INVLIST_ITERINIT; 9629 9630 *get_invlist_iter_addr(invlist) = 0; 9631 } 9632 9633 PERL_STATIC_INLINE void 9634 S_invlist_iterfinish(SV* invlist) 9635 { 9636 /* Terminate iterator for invlist. This is to catch development errors. 9637 * Any iteration that is interrupted before completed should call this 9638 * function. Functions that add code points anywhere else but to the end 9639 * of an inversion list assert that they are not in the middle of an 9640 * iteration. If they were, the addition would make the iteration 9641 * problematical: if the iteration hadn't reached the place where things 9642 * were being added, it would be ok */ 9643 9644 PERL_ARGS_ASSERT_INVLIST_ITERFINISH; 9645 9646 *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX; 9647 } 9648 9649 STATIC bool 9650 S_invlist_iternext(SV* invlist, UV* start, UV* end) 9651 { 9652 /* An C<invlist_iterinit> call on <invlist> must be used to set this up. 9653 * This call sets in <*start> and <*end>, the next range in <invlist>. 9654 * Returns <TRUE> if successful and the next call will return the next 9655 * range; <FALSE> if was already at the end of the list. If the latter, 9656 * <*start> and <*end> are unchanged, and the next call to this function 9657 * will start over at the beginning of the list */ 9658 9659 STRLEN* pos = get_invlist_iter_addr(invlist); 9660 UV len = _invlist_len(invlist); 9661 UV *array; 9662 9663 PERL_ARGS_ASSERT_INVLIST_ITERNEXT; 9664 9665 if (*pos >= len) { 9666 *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */ 9667 return FALSE; 9668 } 9669 9670 array = invlist_array(invlist); 9671 9672 *start = array[(*pos)++]; 9673 9674 if (*pos >= len) { 9675 *end = UV_MAX; 9676 } 9677 else { 9678 *end = array[(*pos)++] - 1; 9679 } 9680 9681 return TRUE; 9682 } 9683 9684 PERL_STATIC_INLINE UV 9685 S_invlist_highest(SV* const invlist) 9686 { 9687 /* Returns the highest code point that matches an inversion list. This API 9688 * has an ambiguity, as it returns 0 under either the highest is actually 9689 * 0, or if the list is empty. If this distinction matters to you, check 9690 * for emptiness before calling this function */ 9691 9692 UV len = _invlist_len(invlist); 9693 UV *array; 9694 9695 PERL_ARGS_ASSERT_INVLIST_HIGHEST; 9696 9697 if (len == 0) { 9698 return 0; 9699 } 9700 9701 array = invlist_array(invlist); 9702 9703 /* The last element in the array in the inversion list always starts a 9704 * range that goes to infinity. That range may be for code points that are 9705 * matched in the inversion list, or it may be for ones that aren't 9706 * matched. In the latter case, the highest code point in the set is one 9707 * less than the beginning of this range; otherwise it is the final element 9708 * of this range: infinity */ 9709 return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1)) 9710 ? UV_MAX 9711 : array[len - 1] - 1; 9712 } 9713 9714 STATIC SV * 9715 S_invlist_contents(pTHX_ SV* const invlist, const bool traditional_style) 9716 { 9717 /* Get the contents of an inversion list into a string SV so that they can 9718 * be printed out. If 'traditional_style' is TRUE, it uses the format 9719 * traditionally done for debug tracing; otherwise it uses a format 9720 * suitable for just copying to the output, with blanks between ranges and 9721 * a dash between range components */ 9722 9723 UV start, end; 9724 SV* output; 9725 const char intra_range_delimiter = (traditional_style ? '\t' : '-'); 9726 const char inter_range_delimiter = (traditional_style ? '\n' : ' '); 9727 9728 if (traditional_style) { 9729 output = newSVpvs("\n"); 9730 } 9731 else { 9732 output = newSVpvs(""); 9733 } 9734 9735 PERL_ARGS_ASSERT_INVLIST_CONTENTS; 9736 9737 assert(! invlist_is_iterating(invlist)); 9738 9739 invlist_iterinit(invlist); 9740 while (invlist_iternext(invlist, &start, &end)) { 9741 if (end == UV_MAX) { 9742 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"%cINFINITY%c", 9743 start, intra_range_delimiter, 9744 inter_range_delimiter); 9745 } 9746 else if (end != start) { 9747 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"%c%04"UVXf"%c", 9748 start, 9749 intra_range_delimiter, 9750 end, inter_range_delimiter); 9751 } 9752 else { 9753 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"%c", 9754 start, inter_range_delimiter); 9755 } 9756 } 9757 9758 if (SvCUR(output) && ! traditional_style) {/* Get rid of trailing blank */ 9759 SvCUR_set(output, SvCUR(output) - 1); 9760 } 9761 9762 return output; 9763 } 9764 9765 #ifndef PERL_IN_XSUB_RE 9766 void 9767 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level, 9768 const char * const indent, SV* const invlist) 9769 { 9770 /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the 9771 * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by 9772 * the string 'indent'. The output looks like this: 9773 [0] 0x000A .. 0x000D 9774 [2] 0x0085 9775 [4] 0x2028 .. 0x2029 9776 [6] 0x3104 .. INFINITY 9777 * This means that the first range of code points matched by the list are 9778 * 0xA through 0xD; the second range contains only the single code point 9779 * 0x85, etc. An inversion list is an array of UVs. Two array elements 9780 * are used to define each range (except if the final range extends to 9781 * infinity, only a single element is needed). The array index of the 9782 * first element for the corresponding range is given in brackets. */ 9783 9784 UV start, end; 9785 STRLEN count = 0; 9786 9787 PERL_ARGS_ASSERT__INVLIST_DUMP; 9788 9789 if (invlist_is_iterating(invlist)) { 9790 Perl_dump_indent(aTHX_ level, file, 9791 "%sCan't dump inversion list because is in middle of iterating\n", 9792 indent); 9793 return; 9794 } 9795 9796 invlist_iterinit(invlist); 9797 while (invlist_iternext(invlist, &start, &end)) { 9798 if (end == UV_MAX) { 9799 Perl_dump_indent(aTHX_ level, file, 9800 "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n", 9801 indent, (UV)count, start); 9802 } 9803 else if (end != start) { 9804 Perl_dump_indent(aTHX_ level, file, 9805 "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n", 9806 indent, (UV)count, start, end); 9807 } 9808 else { 9809 Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n", 9810 indent, (UV)count, start); 9811 } 9812 count += 2; 9813 } 9814 } 9815 9816 void 9817 Perl__load_PL_utf8_foldclosures (pTHX) 9818 { 9819 assert(! PL_utf8_foldclosures); 9820 9821 /* If the folds haven't been read in, call a fold function 9822 * to force that */ 9823 if (! PL_utf8_tofold) { 9824 U8 dummy[UTF8_MAXBYTES_CASE+1]; 9825 9826 /* This string is just a short named one above \xff */ 9827 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL); 9828 assert(PL_utf8_tofold); /* Verify that worked */ 9829 } 9830 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold); 9831 } 9832 #endif 9833 9834 #if defined(PERL_ARGS_ASSERT__INVLISTEQ) && !defined(PERL_IN_XSUB_RE) 9835 bool 9836 Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b) 9837 { 9838 /* Return a boolean as to if the two passed in inversion lists are 9839 * identical. The final argument, if TRUE, says to take the complement of 9840 * the second inversion list before doing the comparison */ 9841 9842 const UV* array_a = invlist_array(a); 9843 const UV* array_b = invlist_array(b); 9844 UV len_a = _invlist_len(a); 9845 UV len_b = _invlist_len(b); 9846 9847 UV i = 0; /* current index into the arrays */ 9848 bool retval = TRUE; /* Assume are identical until proven otherwise */ 9849 9850 PERL_ARGS_ASSERT__INVLISTEQ; 9851 9852 /* If are to compare 'a' with the complement of b, set it 9853 * up so are looking at b's complement. */ 9854 if (complement_b) { 9855 9856 /* The complement of nothing is everything, so <a> would have to have 9857 * just one element, starting at zero (ending at infinity) */ 9858 if (len_b == 0) { 9859 return (len_a == 1 && array_a[0] == 0); 9860 } 9861 else if (array_b[0] == 0) { 9862 9863 /* Otherwise, to complement, we invert. Here, the first element is 9864 * 0, just remove it. To do this, we just pretend the array starts 9865 * one later */ 9866 9867 array_b++; 9868 len_b--; 9869 } 9870 else { 9871 9872 /* But if the first element is not zero, we pretend the list starts 9873 * at the 0 that is always stored immediately before the array. */ 9874 array_b--; 9875 len_b++; 9876 } 9877 } 9878 9879 /* Make sure that the lengths are the same, as well as the final element 9880 * before looping through the remainder. (Thus we test the length, final, 9881 * and first elements right off the bat) */ 9882 if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) { 9883 retval = FALSE; 9884 } 9885 else for (i = 0; i < len_a - 1; i++) { 9886 if (array_a[i] != array_b[i]) { 9887 retval = FALSE; 9888 break; 9889 } 9890 } 9891 9892 return retval; 9893 } 9894 #endif 9895 9896 /* 9897 * As best we can, determine the characters that can match the start of 9898 * the given EXACTF-ish node. 9899 * 9900 * Returns the invlist as a new SV*; it is the caller's responsibility to 9901 * call SvREFCNT_dec() when done with it. 9902 */ 9903 STATIC SV* 9904 S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node) 9905 { 9906 const U8 * s = (U8*)STRING(node); 9907 SSize_t bytelen = STR_LEN(node); 9908 UV uc; 9909 /* Start out big enough for 2 separate code points */ 9910 SV* invlist = _new_invlist(4); 9911 9912 PERL_ARGS_ASSERT__MAKE_EXACTF_INVLIST; 9913 9914 if (! UTF) { 9915 uc = *s; 9916 9917 /* We punt and assume can match anything if the node begins 9918 * with a multi-character fold. Things are complicated. For 9919 * example, /ffi/i could match any of: 9920 * "\N{LATIN SMALL LIGATURE FFI}" 9921 * "\N{LATIN SMALL LIGATURE FF}I" 9922 * "F\N{LATIN SMALL LIGATURE FI}" 9923 * plus several other things; and making sure we have all the 9924 * possibilities is hard. */ 9925 if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + bytelen)) { 9926 invlist = _add_range_to_invlist(invlist, 0, UV_MAX); 9927 } 9928 else { 9929 /* Any Latin1 range character can potentially match any 9930 * other depending on the locale */ 9931 if (OP(node) == EXACTFL) { 9932 _invlist_union(invlist, PL_Latin1, &invlist); 9933 } 9934 else { 9935 /* But otherwise, it matches at least itself. We can 9936 * quickly tell if it has a distinct fold, and if so, 9937 * it matches that as well */ 9938 invlist = add_cp_to_invlist(invlist, uc); 9939 if (IS_IN_SOME_FOLD_L1(uc)) 9940 invlist = add_cp_to_invlist(invlist, PL_fold_latin1[uc]); 9941 } 9942 9943 /* Some characters match above-Latin1 ones under /i. This 9944 * is true of EXACTFL ones when the locale is UTF-8 */ 9945 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc) 9946 && (! isASCII(uc) || (OP(node) != EXACTFA 9947 && OP(node) != EXACTFA_NO_TRIE))) 9948 { 9949 add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist); 9950 } 9951 } 9952 } 9953 else { /* Pattern is UTF-8 */ 9954 U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' }; 9955 STRLEN foldlen = UTF8SKIP(s); 9956 const U8* e = s + bytelen; 9957 SV** listp; 9958 9959 uc = utf8_to_uvchr_buf(s, s + bytelen, NULL); 9960 9961 /* The only code points that aren't folded in a UTF EXACTFish 9962 * node are are the problematic ones in EXACTFL nodes */ 9963 if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) { 9964 /* We need to check for the possibility that this EXACTFL 9965 * node begins with a multi-char fold. Therefore we fold 9966 * the first few characters of it so that we can make that 9967 * check */ 9968 U8 *d = folded; 9969 int i; 9970 9971 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) { 9972 if (isASCII(*s)) { 9973 *(d++) = (U8) toFOLD(*s); 9974 s++; 9975 } 9976 else { 9977 STRLEN len; 9978 to_utf8_fold(s, d, &len); 9979 d += len; 9980 s += UTF8SKIP(s); 9981 } 9982 } 9983 9984 /* And set up so the code below that looks in this folded 9985 * buffer instead of the node's string */ 9986 e = d; 9987 foldlen = UTF8SKIP(folded); 9988 s = folded; 9989 } 9990 9991 /* When we reach here 's' points to the fold of the first 9992 * character(s) of the node; and 'e' points to far enough along 9993 * the folded string to be just past any possible multi-char 9994 * fold. 'foldlen' is the length in bytes of the first 9995 * character in 's' 9996 * 9997 * Unlike the non-UTF-8 case, the macro for determining if a 9998 * string is a multi-char fold requires all the characters to 9999 * already be folded. This is because of all the complications 10000 * if not. Note that they are folded anyway, except in EXACTFL 10001 * nodes. Like the non-UTF case above, we punt if the node 10002 * begins with a multi-char fold */ 10003 10004 if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) { 10005 invlist = _add_range_to_invlist(invlist, 0, UV_MAX); 10006 } 10007 else { /* Single char fold */ 10008 10009 /* It matches all the things that fold to it, which are 10010 * found in PL_utf8_foldclosures (including itself) */ 10011 invlist = add_cp_to_invlist(invlist, uc); 10012 if (! PL_utf8_foldclosures) 10013 _load_PL_utf8_foldclosures(); 10014 if ((listp = hv_fetch(PL_utf8_foldclosures, 10015 (char *) s, foldlen, FALSE))) 10016 { 10017 AV* list = (AV*) *listp; 10018 IV k; 10019 for (k = 0; k <= av_tindex_nomg(list); k++) { 10020 SV** c_p = av_fetch(list, k, FALSE); 10021 UV c; 10022 assert(c_p); 10023 10024 c = SvUV(*c_p); 10025 10026 /* /aa doesn't allow folds between ASCII and non- */ 10027 if ((OP(node) == EXACTFA || OP(node) == EXACTFA_NO_TRIE) 10028 && isASCII(c) != isASCII(uc)) 10029 { 10030 continue; 10031 } 10032 10033 invlist = add_cp_to_invlist(invlist, c); 10034 } 10035 } 10036 } 10037 } 10038 10039 return invlist; 10040 } 10041 10042 #undef HEADER_LENGTH 10043 #undef TO_INTERNAL_SIZE 10044 #undef FROM_INTERNAL_SIZE 10045 #undef INVLIST_VERSION_ID 10046 10047 /* End of inversion list object */ 10048 10049 STATIC void 10050 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state) 10051 { 10052 /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)' 10053 * constructs, and updates RExC_flags with them. On input, RExC_parse 10054 * should point to the first flag; it is updated on output to point to the 10055 * final ')' or ':'. There needs to be at least one flag, or this will 10056 * abort */ 10057 10058 /* for (?g), (?gc), and (?o) warnings; warning 10059 about (?c) will warn about (?g) -- japhy */ 10060 10061 #define WASTED_O 0x01 10062 #define WASTED_G 0x02 10063 #define WASTED_C 0x04 10064 #define WASTED_GC (WASTED_G|WASTED_C) 10065 I32 wastedflags = 0x00; 10066 U32 posflags = 0, negflags = 0; 10067 U32 *flagsp = &posflags; 10068 char has_charset_modifier = '\0'; 10069 regex_charset cs; 10070 bool has_use_defaults = FALSE; 10071 const char* const seqstart = RExC_parse - 1; /* Point to the '?' */ 10072 int x_mod_count = 0; 10073 10074 PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS; 10075 10076 /* '^' as an initial flag sets certain defaults */ 10077 if (UCHARAT(RExC_parse) == '^') { 10078 RExC_parse++; 10079 has_use_defaults = TRUE; 10080 STD_PMMOD_FLAGS_CLEAR(&RExC_flags); 10081 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics) 10082 ? REGEX_UNICODE_CHARSET 10083 : REGEX_DEPENDS_CHARSET); 10084 } 10085 10086 cs = get_regex_charset(RExC_flags); 10087 if (cs == REGEX_DEPENDS_CHARSET 10088 && (RExC_utf8 || RExC_uni_semantics)) 10089 { 10090 cs = REGEX_UNICODE_CHARSET; 10091 } 10092 10093 while (RExC_parse < RExC_end) { 10094 /* && strchr("iogcmsx", *RExC_parse) */ 10095 /* (?g), (?gc) and (?o) are useless here 10096 and must be globally applied -- japhy */ 10097 switch (*RExC_parse) { 10098 10099 /* Code for the imsxn flags */ 10100 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count); 10101 10102 case LOCALE_PAT_MOD: 10103 if (has_charset_modifier) { 10104 goto excess_modifier; 10105 } 10106 else if (flagsp == &negflags) { 10107 goto neg_modifier; 10108 } 10109 cs = REGEX_LOCALE_CHARSET; 10110 has_charset_modifier = LOCALE_PAT_MOD; 10111 break; 10112 case UNICODE_PAT_MOD: 10113 if (has_charset_modifier) { 10114 goto excess_modifier; 10115 } 10116 else if (flagsp == &negflags) { 10117 goto neg_modifier; 10118 } 10119 cs = REGEX_UNICODE_CHARSET; 10120 has_charset_modifier = UNICODE_PAT_MOD; 10121 break; 10122 case ASCII_RESTRICT_PAT_MOD: 10123 if (flagsp == &negflags) { 10124 goto neg_modifier; 10125 } 10126 if (has_charset_modifier) { 10127 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) { 10128 goto excess_modifier; 10129 } 10130 /* Doubled modifier implies more restricted */ 10131 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET; 10132 } 10133 else { 10134 cs = REGEX_ASCII_RESTRICTED_CHARSET; 10135 } 10136 has_charset_modifier = ASCII_RESTRICT_PAT_MOD; 10137 break; 10138 case DEPENDS_PAT_MOD: 10139 if (has_use_defaults) { 10140 goto fail_modifiers; 10141 } 10142 else if (flagsp == &negflags) { 10143 goto neg_modifier; 10144 } 10145 else if (has_charset_modifier) { 10146 goto excess_modifier; 10147 } 10148 10149 /* The dual charset means unicode semantics if the 10150 * pattern (or target, not known until runtime) are 10151 * utf8, or something in the pattern indicates unicode 10152 * semantics */ 10153 cs = (RExC_utf8 || RExC_uni_semantics) 10154 ? REGEX_UNICODE_CHARSET 10155 : REGEX_DEPENDS_CHARSET; 10156 has_charset_modifier = DEPENDS_PAT_MOD; 10157 break; 10158 excess_modifier: 10159 RExC_parse++; 10160 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) { 10161 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD); 10162 } 10163 else if (has_charset_modifier == *(RExC_parse - 1)) { 10164 vFAIL2("Regexp modifier \"%c\" may not appear twice", 10165 *(RExC_parse - 1)); 10166 } 10167 else { 10168 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1)); 10169 } 10170 NOT_REACHED; /*NOTREACHED*/ 10171 neg_modifier: 10172 RExC_parse++; 10173 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", 10174 *(RExC_parse - 1)); 10175 NOT_REACHED; /*NOTREACHED*/ 10176 case ONCE_PAT_MOD: /* 'o' */ 10177 case GLOBAL_PAT_MOD: /* 'g' */ 10178 if (PASS2 && ckWARN(WARN_REGEXP)) { 10179 const I32 wflagbit = *RExC_parse == 'o' 10180 ? WASTED_O 10181 : WASTED_G; 10182 if (! (wastedflags & wflagbit) ) { 10183 wastedflags |= wflagbit; 10184 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */ 10185 vWARN5( 10186 RExC_parse + 1, 10187 "Useless (%s%c) - %suse /%c modifier", 10188 flagsp == &negflags ? "?-" : "?", 10189 *RExC_parse, 10190 flagsp == &negflags ? "don't " : "", 10191 *RExC_parse 10192 ); 10193 } 10194 } 10195 break; 10196 10197 case CONTINUE_PAT_MOD: /* 'c' */ 10198 if (PASS2 && ckWARN(WARN_REGEXP)) { 10199 if (! (wastedflags & WASTED_C) ) { 10200 wastedflags |= WASTED_GC; 10201 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */ 10202 vWARN3( 10203 RExC_parse + 1, 10204 "Useless (%sc) - %suse /gc modifier", 10205 flagsp == &negflags ? "?-" : "?", 10206 flagsp == &negflags ? "don't " : "" 10207 ); 10208 } 10209 } 10210 break; 10211 case KEEPCOPY_PAT_MOD: /* 'p' */ 10212 if (flagsp == &negflags) { 10213 if (PASS2) 10214 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)"); 10215 } else { 10216 *flagsp |= RXf_PMf_KEEPCOPY; 10217 } 10218 break; 10219 case '-': 10220 /* A flag is a default iff it is following a minus, so 10221 * if there is a minus, it means will be trying to 10222 * re-specify a default which is an error */ 10223 if (has_use_defaults || flagsp == &negflags) { 10224 goto fail_modifiers; 10225 } 10226 flagsp = &negflags; 10227 wastedflags = 0; /* reset so (?g-c) warns twice */ 10228 break; 10229 case ':': 10230 case ')': 10231 RExC_flags |= posflags; 10232 RExC_flags &= ~negflags; 10233 set_regex_charset(&RExC_flags, cs); 10234 if (RExC_flags & RXf_PMf_FOLD) { 10235 RExC_contains_i = 1; 10236 } 10237 if (PASS2) { 10238 STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count); 10239 } 10240 return; 10241 /*NOTREACHED*/ 10242 default: 10243 fail_modifiers: 10244 RExC_parse += SKIP_IF_CHAR(RExC_parse); 10245 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */ 10246 vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized", 10247 UTF8fARG(UTF, RExC_parse-seqstart, seqstart)); 10248 NOT_REACHED; /*NOTREACHED*/ 10249 } 10250 10251 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; 10252 } 10253 10254 vFAIL("Sequence (?... not terminated"); 10255 } 10256 10257 /* 10258 - reg - regular expression, i.e. main body or parenthesized thing 10259 * 10260 * Caller must absorb opening parenthesis. 10261 * 10262 * Combining parenthesis handling with the base level of regular expression 10263 * is a trifle forced, but the need to tie the tails of the branches to what 10264 * follows makes it hard to avoid. 10265 */ 10266 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1) 10267 #ifdef DEBUGGING 10268 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1) 10269 #else 10270 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1) 10271 #endif 10272 10273 PERL_STATIC_INLINE regnode * 10274 S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state, 10275 I32 *flagp, 10276 char * parse_start, 10277 char ch 10278 ) 10279 { 10280 regnode *ret; 10281 char* name_start = RExC_parse; 10282 U32 num = 0; 10283 SV *sv_dat = reg_scan_name(pRExC_state, SIZE_ONLY 10284 ? REG_RSN_RETURN_NULL 10285 : REG_RSN_RETURN_DATA); 10286 GET_RE_DEBUG_FLAGS_DECL; 10287 10288 PERL_ARGS_ASSERT_HANDLE_NAMED_BACKREF; 10289 10290 if (RExC_parse == name_start || *RExC_parse != ch) { 10291 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */ 10292 vFAIL2("Sequence %.3s... not terminated",parse_start); 10293 } 10294 10295 if (!SIZE_ONLY) { 10296 num = add_data( pRExC_state, STR_WITH_LEN("S")); 10297 RExC_rxi->data->data[num]=(void*)sv_dat; 10298 SvREFCNT_inc_simple_void(sv_dat); 10299 } 10300 RExC_sawback = 1; 10301 ret = reganode(pRExC_state, 10302 ((! FOLD) 10303 ? NREF 10304 : (ASCII_FOLD_RESTRICTED) 10305 ? NREFFA 10306 : (AT_LEAST_UNI_SEMANTICS) 10307 ? NREFFU 10308 : (LOC) 10309 ? NREFFL 10310 : NREFF), 10311 num); 10312 *flagp |= HASWIDTH; 10313 10314 Set_Node_Offset(ret, parse_start+1); 10315 Set_Node_Cur_Length(ret, parse_start); 10316 10317 nextchar(pRExC_state); 10318 return ret; 10319 } 10320 10321 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets 10322 flags. Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan 10323 needs to be restarted, or'd with NEED_UTF8 if the pattern needs to be 10324 upgraded to UTF-8. Otherwise would only return NULL if regbranch() returns 10325 NULL, which cannot happen. */ 10326 STATIC regnode * 10327 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) 10328 /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter. 10329 * 2 is like 1, but indicates that nextchar() has been called to advance 10330 * RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and 10331 * this flag alerts us to the need to check for that */ 10332 { 10333 regnode *ret; /* Will be the head of the group. */ 10334 regnode *br; 10335 regnode *lastbr; 10336 regnode *ender = NULL; 10337 I32 parno = 0; 10338 I32 flags; 10339 U32 oregflags = RExC_flags; 10340 bool have_branch = 0; 10341 bool is_open = 0; 10342 I32 freeze_paren = 0; 10343 I32 after_freeze = 0; 10344 I32 num; /* numeric backreferences */ 10345 10346 char * parse_start = RExC_parse; /* MJD */ 10347 char * const oregcomp_parse = RExC_parse; 10348 10349 GET_RE_DEBUG_FLAGS_DECL; 10350 10351 PERL_ARGS_ASSERT_REG; 10352 DEBUG_PARSE("reg "); 10353 10354 *flagp = 0; /* Tentatively. */ 10355 10356 /* Having this true makes it feasible to have a lot fewer tests for the 10357 * parse pointer being in scope. For example, we can write 10358 * while(isFOO(*RExC_parse)) RExC_parse++; 10359 * instead of 10360 * while(RExC_parse < RExC_end && isFOO(*RExC_parse)) RExC_parse++; 10361 */ 10362 assert(*RExC_end == '\0'); 10363 10364 /* Make an OPEN node, if parenthesized. */ 10365 if (paren) { 10366 10367 /* Under /x, space and comments can be gobbled up between the '(' and 10368 * here (if paren ==2). The forms '(*VERB' and '(?...' disallow such 10369 * intervening space, as the sequence is a token, and a token should be 10370 * indivisible */ 10371 bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '('; 10372 10373 if (RExC_parse >= RExC_end) { 10374 vFAIL("Unmatched ("); 10375 } 10376 10377 if ( *RExC_parse == '*') { /* (*VERB:ARG) */ 10378 char *start_verb = RExC_parse + 1; 10379 STRLEN verb_len; 10380 char *start_arg = NULL; 10381 unsigned char op = 0; 10382 int arg_required = 0; 10383 int internal_argval = -1; /* if >-1 we are not allowed an argument*/ 10384 10385 if (has_intervening_patws) { 10386 RExC_parse++; /* past the '*' */ 10387 vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent"); 10388 } 10389 while (RExC_parse < RExC_end && *RExC_parse != ')' ) { 10390 if ( *RExC_parse == ':' ) { 10391 start_arg = RExC_parse + 1; 10392 break; 10393 } 10394 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; 10395 } 10396 verb_len = RExC_parse - start_verb; 10397 if ( start_arg ) { 10398 if (RExC_parse >= RExC_end) { 10399 goto unterminated_verb_pattern; 10400 } 10401 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; 10402 while ( RExC_parse < RExC_end && *RExC_parse != ')' ) 10403 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; 10404 if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) 10405 unterminated_verb_pattern: 10406 vFAIL("Unterminated verb pattern argument"); 10407 if ( RExC_parse == start_arg ) 10408 start_arg = NULL; 10409 } else { 10410 if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) 10411 vFAIL("Unterminated verb pattern"); 10412 } 10413 10414 /* Here, we know that RExC_parse < RExC_end */ 10415 10416 switch ( *start_verb ) { 10417 case 'A': /* (*ACCEPT) */ 10418 if ( memEQs(start_verb,verb_len,"ACCEPT") ) { 10419 op = ACCEPT; 10420 internal_argval = RExC_nestroot; 10421 } 10422 break; 10423 case 'C': /* (*COMMIT) */ 10424 if ( memEQs(start_verb,verb_len,"COMMIT") ) 10425 op = COMMIT; 10426 break; 10427 case 'F': /* (*FAIL) */ 10428 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) { 10429 op = OPFAIL; 10430 } 10431 break; 10432 case ':': /* (*:NAME) */ 10433 case 'M': /* (*MARK:NAME) */ 10434 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) { 10435 op = MARKPOINT; 10436 arg_required = 1; 10437 } 10438 break; 10439 case 'P': /* (*PRUNE) */ 10440 if ( memEQs(start_verb,verb_len,"PRUNE") ) 10441 op = PRUNE; 10442 break; 10443 case 'S': /* (*SKIP) */ 10444 if ( memEQs(start_verb,verb_len,"SKIP") ) 10445 op = SKIP; 10446 break; 10447 case 'T': /* (*THEN) */ 10448 /* [19:06] <TimToady> :: is then */ 10449 if ( memEQs(start_verb,verb_len,"THEN") ) { 10450 op = CUTGROUP; 10451 RExC_seen |= REG_CUTGROUP_SEEN; 10452 } 10453 break; 10454 } 10455 if ( ! op ) { 10456 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; 10457 vFAIL2utf8f( 10458 "Unknown verb pattern '%"UTF8f"'", 10459 UTF8fARG(UTF, verb_len, start_verb)); 10460 } 10461 if ( arg_required && !start_arg ) { 10462 vFAIL3("Verb pattern '%.*s' has a mandatory argument", 10463 verb_len, start_verb); 10464 } 10465 if (internal_argval == -1) { 10466 ret = reganode(pRExC_state, op, 0); 10467 } else { 10468 ret = reg2Lanode(pRExC_state, op, 0, internal_argval); 10469 } 10470 RExC_seen |= REG_VERBARG_SEEN; 10471 if ( ! SIZE_ONLY ) { 10472 if (start_arg) { 10473 SV *sv = newSVpvn( start_arg, 10474 RExC_parse - start_arg); 10475 ARG(ret) = add_data( pRExC_state, 10476 STR_WITH_LEN("S")); 10477 RExC_rxi->data->data[ARG(ret)]=(void*)sv; 10478 ret->flags = 1; 10479 } else { 10480 ret->flags = 0; 10481 } 10482 if ( internal_argval != -1 ) 10483 ARG2L_SET(ret, internal_argval); 10484 } 10485 nextchar(pRExC_state); 10486 return ret; 10487 } 10488 else if (*RExC_parse == '?') { /* (?...) */ 10489 bool is_logical = 0; 10490 const char * const seqstart = RExC_parse; 10491 const char * endptr; 10492 if (has_intervening_patws) { 10493 RExC_parse++; 10494 vFAIL("In '(?...)', the '(' and '?' must be adjacent"); 10495 } 10496 10497 RExC_parse++; /* past the '?' */ 10498 paren = *RExC_parse; /* might be a trailing NUL, if not 10499 well-formed */ 10500 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; 10501 if (RExC_parse > RExC_end) { 10502 paren = '\0'; 10503 } 10504 ret = NULL; /* For look-ahead/behind. */ 10505 switch (paren) { 10506 10507 case 'P': /* (?P...) variants for those used to PCRE/Python */ 10508 paren = *RExC_parse; 10509 if ( paren == '<') { /* (?P<...>) named capture */ 10510 RExC_parse++; 10511 if (RExC_parse >= RExC_end) { 10512 vFAIL("Sequence (?P<... not terminated"); 10513 } 10514 goto named_capture; 10515 } 10516 else if (paren == '>') { /* (?P>name) named recursion */ 10517 RExC_parse++; 10518 if (RExC_parse >= RExC_end) { 10519 vFAIL("Sequence (?P>... not terminated"); 10520 } 10521 goto named_recursion; 10522 } 10523 else if (paren == '=') { /* (?P=...) named backref */ 10524 RExC_parse++; 10525 return handle_named_backref(pRExC_state, flagp, 10526 parse_start, ')'); 10527 } 10528 RExC_parse += SKIP_IF_CHAR(RExC_parse); 10529 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */ 10530 vFAIL3("Sequence (%.*s...) not recognized", 10531 RExC_parse-seqstart, seqstart); 10532 NOT_REACHED; /*NOTREACHED*/ 10533 case '<': /* (?<...) */ 10534 if (*RExC_parse == '!') 10535 paren = ','; 10536 else if (*RExC_parse != '=') 10537 named_capture: 10538 { /* (?<...>) */ 10539 char *name_start; 10540 SV *svname; 10541 paren= '>'; 10542 /* FALLTHROUGH */ 10543 case '\'': /* (?'...') */ 10544 name_start = RExC_parse; 10545 svname = reg_scan_name(pRExC_state, 10546 SIZE_ONLY /* reverse test from the others */ 10547 ? REG_RSN_RETURN_NAME 10548 : REG_RSN_RETURN_NULL); 10549 if ( RExC_parse == name_start 10550 || RExC_parse >= RExC_end 10551 || *RExC_parse != paren) 10552 { 10553 vFAIL2("Sequence (?%c... not terminated", 10554 paren=='>' ? '<' : paren); 10555 } 10556 if (SIZE_ONLY) { 10557 HE *he_str; 10558 SV *sv_dat = NULL; 10559 if (!svname) /* shouldn't happen */ 10560 Perl_croak(aTHX_ 10561 "panic: reg_scan_name returned NULL"); 10562 if (!RExC_paren_names) { 10563 RExC_paren_names= newHV(); 10564 sv_2mortal(MUTABLE_SV(RExC_paren_names)); 10565 #ifdef DEBUGGING 10566 RExC_paren_name_list= newAV(); 10567 sv_2mortal(MUTABLE_SV(RExC_paren_name_list)); 10568 #endif 10569 } 10570 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 ); 10571 if ( he_str ) 10572 sv_dat = HeVAL(he_str); 10573 if ( ! sv_dat ) { 10574 /* croak baby croak */ 10575 Perl_croak(aTHX_ 10576 "panic: paren_name hash element allocation failed"); 10577 } else if ( SvPOK(sv_dat) ) { 10578 /* (?|...) can mean we have dupes so scan to check 10579 its already been stored. Maybe a flag indicating 10580 we are inside such a construct would be useful, 10581 but the arrays are likely to be quite small, so 10582 for now we punt -- dmq */ 10583 IV count = SvIV(sv_dat); 10584 I32 *pv = (I32*)SvPVX(sv_dat); 10585 IV i; 10586 for ( i = 0 ; i < count ; i++ ) { 10587 if ( pv[i] == RExC_npar ) { 10588 count = 0; 10589 break; 10590 } 10591 } 10592 if ( count ) { 10593 pv = (I32*)SvGROW(sv_dat, 10594 SvCUR(sv_dat) + sizeof(I32)+1); 10595 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32)); 10596 pv[count] = RExC_npar; 10597 SvIV_set(sv_dat, SvIVX(sv_dat) + 1); 10598 } 10599 } else { 10600 (void)SvUPGRADE(sv_dat,SVt_PVNV); 10601 sv_setpvn(sv_dat, (char *)&(RExC_npar), 10602 sizeof(I32)); 10603 SvIOK_on(sv_dat); 10604 SvIV_set(sv_dat, 1); 10605 } 10606 #ifdef DEBUGGING 10607 /* Yes this does cause a memory leak in debugging Perls 10608 * */ 10609 if (!av_store(RExC_paren_name_list, 10610 RExC_npar, SvREFCNT_inc(svname))) 10611 SvREFCNT_dec_NN(svname); 10612 #endif 10613 10614 /*sv_dump(sv_dat);*/ 10615 } 10616 nextchar(pRExC_state); 10617 paren = 1; 10618 goto capturing_parens; 10619 } 10620 RExC_seen |= REG_LOOKBEHIND_SEEN; 10621 RExC_in_lookbehind++; 10622 RExC_parse++; 10623 if (RExC_parse >= RExC_end) { 10624 vFAIL("Sequence (?... not terminated"); 10625 } 10626 10627 /* FALLTHROUGH */ 10628 case '=': /* (?=...) */ 10629 RExC_seen_zerolen++; 10630 break; 10631 case '!': /* (?!...) */ 10632 RExC_seen_zerolen++; 10633 /* check if we're really just a "FAIL" assertion */ 10634 skip_to_be_ignored_text(pRExC_state, &RExC_parse, 10635 FALSE /* Don't force to /x */ ); 10636 if (*RExC_parse == ')') { 10637 ret=reganode(pRExC_state, OPFAIL, 0); 10638 nextchar(pRExC_state); 10639 return ret; 10640 } 10641 break; 10642 case '|': /* (?|...) */ 10643 /* branch reset, behave like a (?:...) except that 10644 buffers in alternations share the same numbers */ 10645 paren = ':'; 10646 after_freeze = freeze_paren = RExC_npar; 10647 break; 10648 case ':': /* (?:...) */ 10649 case '>': /* (?>...) */ 10650 break; 10651 case '$': /* (?$...) */ 10652 case '@': /* (?@...) */ 10653 vFAIL2("Sequence (?%c...) not implemented", (int)paren); 10654 break; 10655 case '0' : /* (?0) */ 10656 case 'R' : /* (?R) */ 10657 if (RExC_parse == RExC_end || *RExC_parse != ')') 10658 FAIL("Sequence (?R) not terminated"); 10659 num = 0; 10660 RExC_seen |= REG_RECURSE_SEEN; 10661 *flagp |= POSTPONED; 10662 goto gen_recurse_regop; 10663 /*notreached*/ 10664 /* named and numeric backreferences */ 10665 case '&': /* (?&NAME) */ 10666 parse_start = RExC_parse - 1; 10667 named_recursion: 10668 { 10669 SV *sv_dat = reg_scan_name(pRExC_state, 10670 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); 10671 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0; 10672 } 10673 if (RExC_parse >= RExC_end || *RExC_parse != ')') 10674 vFAIL("Sequence (?&... not terminated"); 10675 goto gen_recurse_regop; 10676 /* NOTREACHED */ 10677 case '+': 10678 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) { 10679 RExC_parse++; 10680 vFAIL("Illegal pattern"); 10681 } 10682 goto parse_recursion; 10683 /* NOTREACHED*/ 10684 case '-': /* (?-1) */ 10685 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) { 10686 RExC_parse--; /* rewind to let it be handled later */ 10687 goto parse_flags; 10688 } 10689 /* FALLTHROUGH */ 10690 case '1': case '2': case '3': case '4': /* (?1) */ 10691 case '5': case '6': case '7': case '8': case '9': 10692 RExC_parse = (char *) seqstart + 1; /* Point to the digit */ 10693 parse_recursion: 10694 { 10695 bool is_neg = FALSE; 10696 UV unum; 10697 parse_start = RExC_parse - 1; /* MJD */ 10698 if (*RExC_parse == '-') { 10699 RExC_parse++; 10700 is_neg = TRUE; 10701 } 10702 if (grok_atoUV(RExC_parse, &unum, &endptr) 10703 && unum <= I32_MAX 10704 ) { 10705 num = (I32)unum; 10706 RExC_parse = (char*)endptr; 10707 } else 10708 num = I32_MAX; 10709 if (is_neg) { 10710 /* Some limit for num? */ 10711 num = -num; 10712 } 10713 } 10714 if (*RExC_parse!=')') 10715 vFAIL("Expecting close bracket"); 10716 10717 gen_recurse_regop: 10718 if ( paren == '-' ) { 10719 /* 10720 Diagram of capture buffer numbering. 10721 Top line is the normal capture buffer numbers 10722 Bottom line is the negative indexing as from 10723 the X (the (?-2)) 10724 10725 + 1 2 3 4 5 X 6 7 10726 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/ 10727 - 5 4 3 2 1 X x x 10728 10729 */ 10730 num = RExC_npar + num; 10731 if (num < 1) { 10732 RExC_parse++; 10733 vFAIL("Reference to nonexistent group"); 10734 } 10735 } else if ( paren == '+' ) { 10736 num = RExC_npar + num - 1; 10737 } 10738 /* We keep track how many GOSUB items we have produced. 10739 To start off the ARG2L() of the GOSUB holds its "id", 10740 which is used later in conjunction with RExC_recurse 10741 to calculate the offset we need to jump for the GOSUB, 10742 which it will store in the final representation. 10743 We have to defer the actual calculation until much later 10744 as the regop may move. 10745 */ 10746 10747 ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count); 10748 if (!SIZE_ONLY) { 10749 if (num > (I32)RExC_rx->nparens) { 10750 RExC_parse++; 10751 vFAIL("Reference to nonexistent group"); 10752 } 10753 RExC_recurse_count++; 10754 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_ 10755 "%*s%*s Recurse #%"UVuf" to %"IVdf"\n", 10756 22, "| |", (int)(depth * 2 + 1), "", 10757 (UV)ARG(ret), (IV)ARG2L(ret))); 10758 } 10759 RExC_seen |= REG_RECURSE_SEEN; 10760 10761 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */ 10762 Set_Node_Offset(ret, parse_start); /* MJD */ 10763 10764 *flagp |= POSTPONED; 10765 assert(*RExC_parse == ')'); 10766 nextchar(pRExC_state); 10767 return ret; 10768 10769 /* NOTREACHED */ 10770 10771 case '?': /* (??...) */ 10772 is_logical = 1; 10773 if (*RExC_parse != '{') { 10774 RExC_parse += SKIP_IF_CHAR(RExC_parse); 10775 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */ 10776 vFAIL2utf8f( 10777 "Sequence (%"UTF8f"...) not recognized", 10778 UTF8fARG(UTF, RExC_parse-seqstart, seqstart)); 10779 NOT_REACHED; /*NOTREACHED*/ 10780 } 10781 *flagp |= POSTPONED; 10782 paren = '{'; 10783 RExC_parse++; 10784 /* FALLTHROUGH */ 10785 case '{': /* (?{...}) */ 10786 { 10787 U32 n = 0; 10788 struct reg_code_block *cb; 10789 10790 RExC_seen_zerolen++; 10791 10792 if ( !pRExC_state->num_code_blocks 10793 || pRExC_state->code_index >= pRExC_state->num_code_blocks 10794 || pRExC_state->code_blocks[pRExC_state->code_index].start 10795 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0)) 10796 - RExC_start) 10797 ) { 10798 if (RExC_pm_flags & PMf_USE_RE_EVAL) 10799 FAIL("panic: Sequence (?{...}): no code block found\n"); 10800 FAIL("Eval-group not allowed at runtime, use re 'eval'"); 10801 } 10802 /* this is a pre-compiled code block (?{...}) */ 10803 cb = &pRExC_state->code_blocks[pRExC_state->code_index]; 10804 RExC_parse = RExC_start + cb->end; 10805 if (!SIZE_ONLY) { 10806 OP *o = cb->block; 10807 if (cb->src_regex) { 10808 n = add_data(pRExC_state, STR_WITH_LEN("rl")); 10809 RExC_rxi->data->data[n] = 10810 (void*)SvREFCNT_inc((SV*)cb->src_regex); 10811 RExC_rxi->data->data[n+1] = (void*)o; 10812 } 10813 else { 10814 n = add_data(pRExC_state, 10815 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1); 10816 RExC_rxi->data->data[n] = (void*)o; 10817 } 10818 } 10819 pRExC_state->code_index++; 10820 nextchar(pRExC_state); 10821 10822 if (is_logical) { 10823 regnode *eval; 10824 ret = reg_node(pRExC_state, LOGICAL); 10825 10826 eval = reg2Lanode(pRExC_state, EVAL, 10827 n, 10828 10829 /* for later propagation into (??{}) 10830 * return value */ 10831 RExC_flags & RXf_PMf_COMPILETIME 10832 ); 10833 if (!SIZE_ONLY) { 10834 ret->flags = 2; 10835 } 10836 REGTAIL(pRExC_state, ret, eval); 10837 /* deal with the length of this later - MJD */ 10838 return ret; 10839 } 10840 ret = reg2Lanode(pRExC_state, EVAL, n, 0); 10841 Set_Node_Length(ret, RExC_parse - parse_start + 1); 10842 Set_Node_Offset(ret, parse_start); 10843 return ret; 10844 } 10845 case '(': /* (?(?{...})...) and (?(?=...)...) */ 10846 { 10847 int is_define= 0; 10848 const int DEFINE_len = sizeof("DEFINE") - 1; 10849 if (RExC_parse[0] == '?') { /* (?(?...)) */ 10850 if ( RExC_parse < RExC_end - 1 10851 && ( RExC_parse[1] == '=' 10852 || RExC_parse[1] == '!' 10853 || RExC_parse[1] == '<' 10854 || RExC_parse[1] == '{') 10855 ) { /* Lookahead or eval. */ 10856 I32 flag; 10857 regnode *tail; 10858 10859 ret = reg_node(pRExC_state, LOGICAL); 10860 if (!SIZE_ONLY) 10861 ret->flags = 1; 10862 10863 tail = reg(pRExC_state, 1, &flag, depth+1); 10864 if (flag & (RESTART_PASS1|NEED_UTF8)) { 10865 *flagp = flag & (RESTART_PASS1|NEED_UTF8); 10866 return NULL; 10867 } 10868 REGTAIL(pRExC_state, ret, tail); 10869 goto insert_if; 10870 } 10871 /* Fall through to ‘Unknown switch condition’ at the 10872 end of the if/else chain. */ 10873 } 10874 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */ 10875 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */ 10876 { 10877 char ch = RExC_parse[0] == '<' ? '>' : '\''; 10878 char *name_start= RExC_parse++; 10879 U32 num = 0; 10880 SV *sv_dat=reg_scan_name(pRExC_state, 10881 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); 10882 if ( RExC_parse == name_start 10883 || RExC_parse >= RExC_end 10884 || *RExC_parse != ch) 10885 { 10886 vFAIL2("Sequence (?(%c... not terminated", 10887 (ch == '>' ? '<' : ch)); 10888 } 10889 RExC_parse++; 10890 if (!SIZE_ONLY) { 10891 num = add_data( pRExC_state, STR_WITH_LEN("S")); 10892 RExC_rxi->data->data[num]=(void*)sv_dat; 10893 SvREFCNT_inc_simple_void(sv_dat); 10894 } 10895 ret = reganode(pRExC_state,NGROUPP,num); 10896 goto insert_if_check_paren; 10897 } 10898 else if (RExC_end - RExC_parse >= DEFINE_len 10899 && strnEQ(RExC_parse, "DEFINE", DEFINE_len)) 10900 { 10901 ret = reganode(pRExC_state,DEFINEP,0); 10902 RExC_parse += DEFINE_len; 10903 is_define = 1; 10904 goto insert_if_check_paren; 10905 } 10906 else if (RExC_parse[0] == 'R') { 10907 RExC_parse++; 10908 /* parno == 0 => /(?(R)YES|NO)/ "in any form of recursion OR eval" 10909 * parno == 1 => /(?(R0)YES|NO)/ "in GOSUB (?0) / (?R)" 10910 * parno == 2 => /(?(R1)YES|NO)/ "in GOSUB (?1) (parno-1)" 10911 */ 10912 parno = 0; 10913 if (RExC_parse[0] == '0') { 10914 parno = 1; 10915 RExC_parse++; 10916 } 10917 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) { 10918 UV uv; 10919 if (grok_atoUV(RExC_parse, &uv, &endptr) 10920 && uv <= I32_MAX 10921 ) { 10922 parno = (I32)uv + 1; 10923 RExC_parse = (char*)endptr; 10924 } 10925 /* else "Switch condition not recognized" below */ 10926 } else if (RExC_parse[0] == '&') { 10927 SV *sv_dat; 10928 RExC_parse++; 10929 sv_dat = reg_scan_name(pRExC_state, 10930 SIZE_ONLY 10931 ? REG_RSN_RETURN_NULL 10932 : REG_RSN_RETURN_DATA); 10933 10934 /* we should only have a false sv_dat when 10935 * SIZE_ONLY is true, and we always have false 10936 * sv_dat when SIZE_ONLY is true. 10937 * reg_scan_name() will VFAIL() if the name is 10938 * unknown when SIZE_ONLY is false, and otherwise 10939 * will return something, and when SIZE_ONLY is 10940 * true, reg_scan_name() just parses the string, 10941 * and doesnt return anything. (in theory) */ 10942 assert(SIZE_ONLY ? !sv_dat : !!sv_dat); 10943 10944 if (sv_dat) 10945 parno = 1 + *((I32 *)SvPVX(sv_dat)); 10946 } 10947 ret = reganode(pRExC_state,INSUBP,parno); 10948 goto insert_if_check_paren; 10949 } 10950 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) { 10951 /* (?(1)...) */ 10952 char c; 10953 UV uv; 10954 if (grok_atoUV(RExC_parse, &uv, &endptr) 10955 && uv <= I32_MAX 10956 ) { 10957 parno = (I32)uv; 10958 RExC_parse = (char*)endptr; 10959 } 10960 else { 10961 vFAIL("panic: grok_atoUV returned FALSE"); 10962 } 10963 ret = reganode(pRExC_state, GROUPP, parno); 10964 10965 insert_if_check_paren: 10966 if (UCHARAT(RExC_parse) != ')') { 10967 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; 10968 vFAIL("Switch condition not recognized"); 10969 } 10970 nextchar(pRExC_state); 10971 insert_if: 10972 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0)); 10973 br = regbranch(pRExC_state, &flags, 1,depth+1); 10974 if (br == NULL) { 10975 if (flags & (RESTART_PASS1|NEED_UTF8)) { 10976 *flagp = flags & (RESTART_PASS1|NEED_UTF8); 10977 return NULL; 10978 } 10979 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", 10980 (UV) flags); 10981 } else 10982 REGTAIL(pRExC_state, br, reganode(pRExC_state, 10983 LONGJMP, 0)); 10984 c = UCHARAT(RExC_parse); 10985 nextchar(pRExC_state); 10986 if (flags&HASWIDTH) 10987 *flagp |= HASWIDTH; 10988 if (c == '|') { 10989 if (is_define) 10990 vFAIL("(?(DEFINE)....) does not allow branches"); 10991 10992 /* Fake one for optimizer. */ 10993 lastbr = reganode(pRExC_state, IFTHEN, 0); 10994 10995 if (!regbranch(pRExC_state, &flags, 1,depth+1)) { 10996 if (flags & (RESTART_PASS1|NEED_UTF8)) { 10997 *flagp = flags & (RESTART_PASS1|NEED_UTF8); 10998 return NULL; 10999 } 11000 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", 11001 (UV) flags); 11002 } 11003 REGTAIL(pRExC_state, ret, lastbr); 11004 if (flags&HASWIDTH) 11005 *flagp |= HASWIDTH; 11006 c = UCHARAT(RExC_parse); 11007 nextchar(pRExC_state); 11008 } 11009 else 11010 lastbr = NULL; 11011 if (c != ')') { 11012 if (RExC_parse >= RExC_end) 11013 vFAIL("Switch (?(condition)... not terminated"); 11014 else 11015 vFAIL("Switch (?(condition)... contains too many branches"); 11016 } 11017 ender = reg_node(pRExC_state, TAIL); 11018 REGTAIL(pRExC_state, br, ender); 11019 if (lastbr) { 11020 REGTAIL(pRExC_state, lastbr, ender); 11021 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); 11022 } 11023 else 11024 REGTAIL(pRExC_state, ret, ender); 11025 RExC_size++; /* XXX WHY do we need this?!! 11026 For large programs it seems to be required 11027 but I can't figure out why. -- dmq*/ 11028 return ret; 11029 } 11030 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; 11031 vFAIL("Unknown switch condition (?(...))"); 11032 } 11033 case '[': /* (?[ ... ]) */ 11034 return handle_regex_sets(pRExC_state, NULL, flagp, depth, 11035 oregcomp_parse); 11036 case 0: /* A NUL */ 11037 RExC_parse--; /* for vFAIL to print correctly */ 11038 vFAIL("Sequence (? incomplete"); 11039 break; 11040 default: /* e.g., (?i) */ 11041 RExC_parse = (char *) seqstart + 1; 11042 parse_flags: 11043 parse_lparen_question_flags(pRExC_state); 11044 if (UCHARAT(RExC_parse) != ':') { 11045 if (RExC_parse < RExC_end) 11046 nextchar(pRExC_state); 11047 *flagp = TRYAGAIN; 11048 return NULL; 11049 } 11050 paren = ':'; 11051 nextchar(pRExC_state); 11052 ret = NULL; 11053 goto parse_rest; 11054 } /* end switch */ 11055 } 11056 else if (!(RExC_flags & RXf_PMf_NOCAPTURE)) { /* (...) */ 11057 capturing_parens: 11058 parno = RExC_npar; 11059 RExC_npar++; 11060 11061 ret = reganode(pRExC_state, OPEN, parno); 11062 if (!SIZE_ONLY ){ 11063 if (!RExC_nestroot) 11064 RExC_nestroot = parno; 11065 if (RExC_open_parens && !RExC_open_parens[parno]) 11066 { 11067 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_ 11068 "%*s%*s Setting open paren #%"IVdf" to %d\n", 11069 22, "| |", (int)(depth * 2 + 1), "", 11070 (IV)parno, REG_NODE_NUM(ret))); 11071 RExC_open_parens[parno]= ret; 11072 } 11073 } 11074 Set_Node_Length(ret, 1); /* MJD */ 11075 Set_Node_Offset(ret, RExC_parse); /* MJD */ 11076 is_open = 1; 11077 } else { 11078 /* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */ 11079 paren = ':'; 11080 ret = NULL; 11081 } 11082 } 11083 else /* ! paren */ 11084 ret = NULL; 11085 11086 parse_rest: 11087 /* Pick up the branches, linking them together. */ 11088 parse_start = RExC_parse; /* MJD */ 11089 br = regbranch(pRExC_state, &flags, 1,depth+1); 11090 11091 /* branch_len = (paren != 0); */ 11092 11093 if (br == NULL) { 11094 if (flags & (RESTART_PASS1|NEED_UTF8)) { 11095 *flagp = flags & (RESTART_PASS1|NEED_UTF8); 11096 return NULL; 11097 } 11098 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags); 11099 } 11100 if (*RExC_parse == '|') { 11101 if (!SIZE_ONLY && RExC_extralen) { 11102 reginsert(pRExC_state, BRANCHJ, br, depth+1); 11103 } 11104 else { /* MJD */ 11105 reginsert(pRExC_state, BRANCH, br, depth+1); 11106 Set_Node_Length(br, paren != 0); 11107 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start); 11108 } 11109 have_branch = 1; 11110 if (SIZE_ONLY) 11111 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */ 11112 } 11113 else if (paren == ':') { 11114 *flagp |= flags&SIMPLE; 11115 } 11116 if (is_open) { /* Starts with OPEN. */ 11117 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */ 11118 } 11119 else if (paren != '?') /* Not Conditional */ 11120 ret = br; 11121 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED); 11122 lastbr = br; 11123 while (*RExC_parse == '|') { 11124 if (!SIZE_ONLY && RExC_extralen) { 11125 ender = reganode(pRExC_state, LONGJMP,0); 11126 11127 /* Append to the previous. */ 11128 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); 11129 } 11130 if (SIZE_ONLY) 11131 RExC_extralen += 2; /* Account for LONGJMP. */ 11132 nextchar(pRExC_state); 11133 if (freeze_paren) { 11134 if (RExC_npar > after_freeze) 11135 after_freeze = RExC_npar; 11136 RExC_npar = freeze_paren; 11137 } 11138 br = regbranch(pRExC_state, &flags, 0, depth+1); 11139 11140 if (br == NULL) { 11141 if (flags & (RESTART_PASS1|NEED_UTF8)) { 11142 *flagp = flags & (RESTART_PASS1|NEED_UTF8); 11143 return NULL; 11144 } 11145 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags); 11146 } 11147 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */ 11148 lastbr = br; 11149 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED); 11150 } 11151 11152 if (have_branch || paren != ':') { 11153 /* Make a closing node, and hook it on the end. */ 11154 switch (paren) { 11155 case ':': 11156 ender = reg_node(pRExC_state, TAIL); 11157 break; 11158 case 1: case 2: 11159 ender = reganode(pRExC_state, CLOSE, parno); 11160 if ( RExC_close_parens ) { 11161 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_ 11162 "%*s%*s Setting close paren #%"IVdf" to %d\n", 11163 22, "| |", (int)(depth * 2 + 1), "", (IV)parno, REG_NODE_NUM(ender))); 11164 RExC_close_parens[parno]= ender; 11165 if (RExC_nestroot == parno) 11166 RExC_nestroot = 0; 11167 } 11168 Set_Node_Offset(ender,RExC_parse+1); /* MJD */ 11169 Set_Node_Length(ender,1); /* MJD */ 11170 break; 11171 case '<': 11172 case ',': 11173 case '=': 11174 case '!': 11175 *flagp &= ~HASWIDTH; 11176 /* FALLTHROUGH */ 11177 case '>': 11178 ender = reg_node(pRExC_state, SUCCEED); 11179 break; 11180 case 0: 11181 ender = reg_node(pRExC_state, END); 11182 if (!SIZE_ONLY) { 11183 assert(!RExC_end_op); /* there can only be one! */ 11184 RExC_end_op = ender; 11185 if (RExC_close_parens) { 11186 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_ 11187 "%*s%*s Setting close paren #0 (END) to %d\n", 11188 22, "| |", (int)(depth * 2 + 1), "", REG_NODE_NUM(ender))); 11189 11190 RExC_close_parens[0]= ender; 11191 } 11192 } 11193 break; 11194 } 11195 DEBUG_PARSE_r(if (!SIZE_ONLY) { 11196 DEBUG_PARSE_MSG("lsbr"); 11197 regprop(RExC_rx, RExC_mysv1, lastbr, NULL, pRExC_state); 11198 regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state); 11199 Perl_re_printf( aTHX_ "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n", 11200 SvPV_nolen_const(RExC_mysv1), 11201 (IV)REG_NODE_NUM(lastbr), 11202 SvPV_nolen_const(RExC_mysv2), 11203 (IV)REG_NODE_NUM(ender), 11204 (IV)(ender - lastbr) 11205 ); 11206 }); 11207 REGTAIL(pRExC_state, lastbr, ender); 11208 11209 if (have_branch && !SIZE_ONLY) { 11210 char is_nothing= 1; 11211 if (depth==1) 11212 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN; 11213 11214 /* Hook the tails of the branches to the closing node. */ 11215 for (br = ret; br; br = regnext(br)) { 11216 const U8 op = PL_regkind[OP(br)]; 11217 if (op == BRANCH) { 11218 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender); 11219 if ( OP(NEXTOPER(br)) != NOTHING 11220 || regnext(NEXTOPER(br)) != ender) 11221 is_nothing= 0; 11222 } 11223 else if (op == BRANCHJ) { 11224 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender); 11225 /* for now we always disable this optimisation * / 11226 if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING 11227 || regnext(NEXTOPER(NEXTOPER(br))) != ender) 11228 */ 11229 is_nothing= 0; 11230 } 11231 } 11232 if (is_nothing) { 11233 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret; 11234 DEBUG_PARSE_r(if (!SIZE_ONLY) { 11235 DEBUG_PARSE_MSG("NADA"); 11236 regprop(RExC_rx, RExC_mysv1, ret, NULL, pRExC_state); 11237 regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state); 11238 Perl_re_printf( aTHX_ "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n", 11239 SvPV_nolen_const(RExC_mysv1), 11240 (IV)REG_NODE_NUM(ret), 11241 SvPV_nolen_const(RExC_mysv2), 11242 (IV)REG_NODE_NUM(ender), 11243 (IV)(ender - ret) 11244 ); 11245 }); 11246 OP(br)= NOTHING; 11247 if (OP(ender) == TAIL) { 11248 NEXT_OFF(br)= 0; 11249 RExC_emit= br + 1; 11250 } else { 11251 regnode *opt; 11252 for ( opt= br + 1; opt < ender ; opt++ ) 11253 OP(opt)= OPTIMIZED; 11254 NEXT_OFF(br)= ender - br; 11255 } 11256 } 11257 } 11258 } 11259 11260 { 11261 const char *p; 11262 static const char parens[] = "=!<,>"; 11263 11264 if (paren && (p = strchr(parens, paren))) { 11265 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH; 11266 int flag = (p - parens) > 1; 11267 11268 if (paren == '>') 11269 node = SUSPEND, flag = 0; 11270 reginsert(pRExC_state, node,ret, depth+1); 11271 Set_Node_Cur_Length(ret, parse_start); 11272 Set_Node_Offset(ret, parse_start + 1); 11273 ret->flags = flag; 11274 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL)); 11275 } 11276 } 11277 11278 /* Check for proper termination. */ 11279 if (paren) { 11280 /* restore original flags, but keep (?p) and, if we've changed from /d 11281 * rules to /u, keep the /u */ 11282 RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY); 11283 if (DEPENDS_SEMANTICS && RExC_uni_semantics) { 11284 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); 11285 } 11286 if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') { 11287 RExC_parse = oregcomp_parse; 11288 vFAIL("Unmatched ("); 11289 } 11290 nextchar(pRExC_state); 11291 } 11292 else if (!paren && RExC_parse < RExC_end) { 11293 if (*RExC_parse == ')') { 11294 RExC_parse++; 11295 vFAIL("Unmatched )"); 11296 } 11297 else 11298 FAIL("Junk on end of regexp"); /* "Can't happen". */ 11299 NOT_REACHED; /* NOTREACHED */ 11300 } 11301 11302 if (RExC_in_lookbehind) { 11303 RExC_in_lookbehind--; 11304 } 11305 if (after_freeze > RExC_npar) 11306 RExC_npar = after_freeze; 11307 return(ret); 11308 } 11309 11310 /* 11311 - regbranch - one alternative of an | operator 11312 * 11313 * Implements the concatenation operator. 11314 * 11315 * Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs to be 11316 * restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8 11317 */ 11318 STATIC regnode * 11319 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) 11320 { 11321 regnode *ret; 11322 regnode *chain = NULL; 11323 regnode *latest; 11324 I32 flags = 0, c = 0; 11325 GET_RE_DEBUG_FLAGS_DECL; 11326 11327 PERL_ARGS_ASSERT_REGBRANCH; 11328 11329 DEBUG_PARSE("brnc"); 11330 11331 if (first) 11332 ret = NULL; 11333 else { 11334 if (!SIZE_ONLY && RExC_extralen) 11335 ret = reganode(pRExC_state, BRANCHJ,0); 11336 else { 11337 ret = reg_node(pRExC_state, BRANCH); 11338 Set_Node_Length(ret, 1); 11339 } 11340 } 11341 11342 if (!first && SIZE_ONLY) 11343 RExC_extralen += 1; /* BRANCHJ */ 11344 11345 *flagp = WORST; /* Tentatively. */ 11346 11347 skip_to_be_ignored_text(pRExC_state, &RExC_parse, 11348 FALSE /* Don't force to /x */ ); 11349 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') { 11350 flags &= ~TRYAGAIN; 11351 latest = regpiece(pRExC_state, &flags,depth+1); 11352 if (latest == NULL) { 11353 if (flags & TRYAGAIN) 11354 continue; 11355 if (flags & (RESTART_PASS1|NEED_UTF8)) { 11356 *flagp = flags & (RESTART_PASS1|NEED_UTF8); 11357 return NULL; 11358 } 11359 FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags); 11360 } 11361 else if (ret == NULL) 11362 ret = latest; 11363 *flagp |= flags&(HASWIDTH|POSTPONED); 11364 if (chain == NULL) /* First piece. */ 11365 *flagp |= flags&SPSTART; 11366 else { 11367 /* FIXME adding one for every branch after the first is probably 11368 * excessive now we have TRIE support. (hv) */ 11369 MARK_NAUGHTY(1); 11370 REGTAIL(pRExC_state, chain, latest); 11371 } 11372 chain = latest; 11373 c++; 11374 } 11375 if (chain == NULL) { /* Loop ran zero times. */ 11376 chain = reg_node(pRExC_state, NOTHING); 11377 if (ret == NULL) 11378 ret = chain; 11379 } 11380 if (c == 1) { 11381 *flagp |= flags&SIMPLE; 11382 } 11383 11384 return ret; 11385 } 11386 11387 /* 11388 - regpiece - something followed by possible [*+?] 11389 * 11390 * Note that the branching code sequences used for ? and the general cases 11391 * of * and + are somewhat optimized: they use the same NOTHING node as 11392 * both the endmarker for their branch list and the body of the last branch. 11393 * It might seem that this node could be dispensed with entirely, but the 11394 * endmarker role is not redundant. 11395 * 11396 * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with 11397 * TRYAGAIN. 11398 * Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs to be 11399 * restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8 11400 */ 11401 STATIC regnode * 11402 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) 11403 { 11404 regnode *ret; 11405 char op; 11406 char *next; 11407 I32 flags; 11408 const char * const origparse = RExC_parse; 11409 I32 min; 11410 I32 max = REG_INFTY; 11411 #ifdef RE_TRACK_PATTERN_OFFSETS 11412 char *parse_start; 11413 #endif 11414 const char *maxpos = NULL; 11415 UV uv; 11416 11417 /* Save the original in case we change the emitted regop to a FAIL. */ 11418 regnode * const orig_emit = RExC_emit; 11419 11420 GET_RE_DEBUG_FLAGS_DECL; 11421 11422 PERL_ARGS_ASSERT_REGPIECE; 11423 11424 DEBUG_PARSE("piec"); 11425 11426 ret = regatom(pRExC_state, &flags,depth+1); 11427 if (ret == NULL) { 11428 if (flags & (TRYAGAIN|RESTART_PASS1|NEED_UTF8)) 11429 *flagp |= flags & (TRYAGAIN|RESTART_PASS1|NEED_UTF8); 11430 else 11431 FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags); 11432 return(NULL); 11433 } 11434 11435 op = *RExC_parse; 11436 11437 if (op == '{' && regcurly(RExC_parse)) { 11438 maxpos = NULL; 11439 #ifdef RE_TRACK_PATTERN_OFFSETS 11440 parse_start = RExC_parse; /* MJD */ 11441 #endif 11442 next = RExC_parse + 1; 11443 while (isDIGIT(*next) || *next == ',') { 11444 if (*next == ',') { 11445 if (maxpos) 11446 break; 11447 else 11448 maxpos = next; 11449 } 11450 next++; 11451 } 11452 if (*next == '}') { /* got one */ 11453 const char* endptr; 11454 if (!maxpos) 11455 maxpos = next; 11456 RExC_parse++; 11457 if (isDIGIT(*RExC_parse)) { 11458 if (!grok_atoUV(RExC_parse, &uv, &endptr)) 11459 vFAIL("Invalid quantifier in {,}"); 11460 if (uv >= REG_INFTY) 11461 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1); 11462 min = (I32)uv; 11463 } else { 11464 min = 0; 11465 } 11466 if (*maxpos == ',') 11467 maxpos++; 11468 else 11469 maxpos = RExC_parse; 11470 if (isDIGIT(*maxpos)) { 11471 if (!grok_atoUV(maxpos, &uv, &endptr)) 11472 vFAIL("Invalid quantifier in {,}"); 11473 if (uv >= REG_INFTY) 11474 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1); 11475 max = (I32)uv; 11476 } else { 11477 max = REG_INFTY; /* meaning "infinity" */ 11478 } 11479 RExC_parse = next; 11480 nextchar(pRExC_state); 11481 if (max < min) { /* If can't match, warn and optimize to fail 11482 unconditionally */ 11483 if (SIZE_ONLY) { 11484 11485 /* We can't back off the size because we have to reserve 11486 * enough space for all the things we are about to throw 11487 * away, but we can shrink it by the amount we are about 11488 * to re-use here */ 11489 RExC_size += PREVOPER(RExC_size) - regarglen[(U8)OPFAIL]; 11490 } 11491 else { 11492 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match"); 11493 RExC_emit = orig_emit; 11494 } 11495 ret = reganode(pRExC_state, OPFAIL, 0); 11496 return ret; 11497 } 11498 else if (min == max && *RExC_parse == '?') 11499 { 11500 if (PASS2) { 11501 ckWARN2reg(RExC_parse + 1, 11502 "Useless use of greediness modifier '%c'", 11503 *RExC_parse); 11504 } 11505 } 11506 11507 do_curly: 11508 if ((flags&SIMPLE)) { 11509 if (min == 0 && max == REG_INFTY) { 11510 reginsert(pRExC_state, STAR, ret, depth+1); 11511 ret->flags = 0; 11512 MARK_NAUGHTY(4); 11513 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; 11514 goto nest_check; 11515 } 11516 if (min == 1 && max == REG_INFTY) { 11517 reginsert(pRExC_state, PLUS, ret, depth+1); 11518 ret->flags = 0; 11519 MARK_NAUGHTY(3); 11520 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; 11521 goto nest_check; 11522 } 11523 MARK_NAUGHTY_EXP(2, 2); 11524 reginsert(pRExC_state, CURLY, ret, depth+1); 11525 Set_Node_Offset(ret, parse_start+1); /* MJD */ 11526 Set_Node_Cur_Length(ret, parse_start); 11527 } 11528 else { 11529 regnode * const w = reg_node(pRExC_state, WHILEM); 11530 11531 w->flags = 0; 11532 REGTAIL(pRExC_state, ret, w); 11533 if (!SIZE_ONLY && RExC_extralen) { 11534 reginsert(pRExC_state, LONGJMP,ret, depth+1); 11535 reginsert(pRExC_state, NOTHING,ret, depth+1); 11536 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */ 11537 } 11538 reginsert(pRExC_state, CURLYX,ret, depth+1); 11539 /* MJD hk */ 11540 Set_Node_Offset(ret, parse_start+1); 11541 Set_Node_Length(ret, 11542 op == '{' ? (RExC_parse - parse_start) : 1); 11543 11544 if (!SIZE_ONLY && RExC_extralen) 11545 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */ 11546 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING)); 11547 if (SIZE_ONLY) 11548 RExC_whilem_seen++, RExC_extralen += 3; 11549 MARK_NAUGHTY_EXP(1, 4); /* compound interest */ 11550 } 11551 ret->flags = 0; 11552 11553 if (min > 0) 11554 *flagp = WORST; 11555 if (max > 0) 11556 *flagp |= HASWIDTH; 11557 if (!SIZE_ONLY) { 11558 ARG1_SET(ret, (U16)min); 11559 ARG2_SET(ret, (U16)max); 11560 } 11561 if (max == REG_INFTY) 11562 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; 11563 11564 goto nest_check; 11565 } 11566 } 11567 11568 if (!ISMULT1(op)) { 11569 *flagp = flags; 11570 return(ret); 11571 } 11572 11573 #if 0 /* Now runtime fix should be reliable. */ 11574 11575 /* if this is reinstated, don't forget to put this back into perldiag: 11576 11577 =item Regexp *+ operand could be empty at {#} in regex m/%s/ 11578 11579 (F) The part of the regexp subject to either the * or + quantifier 11580 could match an empty string. The {#} shows in the regular 11581 expression about where the problem was discovered. 11582 11583 */ 11584 11585 if (!(flags&HASWIDTH) && op != '?') 11586 vFAIL("Regexp *+ operand could be empty"); 11587 #endif 11588 11589 #ifdef RE_TRACK_PATTERN_OFFSETS 11590 parse_start = RExC_parse; 11591 #endif 11592 nextchar(pRExC_state); 11593 11594 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH); 11595 11596 if (op == '*') { 11597 min = 0; 11598 goto do_curly; 11599 } 11600 else if (op == '+') { 11601 min = 1; 11602 goto do_curly; 11603 } 11604 else if (op == '?') { 11605 min = 0; max = 1; 11606 goto do_curly; 11607 } 11608 nest_check: 11609 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) { 11610 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */ 11611 ckWARN2reg(RExC_parse, 11612 "%"UTF8f" matches null string many times", 11613 UTF8fARG(UTF, (RExC_parse >= origparse 11614 ? RExC_parse - origparse 11615 : 0), 11616 origparse)); 11617 (void)ReREFCNT_inc(RExC_rx_sv); 11618 } 11619 11620 if (*RExC_parse == '?') { 11621 nextchar(pRExC_state); 11622 reginsert(pRExC_state, MINMOD, ret, depth+1); 11623 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE); 11624 } 11625 else if (*RExC_parse == '+') { 11626 regnode *ender; 11627 nextchar(pRExC_state); 11628 ender = reg_node(pRExC_state, SUCCEED); 11629 REGTAIL(pRExC_state, ret, ender); 11630 reginsert(pRExC_state, SUSPEND, ret, depth+1); 11631 ret->flags = 0; 11632 ender = reg_node(pRExC_state, TAIL); 11633 REGTAIL(pRExC_state, ret, ender); 11634 } 11635 11636 if (ISMULT2(RExC_parse)) { 11637 RExC_parse++; 11638 vFAIL("Nested quantifiers"); 11639 } 11640 11641 return(ret); 11642 } 11643 11644 STATIC bool 11645 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, 11646 regnode ** node_p, 11647 UV * code_point_p, 11648 int * cp_count, 11649 I32 * flagp, 11650 const bool strict, 11651 const U32 depth 11652 ) 11653 { 11654 /* This routine teases apart the various meanings of \N and returns 11655 * accordingly. The input parameters constrain which meaning(s) is/are valid 11656 * in the current context. 11657 * 11658 * Exactly one of <node_p> and <code_point_p> must be non-NULL. 11659 * 11660 * If <code_point_p> is not NULL, the context is expecting the result to be a 11661 * single code point. If this \N instance turns out to a single code point, 11662 * the function returns TRUE and sets *code_point_p to that code point. 11663 * 11664 * If <node_p> is not NULL, the context is expecting the result to be one of 11665 * the things representable by a regnode. If this \N instance turns out to be 11666 * one such, the function generates the regnode, returns TRUE and sets *node_p 11667 * to point to that regnode. 11668 * 11669 * If this instance of \N isn't legal in any context, this function will 11670 * generate a fatal error and not return. 11671 * 11672 * On input, RExC_parse should point to the first char following the \N at the 11673 * time of the call. On successful return, RExC_parse will have been updated 11674 * to point to just after the sequence identified by this routine. Also 11675 * *flagp has been updated as needed. 11676 * 11677 * When there is some problem with the current context and this \N instance, 11678 * the function returns FALSE, without advancing RExC_parse, nor setting 11679 * *node_p, nor *code_point_p, nor *flagp. 11680 * 11681 * If <cp_count> is not NULL, the caller wants to know the length (in code 11682 * points) that this \N sequence matches. This is set even if the function 11683 * returns FALSE, as detailed below. 11684 * 11685 * There are 5 possibilities here, as detailed in the next 5 paragraphs. 11686 * 11687 * Probably the most common case is for the \N to specify a single code point. 11688 * *cp_count will be set to 1, and *code_point_p will be set to that code 11689 * point. 11690 * 11691 * Another possibility is for the input to be an empty \N{}, which for 11692 * backwards compatibility we accept. *cp_count will be set to 0. *node_p 11693 * will be set to a generated NOTHING node. 11694 * 11695 * Still another possibility is for the \N to mean [^\n]. *cp_count will be 11696 * set to 0. *node_p will be set to a generated REG_ANY node. 11697 * 11698 * The fourth possibility is that \N resolves to a sequence of more than one 11699 * code points. *cp_count will be set to the number of code points in the 11700 * sequence. *node_p * will be set to a generated node returned by this 11701 * function calling S_reg(). 11702 * 11703 * The final possibility is that it is premature to be calling this function; 11704 * that pass1 needs to be restarted. This can happen when this changes from 11705 * /d to /u rules, or when the pattern needs to be upgraded to UTF-8. The 11706 * latter occurs only when the fourth possibility would otherwise be in 11707 * effect, and is because one of those code points requires the pattern to be 11708 * recompiled as UTF-8. The function returns FALSE, and sets the 11709 * RESTART_PASS1 and NEED_UTF8 flags in *flagp, as appropriate. When this 11710 * happens, the caller needs to desist from continuing parsing, and return 11711 * this information to its caller. This is not set for when there is only one 11712 * code point, as this can be called as part of an ANYOF node, and they can 11713 * store above-Latin1 code points without the pattern having to be in UTF-8. 11714 * 11715 * For non-single-quoted regexes, the tokenizer has resolved character and 11716 * sequence names inside \N{...} into their Unicode values, normalizing the 11717 * result into what we should see here: '\N{U+c1.c2...}', where c1... are the 11718 * hex-represented code points in the sequence. This is done there because 11719 * the names can vary based on what charnames pragma is in scope at the time, 11720 * so we need a way to take a snapshot of what they resolve to at the time of 11721 * the original parse. [perl #56444]. 11722 * 11723 * That parsing is skipped for single-quoted regexes, so we may here get 11724 * '\N{NAME}'. This is a fatal error. These names have to be resolved by the 11725 * parser. But if the single-quoted regex is something like '\N{U+41}', that 11726 * is legal and handled here. The code point is Unicode, and has to be 11727 * translated into the native character set for non-ASCII platforms. 11728 */ 11729 11730 char * endbrace; /* points to '}' following the name */ 11731 char *endchar; /* Points to '.' or '}' ending cur char in the input 11732 stream */ 11733 char* p = RExC_parse; /* Temporary */ 11734 11735 GET_RE_DEBUG_FLAGS_DECL; 11736 11737 PERL_ARGS_ASSERT_GROK_BSLASH_N; 11738 11739 GET_RE_DEBUG_FLAGS; 11740 11741 assert(cBOOL(node_p) ^ cBOOL(code_point_p)); /* Exactly one should be set */ 11742 assert(! (node_p && cp_count)); /* At most 1 should be set */ 11743 11744 if (cp_count) { /* Initialize return for the most common case */ 11745 *cp_count = 1; 11746 } 11747 11748 /* The [^\n] meaning of \N ignores spaces and comments under the /x 11749 * modifier. The other meanings do not, so use a temporary until we find 11750 * out which we are being called with */ 11751 skip_to_be_ignored_text(pRExC_state, &p, 11752 FALSE /* Don't force to /x */ ); 11753 11754 /* Disambiguate between \N meaning a named character versus \N meaning 11755 * [^\n]. The latter is assumed when the {...} following the \N is a legal 11756 * quantifier, or there is no '{' at all */ 11757 if (*p != '{' || regcurly(p)) { 11758 RExC_parse = p; 11759 if (cp_count) { 11760 *cp_count = -1; 11761 } 11762 11763 if (! node_p) { 11764 return FALSE; 11765 } 11766 11767 *node_p = reg_node(pRExC_state, REG_ANY); 11768 *flagp |= HASWIDTH|SIMPLE; 11769 MARK_NAUGHTY(1); 11770 Set_Node_Length(*node_p, 1); /* MJD */ 11771 return TRUE; 11772 } 11773 11774 /* Here, we have decided it should be a named character or sequence */ 11775 11776 /* The test above made sure that the next real character is a '{', but 11777 * under the /x modifier, it could be separated by space (or a comment and 11778 * \n) and this is not allowed (for consistency with \x{...} and the 11779 * tokenizer handling of \N{NAME}). */ 11780 if (*RExC_parse != '{') { 11781 vFAIL("Missing braces on \\N{}"); 11782 } 11783 11784 RExC_parse++; /* Skip past the '{' */ 11785 11786 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */ 11787 || ! (endbrace == RExC_parse /* nothing between the {} */ 11788 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked... */ 11789 && strnEQ(RExC_parse, "U+", 2)))) /* ... below for a better 11790 error msg) */ 11791 { 11792 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */ 11793 vFAIL("\\N{NAME} must be resolved by the lexer"); 11794 } 11795 11796 REQUIRE_UNI_RULES(flagp, FALSE); /* Unicode named chars imply Unicode 11797 semantics */ 11798 11799 if (endbrace == RExC_parse) { /* empty: \N{} */ 11800 if (strict) { 11801 RExC_parse++; /* Position after the "}" */ 11802 vFAIL("Zero length \\N{}"); 11803 } 11804 if (cp_count) { 11805 *cp_count = 0; 11806 } 11807 nextchar(pRExC_state); 11808 if (! node_p) { 11809 return FALSE; 11810 } 11811 11812 *node_p = reg_node(pRExC_state,NOTHING); 11813 return TRUE; 11814 } 11815 11816 RExC_parse += 2; /* Skip past the 'U+' */ 11817 11818 /* Because toke.c has generated a special construct for us guaranteed not 11819 * to have NULs, we can use a str function */ 11820 endchar = RExC_parse + strcspn(RExC_parse, ".}"); 11821 11822 /* Code points are separated by dots. If none, there is only one code 11823 * point, and is terminated by the brace */ 11824 11825 if (endchar >= endbrace) { 11826 STRLEN length_of_hex; 11827 I32 grok_hex_flags; 11828 11829 /* Here, exactly one code point. If that isn't what is wanted, fail */ 11830 if (! code_point_p) { 11831 RExC_parse = p; 11832 return FALSE; 11833 } 11834 11835 /* Convert code point from hex */ 11836 length_of_hex = (STRLEN)(endchar - RExC_parse); 11837 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES 11838 | PERL_SCAN_DISALLOW_PREFIX 11839 11840 /* No errors in the first pass (See [perl 11841 * #122671].) We let the code below find the 11842 * errors when there are multiple chars. */ 11843 | ((SIZE_ONLY) 11844 ? PERL_SCAN_SILENT_ILLDIGIT 11845 : 0); 11846 11847 /* This routine is the one place where both single- and double-quotish 11848 * \N{U+xxxx} are evaluated. The value is a Unicode code point which 11849 * must be converted to native. */ 11850 *code_point_p = UNI_TO_NATIVE(grok_hex(RExC_parse, 11851 &length_of_hex, 11852 &grok_hex_flags, 11853 NULL)); 11854 11855 /* The tokenizer should have guaranteed validity, but it's possible to 11856 * bypass it by using single quoting, so check. Don't do the check 11857 * here when there are multiple chars; we do it below anyway. */ 11858 if (length_of_hex == 0 11859 || length_of_hex != (STRLEN)(endchar - RExC_parse) ) 11860 { 11861 RExC_parse += length_of_hex; /* Includes all the valid */ 11862 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */ 11863 ? UTF8SKIP(RExC_parse) 11864 : 1; 11865 /* Guard against malformed utf8 */ 11866 if (RExC_parse >= endchar) { 11867 RExC_parse = endchar; 11868 } 11869 vFAIL("Invalid hexadecimal number in \\N{U+...}"); 11870 } 11871 11872 RExC_parse = endbrace + 1; 11873 return TRUE; 11874 } 11875 else { /* Is a multiple character sequence */ 11876 SV * substitute_parse; 11877 STRLEN len; 11878 char *orig_end = RExC_end; 11879 char *save_start = RExC_start; 11880 I32 flags; 11881 11882 /* Count the code points, if desired, in the sequence */ 11883 if (cp_count) { 11884 *cp_count = 0; 11885 while (RExC_parse < endbrace) { 11886 /* Point to the beginning of the next character in the sequence. */ 11887 RExC_parse = endchar + 1; 11888 endchar = RExC_parse + strcspn(RExC_parse, ".}"); 11889 (*cp_count)++; 11890 } 11891 } 11892 11893 /* Fail if caller doesn't want to handle a multi-code-point sequence. 11894 * But don't backup up the pointer if the caller want to know how many 11895 * code points there are (they can then handle things) */ 11896 if (! node_p) { 11897 if (! cp_count) { 11898 RExC_parse = p; 11899 } 11900 return FALSE; 11901 } 11902 11903 /* What is done here is to convert this to a sub-pattern of the form 11904 * \x{char1}\x{char2}... and then call reg recursively to parse it 11905 * (enclosing in "(?: ... )" ). That way, it retains its atomicness, 11906 * while not having to worry about special handling that some code 11907 * points may have. */ 11908 11909 substitute_parse = newSVpvs("?:"); 11910 11911 while (RExC_parse < endbrace) { 11912 11913 /* Convert to notation the rest of the code understands */ 11914 sv_catpv(substitute_parse, "\\x{"); 11915 sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse); 11916 sv_catpv(substitute_parse, "}"); 11917 11918 /* Point to the beginning of the next character in the sequence. */ 11919 RExC_parse = endchar + 1; 11920 endchar = RExC_parse + strcspn(RExC_parse, ".}"); 11921 11922 } 11923 sv_catpv(substitute_parse, ")"); 11924 11925 len = SvCUR(substitute_parse); 11926 11927 /* Don't allow empty number */ 11928 if (len < (STRLEN) 8) { 11929 RExC_parse = endbrace; 11930 vFAIL("Invalid hexadecimal number in \\N{U+...}"); 11931 } 11932 11933 RExC_parse = RExC_start = RExC_adjusted_start 11934 = SvPV_nolen(substitute_parse); 11935 RExC_end = RExC_parse + len; 11936 11937 /* The values are Unicode, and therefore not subject to recoding, but 11938 * have to be converted to native on a non-Unicode (meaning non-ASCII) 11939 * platform. */ 11940 RExC_override_recoding = 1; 11941 #ifdef EBCDIC 11942 RExC_recode_x_to_native = 1; 11943 #endif 11944 11945 if (node_p) { 11946 if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) { 11947 if (flags & (RESTART_PASS1|NEED_UTF8)) { 11948 *flagp = flags & (RESTART_PASS1|NEED_UTF8); 11949 return FALSE; 11950 } 11951 FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"", 11952 (UV) flags); 11953 } 11954 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED); 11955 } 11956 11957 /* Restore the saved values */ 11958 RExC_start = RExC_adjusted_start = save_start; 11959 RExC_parse = endbrace; 11960 RExC_end = orig_end; 11961 RExC_override_recoding = 0; 11962 #ifdef EBCDIC 11963 RExC_recode_x_to_native = 0; 11964 #endif 11965 11966 SvREFCNT_dec_NN(substitute_parse); 11967 nextchar(pRExC_state); 11968 11969 return TRUE; 11970 } 11971 } 11972 11973 11974 /* 11975 * reg_recode 11976 * 11977 * It returns the code point in utf8 for the value in *encp. 11978 * value: a code value in the source encoding 11979 * encp: a pointer to an Encode object 11980 * 11981 * If the result from Encode is not a single character, 11982 * it returns U+FFFD (Replacement character) and sets *encp to NULL. 11983 */ 11984 STATIC UV 11985 S_reg_recode(pTHX_ const U8 value, SV **encp) 11986 { 11987 STRLEN numlen = 1; 11988 SV * const sv = newSVpvn_flags((const char *) &value, numlen, SVs_TEMP); 11989 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv); 11990 const STRLEN newlen = SvCUR(sv); 11991 UV uv = UNICODE_REPLACEMENT; 11992 11993 PERL_ARGS_ASSERT_REG_RECODE; 11994 11995 if (newlen) 11996 uv = SvUTF8(sv) 11997 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT) 11998 : *(U8*)s; 11999 12000 if (!newlen || numlen != newlen) { 12001 uv = UNICODE_REPLACEMENT; 12002 *encp = NULL; 12003 } 12004 return uv; 12005 } 12006 12007 PERL_STATIC_INLINE U8 12008 S_compute_EXACTish(RExC_state_t *pRExC_state) 12009 { 12010 U8 op; 12011 12012 PERL_ARGS_ASSERT_COMPUTE_EXACTISH; 12013 12014 if (! FOLD) { 12015 return (LOC) 12016 ? EXACTL 12017 : EXACT; 12018 } 12019 12020 op = get_regex_charset(RExC_flags); 12021 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) { 12022 op--; /* /a is same as /u, and map /aa's offset to what /a's would have 12023 been, so there is no hole */ 12024 } 12025 12026 return op + EXACTF; 12027 } 12028 12029 PERL_STATIC_INLINE void 12030 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, 12031 regnode *node, I32* flagp, STRLEN len, UV code_point, 12032 bool downgradable) 12033 { 12034 /* This knows the details about sizing an EXACTish node, setting flags for 12035 * it (by setting <*flagp>, and potentially populating it with a single 12036 * character. 12037 * 12038 * If <len> (the length in bytes) is non-zero, this function assumes that 12039 * the node has already been populated, and just does the sizing. In this 12040 * case <code_point> should be the final code point that has already been 12041 * placed into the node. This value will be ignored except that under some 12042 * circumstances <*flagp> is set based on it. 12043 * 12044 * If <len> is zero, the function assumes that the node is to contain only 12045 * the single character given by <code_point> and calculates what <len> 12046 * should be. In pass 1, it sizes the node appropriately. In pass 2, it 12047 * additionally will populate the node's STRING with <code_point> or its 12048 * fold if folding. 12049 * 12050 * In both cases <*flagp> is appropriately set 12051 * 12052 * It knows that under FOLD, the Latin Sharp S and UTF characters above 12053 * 255, must be folded (the former only when the rules indicate it can 12054 * match 'ss') 12055 * 12056 * When it does the populating, it looks at the flag 'downgradable'. If 12057 * true with a node that folds, it checks if the single code point 12058 * participates in a fold, and if not downgrades the node to an EXACT. 12059 * This helps the optimizer */ 12060 12061 bool len_passed_in = cBOOL(len != 0); 12062 U8 character[UTF8_MAXBYTES_CASE+1]; 12063 12064 PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT; 12065 12066 /* Don't bother to check for downgrading in PASS1, as it doesn't make any 12067 * sizing difference, and is extra work that is thrown away */ 12068 if (downgradable && ! PASS2) { 12069 downgradable = FALSE; 12070 } 12071 12072 if (! len_passed_in) { 12073 if (UTF) { 12074 if (UVCHR_IS_INVARIANT(code_point)) { 12075 if (LOC || ! FOLD) { /* /l defers folding until runtime */ 12076 *character = (U8) code_point; 12077 } 12078 else { /* Here is /i and not /l. (toFOLD() is defined on just 12079 ASCII, which isn't the same thing as INVARIANT on 12080 EBCDIC, but it works there, as the extra invariants 12081 fold to themselves) */ 12082 *character = toFOLD((U8) code_point); 12083 12084 /* We can downgrade to an EXACT node if this character 12085 * isn't a folding one. Note that this assumes that 12086 * nothing above Latin1 folds to some other invariant than 12087 * one of these alphabetics; otherwise we would also have 12088 * to check: 12089 * && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point) 12090 * || ASCII_FOLD_RESTRICTED)) 12091 */ 12092 if (downgradable && PL_fold[code_point] == code_point) { 12093 OP(node) = EXACT; 12094 } 12095 } 12096 len = 1; 12097 } 12098 else if (FOLD && (! LOC 12099 || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point))) 12100 { /* Folding, and ok to do so now */ 12101 UV folded = _to_uni_fold_flags( 12102 code_point, 12103 character, 12104 &len, 12105 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED) 12106 ? FOLD_FLAGS_NOMIX_ASCII 12107 : 0)); 12108 if (downgradable 12109 && folded == code_point /* This quickly rules out many 12110 cases, avoiding the 12111 _invlist_contains_cp() overhead 12112 for those. */ 12113 && ! _invlist_contains_cp(PL_utf8_foldable, code_point)) 12114 { 12115 OP(node) = (LOC) 12116 ? EXACTL 12117 : EXACT; 12118 } 12119 } 12120 else if (code_point <= MAX_UTF8_TWO_BYTE) { 12121 12122 /* Not folding this cp, and can output it directly */ 12123 *character = UTF8_TWO_BYTE_HI(code_point); 12124 *(character + 1) = UTF8_TWO_BYTE_LO(code_point); 12125 len = 2; 12126 } 12127 else { 12128 uvchr_to_utf8( character, code_point); 12129 len = UTF8SKIP(character); 12130 } 12131 } /* Else pattern isn't UTF8. */ 12132 else if (! FOLD) { 12133 *character = (U8) code_point; 12134 len = 1; 12135 } /* Else is folded non-UTF8 */ 12136 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \ 12137 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \ 12138 || UNICODE_DOT_DOT_VERSION > 0) 12139 else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) { 12140 #else 12141 else if (1) { 12142 #endif 12143 /* We don't fold any non-UTF8 except possibly the Sharp s (see 12144 * comments at join_exact()); */ 12145 *character = (U8) code_point; 12146 len = 1; 12147 12148 /* Can turn into an EXACT node if we know the fold at compile time, 12149 * and it folds to itself and doesn't particpate in other folds */ 12150 if (downgradable 12151 && ! LOC 12152 && PL_fold_latin1[code_point] == code_point 12153 && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point) 12154 || (isASCII(code_point) && ASCII_FOLD_RESTRICTED))) 12155 { 12156 OP(node) = EXACT; 12157 } 12158 } /* else is Sharp s. May need to fold it */ 12159 else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) { 12160 *character = 's'; 12161 *(character + 1) = 's'; 12162 len = 2; 12163 } 12164 else { 12165 *character = LATIN_SMALL_LETTER_SHARP_S; 12166 len = 1; 12167 } 12168 } 12169 12170 if (SIZE_ONLY) { 12171 RExC_size += STR_SZ(len); 12172 } 12173 else { 12174 RExC_emit += STR_SZ(len); 12175 STR_LEN(node) = len; 12176 if (! len_passed_in) { 12177 Copy((char *) character, STRING(node), len, char); 12178 } 12179 } 12180 12181 *flagp |= HASWIDTH; 12182 12183 /* A single character node is SIMPLE, except for the special-cased SHARP S 12184 * under /di. */ 12185 if ((len == 1 || (UTF && len == UVCHR_SKIP(code_point))) 12186 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \ 12187 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \ 12188 || UNICODE_DOT_DOT_VERSION > 0) 12189 && ( code_point != LATIN_SMALL_LETTER_SHARP_S 12190 || ! FOLD || ! DEPENDS_SEMANTICS) 12191 #endif 12192 ) { 12193 *flagp |= SIMPLE; 12194 } 12195 12196 /* The OP may not be well defined in PASS1 */ 12197 if (PASS2 && OP(node) == EXACTFL) { 12198 RExC_contains_locale = 1; 12199 } 12200 } 12201 12202 12203 /* Parse backref decimal value, unless it's too big to sensibly be a backref, 12204 * in which case return I32_MAX (rather than possibly 32-bit wrapping) */ 12205 12206 static I32 12207 S_backref_value(char *p) 12208 { 12209 const char* endptr; 12210 UV val; 12211 if (grok_atoUV(p, &val, &endptr) && val <= I32_MAX) 12212 return (I32)val; 12213 return I32_MAX; 12214 } 12215 12216 12217 /* 12218 - regatom - the lowest level 12219 12220 Try to identify anything special at the start of the pattern. If there 12221 is, then handle it as required. This may involve generating a single regop, 12222 such as for an assertion; or it may involve recursing, such as to 12223 handle a () structure. 12224 12225 If the string doesn't start with something special then we gobble up 12226 as much literal text as we can. 12227 12228 Once we have been able to handle whatever type of thing started the 12229 sequence, we return. 12230 12231 Note: we have to be careful with escapes, as they can be both literal 12232 and special, and in the case of \10 and friends, context determines which. 12233 12234 A summary of the code structure is: 12235 12236 switch (first_byte) { 12237 cases for each special: 12238 handle this special; 12239 break; 12240 case '\\': 12241 switch (2nd byte) { 12242 cases for each unambiguous special: 12243 handle this special; 12244 break; 12245 cases for each ambigous special/literal: 12246 disambiguate; 12247 if (special) handle here 12248 else goto defchar; 12249 default: // unambiguously literal: 12250 goto defchar; 12251 } 12252 default: // is a literal char 12253 // FALL THROUGH 12254 defchar: 12255 create EXACTish node for literal; 12256 while (more input and node isn't full) { 12257 switch (input_byte) { 12258 cases for each special; 12259 make sure parse pointer is set so that the next call to 12260 regatom will see this special first 12261 goto loopdone; // EXACTish node terminated by prev. char 12262 default: 12263 append char to EXACTISH node; 12264 } 12265 get next input byte; 12266 } 12267 loopdone: 12268 } 12269 return the generated node; 12270 12271 Specifically there are two separate switches for handling 12272 escape sequences, with the one for handling literal escapes requiring 12273 a dummy entry for all of the special escapes that are actually handled 12274 by the other. 12275 12276 Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with 12277 TRYAGAIN. 12278 Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs to be 12279 restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8 12280 Otherwise does not return NULL. 12281 */ 12282 12283 STATIC regnode * 12284 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) 12285 { 12286 regnode *ret = NULL; 12287 I32 flags = 0; 12288 char *parse_start; 12289 U8 op; 12290 int invert = 0; 12291 U8 arg; 12292 12293 GET_RE_DEBUG_FLAGS_DECL; 12294 12295 *flagp = WORST; /* Tentatively. */ 12296 12297 DEBUG_PARSE("atom"); 12298 12299 PERL_ARGS_ASSERT_REGATOM; 12300 12301 tryagain: 12302 parse_start = RExC_parse; 12303 assert(RExC_parse < RExC_end); 12304 switch ((U8)*RExC_parse) { 12305 case '^': 12306 RExC_seen_zerolen++; 12307 nextchar(pRExC_state); 12308 if (RExC_flags & RXf_PMf_MULTILINE) 12309 ret = reg_node(pRExC_state, MBOL); 12310 else 12311 ret = reg_node(pRExC_state, SBOL); 12312 Set_Node_Length(ret, 1); /* MJD */ 12313 break; 12314 case '$': 12315 nextchar(pRExC_state); 12316 if (*RExC_parse) 12317 RExC_seen_zerolen++; 12318 if (RExC_flags & RXf_PMf_MULTILINE) 12319 ret = reg_node(pRExC_state, MEOL); 12320 else 12321 ret = reg_node(pRExC_state, SEOL); 12322 Set_Node_Length(ret, 1); /* MJD */ 12323 break; 12324 case '.': 12325 nextchar(pRExC_state); 12326 if (RExC_flags & RXf_PMf_SINGLELINE) 12327 ret = reg_node(pRExC_state, SANY); 12328 else 12329 ret = reg_node(pRExC_state, REG_ANY); 12330 *flagp |= HASWIDTH|SIMPLE; 12331 MARK_NAUGHTY(1); 12332 Set_Node_Length(ret, 1); /* MJD */ 12333 break; 12334 case '[': 12335 { 12336 char * const oregcomp_parse = ++RExC_parse; 12337 ret = regclass(pRExC_state, flagp,depth+1, 12338 FALSE, /* means parse the whole char class */ 12339 TRUE, /* allow multi-char folds */ 12340 FALSE, /* don't silence non-portable warnings. */ 12341 (bool) RExC_strict, 12342 TRUE, /* Allow an optimized regnode result */ 12343 NULL, 12344 NULL); 12345 if (ret == NULL) { 12346 if (*flagp & (RESTART_PASS1|NEED_UTF8)) 12347 return NULL; 12348 FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"", 12349 (UV) *flagp); 12350 } 12351 if (*RExC_parse != ']') { 12352 RExC_parse = oregcomp_parse; 12353 vFAIL("Unmatched ["); 12354 } 12355 nextchar(pRExC_state); 12356 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */ 12357 break; 12358 } 12359 case '(': 12360 nextchar(pRExC_state); 12361 ret = reg(pRExC_state, 2, &flags,depth+1); 12362 if (ret == NULL) { 12363 if (flags & TRYAGAIN) { 12364 if (RExC_parse >= RExC_end) { 12365 /* Make parent create an empty node if needed. */ 12366 *flagp |= TRYAGAIN; 12367 return(NULL); 12368 } 12369 goto tryagain; 12370 } 12371 if (flags & (RESTART_PASS1|NEED_UTF8)) { 12372 *flagp = flags & (RESTART_PASS1|NEED_UTF8); 12373 return NULL; 12374 } 12375 FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"", 12376 (UV) flags); 12377 } 12378 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED); 12379 break; 12380 case '|': 12381 case ')': 12382 if (flags & TRYAGAIN) { 12383 *flagp |= TRYAGAIN; 12384 return NULL; 12385 } 12386 vFAIL("Internal urp"); 12387 /* Supposed to be caught earlier. */ 12388 break; 12389 case '?': 12390 case '+': 12391 case '*': 12392 RExC_parse++; 12393 vFAIL("Quantifier follows nothing"); 12394 break; 12395 case '\\': 12396 /* Special Escapes 12397 12398 This switch handles escape sequences that resolve to some kind 12399 of special regop and not to literal text. Escape sequnces that 12400 resolve to literal text are handled below in the switch marked 12401 "Literal Escapes". 12402 12403 Every entry in this switch *must* have a corresponding entry 12404 in the literal escape switch. However, the opposite is not 12405 required, as the default for this switch is to jump to the 12406 literal text handling code. 12407 */ 12408 RExC_parse++; 12409 switch ((U8)*RExC_parse) { 12410 /* Special Escapes */ 12411 case 'A': 12412 RExC_seen_zerolen++; 12413 ret = reg_node(pRExC_state, SBOL); 12414 /* SBOL is shared with /^/ so we set the flags so we can tell 12415 * /\A/ from /^/ in split. We check ret because first pass we 12416 * have no regop struct to set the flags on. */ 12417 if (PASS2) 12418 ret->flags = 1; 12419 *flagp |= SIMPLE; 12420 goto finish_meta_pat; 12421 case 'G': 12422 ret = reg_node(pRExC_state, GPOS); 12423 RExC_seen |= REG_GPOS_SEEN; 12424 *flagp |= SIMPLE; 12425 goto finish_meta_pat; 12426 case 'K': 12427 RExC_seen_zerolen++; 12428 ret = reg_node(pRExC_state, KEEPS); 12429 *flagp |= SIMPLE; 12430 /* XXX:dmq : disabling in-place substitution seems to 12431 * be necessary here to avoid cases of memory corruption, as 12432 * with: C<$_="x" x 80; s/x\K/y/> -- rgs 12433 */ 12434 RExC_seen |= REG_LOOKBEHIND_SEEN; 12435 goto finish_meta_pat; 12436 case 'Z': 12437 ret = reg_node(pRExC_state, SEOL); 12438 *flagp |= SIMPLE; 12439 RExC_seen_zerolen++; /* Do not optimize RE away */ 12440 goto finish_meta_pat; 12441 case 'z': 12442 ret = reg_node(pRExC_state, EOS); 12443 *flagp |= SIMPLE; 12444 RExC_seen_zerolen++; /* Do not optimize RE away */ 12445 goto finish_meta_pat; 12446 case 'C': 12447 vFAIL("\\C no longer supported"); 12448 case 'X': 12449 ret = reg_node(pRExC_state, CLUMP); 12450 *flagp |= HASWIDTH; 12451 goto finish_meta_pat; 12452 12453 case 'W': 12454 invert = 1; 12455 /* FALLTHROUGH */ 12456 case 'w': 12457 arg = ANYOF_WORDCHAR; 12458 goto join_posix; 12459 12460 case 'B': 12461 invert = 1; 12462 /* FALLTHROUGH */ 12463 case 'b': 12464 { 12465 regex_charset charset = get_regex_charset(RExC_flags); 12466 12467 RExC_seen_zerolen++; 12468 RExC_seen |= REG_LOOKBEHIND_SEEN; 12469 op = BOUND + charset; 12470 12471 if (op == BOUNDL) { 12472 RExC_contains_locale = 1; 12473 } 12474 12475 ret = reg_node(pRExC_state, op); 12476 *flagp |= SIMPLE; 12477 if (RExC_parse >= RExC_end || *(RExC_parse + 1) != '{') { 12478 FLAGS(ret) = TRADITIONAL_BOUND; 12479 if (PASS2 && op > BOUNDA) { /* /aa is same as /a */ 12480 OP(ret) = BOUNDA; 12481 } 12482 } 12483 else { 12484 STRLEN length; 12485 char name = *RExC_parse; 12486 char * endbrace; 12487 RExC_parse += 2; 12488 endbrace = strchr(RExC_parse, '}'); 12489 12490 if (! endbrace) { 12491 vFAIL2("Missing right brace on \\%c{}", name); 12492 } 12493 /* XXX Need to decide whether to take spaces or not. Should be 12494 * consistent with \p{}, but that currently is SPACE, which 12495 * means vertical too, which seems wrong 12496 * while (isBLANK(*RExC_parse)) { 12497 RExC_parse++; 12498 }*/ 12499 if (endbrace == RExC_parse) { 12500 RExC_parse++; /* After the '}' */ 12501 vFAIL2("Empty \\%c{}", name); 12502 } 12503 length = endbrace - RExC_parse; 12504 /*while (isBLANK(*(RExC_parse + length - 1))) { 12505 length--; 12506 }*/ 12507 switch (*RExC_parse) { 12508 case 'g': 12509 if (length != 1 12510 && (length != 3 || strnNE(RExC_parse + 1, "cb", 2))) 12511 { 12512 goto bad_bound_type; 12513 } 12514 FLAGS(ret) = GCB_BOUND; 12515 break; 12516 case 'l': 12517 if (length != 2 || *(RExC_parse + 1) != 'b') { 12518 goto bad_bound_type; 12519 } 12520 FLAGS(ret) = LB_BOUND; 12521 break; 12522 case 's': 12523 if (length != 2 || *(RExC_parse + 1) != 'b') { 12524 goto bad_bound_type; 12525 } 12526 FLAGS(ret) = SB_BOUND; 12527 break; 12528 case 'w': 12529 if (length != 2 || *(RExC_parse + 1) != 'b') { 12530 goto bad_bound_type; 12531 } 12532 FLAGS(ret) = WB_BOUND; 12533 break; 12534 default: 12535 bad_bound_type: 12536 RExC_parse = endbrace; 12537 vFAIL2utf8f( 12538 "'%"UTF8f"' is an unknown bound type", 12539 UTF8fARG(UTF, length, endbrace - length)); 12540 NOT_REACHED; /*NOTREACHED*/ 12541 } 12542 RExC_parse = endbrace; 12543 REQUIRE_UNI_RULES(flagp, NULL); 12544 12545 if (PASS2 && op >= BOUNDA) { /* /aa is same as /a */ 12546 OP(ret) = BOUNDU; 12547 length += 4; 12548 12549 /* Don't have to worry about UTF-8, in this message because 12550 * to get here the contents of the \b must be ASCII */ 12551 ckWARN4reg(RExC_parse + 1, /* Include the '}' in msg */ 12552 "Using /u for '%.*s' instead of /%s", 12553 (unsigned) length, 12554 endbrace - length + 1, 12555 (charset == REGEX_ASCII_RESTRICTED_CHARSET) 12556 ? ASCII_RESTRICT_PAT_MODS 12557 : ASCII_MORE_RESTRICT_PAT_MODS); 12558 } 12559 } 12560 12561 if (PASS2 && invert) { 12562 OP(ret) += NBOUND - BOUND; 12563 } 12564 goto finish_meta_pat; 12565 } 12566 12567 case 'D': 12568 invert = 1; 12569 /* FALLTHROUGH */ 12570 case 'd': 12571 arg = ANYOF_DIGIT; 12572 if (! DEPENDS_SEMANTICS) { 12573 goto join_posix; 12574 } 12575 12576 /* \d doesn't have any matches in the upper Latin1 range, hence /d 12577 * is equivalent to /u. Changing to /u saves some branches at 12578 * runtime */ 12579 op = POSIXU; 12580 goto join_posix_op_known; 12581 12582 case 'R': 12583 ret = reg_node(pRExC_state, LNBREAK); 12584 *flagp |= HASWIDTH|SIMPLE; 12585 goto finish_meta_pat; 12586 12587 case 'H': 12588 invert = 1; 12589 /* FALLTHROUGH */ 12590 case 'h': 12591 arg = ANYOF_BLANK; 12592 op = POSIXU; 12593 goto join_posix_op_known; 12594 12595 case 'V': 12596 invert = 1; 12597 /* FALLTHROUGH */ 12598 case 'v': 12599 arg = ANYOF_VERTWS; 12600 op = POSIXU; 12601 goto join_posix_op_known; 12602 12603 case 'S': 12604 invert = 1; 12605 /* FALLTHROUGH */ 12606 case 's': 12607 arg = ANYOF_SPACE; 12608 12609 join_posix: 12610 12611 op = POSIXD + get_regex_charset(RExC_flags); 12612 if (op > POSIXA) { /* /aa is same as /a */ 12613 op = POSIXA; 12614 } 12615 else if (op == POSIXL) { 12616 RExC_contains_locale = 1; 12617 } 12618 12619 join_posix_op_known: 12620 12621 if (invert) { 12622 op += NPOSIXD - POSIXD; 12623 } 12624 12625 ret = reg_node(pRExC_state, op); 12626 if (! SIZE_ONLY) { 12627 FLAGS(ret) = namedclass_to_classnum(arg); 12628 } 12629 12630 *flagp |= HASWIDTH|SIMPLE; 12631 /* FALLTHROUGH */ 12632 12633 finish_meta_pat: 12634 nextchar(pRExC_state); 12635 Set_Node_Length(ret, 2); /* MJD */ 12636 break; 12637 case 'p': 12638 case 'P': 12639 RExC_parse--; 12640 12641 ret = regclass(pRExC_state, flagp,depth+1, 12642 TRUE, /* means just parse this element */ 12643 FALSE, /* don't allow multi-char folds */ 12644 FALSE, /* don't silence non-portable warnings. It 12645 would be a bug if these returned 12646 non-portables */ 12647 (bool) RExC_strict, 12648 TRUE, /* Allow an optimized regnode result */ 12649 NULL, 12650 NULL); 12651 if (*flagp & RESTART_PASS1) 12652 return NULL; 12653 /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if 12654 * multi-char folds are allowed. */ 12655 if (!ret) 12656 FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"", 12657 (UV) *flagp); 12658 12659 RExC_parse--; 12660 12661 Set_Node_Offset(ret, parse_start); 12662 Set_Node_Cur_Length(ret, parse_start - 2); 12663 nextchar(pRExC_state); 12664 break; 12665 case 'N': 12666 /* Handle \N, \N{} and \N{NAMED SEQUENCE} (the latter meaning the 12667 * \N{...} evaluates to a sequence of more than one code points). 12668 * The function call below returns a regnode, which is our result. 12669 * The parameters cause it to fail if the \N{} evaluates to a 12670 * single code point; we handle those like any other literal. The 12671 * reason that the multicharacter case is handled here and not as 12672 * part of the EXACtish code is because of quantifiers. In 12673 * /\N{BLAH}+/, the '+' applies to the whole thing, and doing it 12674 * this way makes that Just Happen. dmq. 12675 * join_exact() will join this up with adjacent EXACTish nodes 12676 * later on, if appropriate. */ 12677 ++RExC_parse; 12678 if (grok_bslash_N(pRExC_state, 12679 &ret, /* Want a regnode returned */ 12680 NULL, /* Fail if evaluates to a single code 12681 point */ 12682 NULL, /* Don't need a count of how many code 12683 points */ 12684 flagp, 12685 RExC_strict, 12686 depth) 12687 ) { 12688 break; 12689 } 12690 12691 if (*flagp & RESTART_PASS1) 12692 return NULL; 12693 12694 /* Here, evaluates to a single code point. Go get that */ 12695 RExC_parse = parse_start; 12696 goto defchar; 12697 12698 case 'k': /* Handle \k<NAME> and \k'NAME' */ 12699 parse_named_seq: 12700 { 12701 char ch; 12702 if ( RExC_parse >= RExC_end - 1 12703 || (( ch = RExC_parse[1]) != '<' 12704 && ch != '\'' 12705 && ch != '{')) 12706 { 12707 RExC_parse++; 12708 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */ 12709 vFAIL2("Sequence %.2s... not terminated",parse_start); 12710 } else { 12711 RExC_parse += 2; 12712 ret = handle_named_backref(pRExC_state, 12713 flagp, 12714 parse_start, 12715 (ch == '<') 12716 ? '>' 12717 : (ch == '{') 12718 ? '}' 12719 : '\''); 12720 } 12721 break; 12722 } 12723 case 'g': 12724 case '1': case '2': case '3': case '4': 12725 case '5': case '6': case '7': case '8': case '9': 12726 { 12727 I32 num; 12728 bool hasbrace = 0; 12729 12730 if (*RExC_parse == 'g') { 12731 bool isrel = 0; 12732 12733 RExC_parse++; 12734 if (*RExC_parse == '{') { 12735 RExC_parse++; 12736 hasbrace = 1; 12737 } 12738 if (*RExC_parse == '-') { 12739 RExC_parse++; 12740 isrel = 1; 12741 } 12742 if (hasbrace && !isDIGIT(*RExC_parse)) { 12743 if (isrel) RExC_parse--; 12744 RExC_parse -= 2; 12745 goto parse_named_seq; 12746 } 12747 12748 if (RExC_parse >= RExC_end) { 12749 goto unterminated_g; 12750 } 12751 num = S_backref_value(RExC_parse); 12752 if (num == 0) 12753 vFAIL("Reference to invalid group 0"); 12754 else if (num == I32_MAX) { 12755 if (isDIGIT(*RExC_parse)) 12756 vFAIL("Reference to nonexistent group"); 12757 else 12758 unterminated_g: 12759 vFAIL("Unterminated \\g... pattern"); 12760 } 12761 12762 if (isrel) { 12763 num = RExC_npar - num; 12764 if (num < 1) 12765 vFAIL("Reference to nonexistent or unclosed group"); 12766 } 12767 } 12768 else { 12769 num = S_backref_value(RExC_parse); 12770 /* bare \NNN might be backref or octal - if it is larger 12771 * than or equal RExC_npar then it is assumed to be an 12772 * octal escape. Note RExC_npar is +1 from the actual 12773 * number of parens. */ 12774 /* Note we do NOT check if num == I32_MAX here, as that is 12775 * handled by the RExC_npar check */ 12776 12777 if ( 12778 /* any numeric escape < 10 is always a backref */ 12779 num > 9 12780 /* any numeric escape < RExC_npar is a backref */ 12781 && num >= RExC_npar 12782 /* cannot be an octal escape if it starts with 8 */ 12783 && *RExC_parse != '8' 12784 /* cannot be an octal escape it it starts with 9 */ 12785 && *RExC_parse != '9' 12786 ) 12787 { 12788 /* Probably not a backref, instead likely to be an 12789 * octal character escape, e.g. \35 or \777. 12790 * The above logic should make it obvious why using 12791 * octal escapes in patterns is problematic. - Yves */ 12792 RExC_parse = parse_start; 12793 goto defchar; 12794 } 12795 } 12796 12797 /* At this point RExC_parse points at a numeric escape like 12798 * \12 or \88 or something similar, which we should NOT treat 12799 * as an octal escape. It may or may not be a valid backref 12800 * escape. For instance \88888888 is unlikely to be a valid 12801 * backref. */ 12802 while (isDIGIT(*RExC_parse)) 12803 RExC_parse++; 12804 if (hasbrace) { 12805 if (*RExC_parse != '}') 12806 vFAIL("Unterminated \\g{...} pattern"); 12807 RExC_parse++; 12808 } 12809 if (!SIZE_ONLY) { 12810 if (num > (I32)RExC_rx->nparens) 12811 vFAIL("Reference to nonexistent group"); 12812 } 12813 RExC_sawback = 1; 12814 ret = reganode(pRExC_state, 12815 ((! FOLD) 12816 ? REF 12817 : (ASCII_FOLD_RESTRICTED) 12818 ? REFFA 12819 : (AT_LEAST_UNI_SEMANTICS) 12820 ? REFFU 12821 : (LOC) 12822 ? REFFL 12823 : REFF), 12824 num); 12825 *flagp |= HASWIDTH; 12826 12827 /* override incorrect value set in reganode MJD */ 12828 Set_Node_Offset(ret, parse_start); 12829 Set_Node_Cur_Length(ret, parse_start-1); 12830 skip_to_be_ignored_text(pRExC_state, &RExC_parse, 12831 FALSE /* Don't force to /x */ ); 12832 } 12833 break; 12834 case '\0': 12835 if (RExC_parse >= RExC_end) 12836 FAIL("Trailing \\"); 12837 /* FALLTHROUGH */ 12838 default: 12839 /* Do not generate "unrecognized" warnings here, we fall 12840 back into the quick-grab loop below */ 12841 RExC_parse = parse_start; 12842 goto defchar; 12843 } /* end of switch on a \foo sequence */ 12844 break; 12845 12846 case '#': 12847 12848 /* '#' comments should have been spaced over before this function was 12849 * called */ 12850 assert((RExC_flags & RXf_PMf_EXTENDED) == 0); 12851 /* 12852 if (RExC_flags & RXf_PMf_EXTENDED) { 12853 RExC_parse = reg_skipcomment( pRExC_state, RExC_parse ); 12854 if (RExC_parse < RExC_end) 12855 goto tryagain; 12856 } 12857 */ 12858 12859 /* FALLTHROUGH */ 12860 12861 default: 12862 defchar: { 12863 12864 /* Here, we have determined that the next thing is probably a 12865 * literal character. RExC_parse points to the first byte of its 12866 * definition. (It still may be an escape sequence that evaluates 12867 * to a single character) */ 12868 12869 STRLEN len = 0; 12870 UV ender = 0; 12871 char *p; 12872 char *s; 12873 #define MAX_NODE_STRING_SIZE 127 12874 char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE]; 12875 char *s0; 12876 U8 upper_parse = MAX_NODE_STRING_SIZE; 12877 U8 node_type = compute_EXACTish(pRExC_state); 12878 bool next_is_quantifier; 12879 char * oldp = NULL; 12880 12881 /* We can convert EXACTF nodes to EXACTFU if they contain only 12882 * characters that match identically regardless of the target 12883 * string's UTF8ness. The reason to do this is that EXACTF is not 12884 * trie-able, EXACTFU is. 12885 * 12886 * Similarly, we can convert EXACTFL nodes to EXACTFLU8 if they 12887 * contain only above-Latin1 characters (hence must be in UTF8), 12888 * which don't participate in folds with Latin1-range characters, 12889 * as the latter's folds aren't known until runtime. (We don't 12890 * need to figure this out until pass 2) */ 12891 bool maybe_exactfu = PASS2 12892 && (node_type == EXACTF || node_type == EXACTFL); 12893 12894 /* If a folding node contains only code points that don't 12895 * participate in folds, it can be changed into an EXACT node, 12896 * which allows the optimizer more things to look for */ 12897 bool maybe_exact; 12898 12899 ret = reg_node(pRExC_state, node_type); 12900 12901 /* In pass1, folded, we use a temporary buffer instead of the 12902 * actual node, as the node doesn't exist yet */ 12903 s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret); 12904 12905 s0 = s; 12906 12907 reparse: 12908 12909 /* We look for the EXACTFish to EXACT node optimizaton only if 12910 * folding. (And we don't need to figure this out until pass 2). 12911 * XXX It might actually make sense to split the node into portions 12912 * that are exact and ones that aren't, so that we could later use 12913 * the exact ones to find the longest fixed and floating strings. 12914 * One would want to join them back into a larger node. One could 12915 * use a pseudo regnode like 'EXACT_ORIG_FOLD' */ 12916 maybe_exact = FOLD && PASS2; 12917 12918 /* XXX The node can hold up to 255 bytes, yet this only goes to 12919 * 127. I (khw) do not know why. Keeping it somewhat less than 12920 * 255 allows us to not have to worry about overflow due to 12921 * converting to utf8 and fold expansion, but that value is 12922 * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes 12923 * split up by this limit into a single one using the real max of 12924 * 255. Even at 127, this breaks under rare circumstances. If 12925 * folding, we do not want to split a node at a character that is a 12926 * non-final in a multi-char fold, as an input string could just 12927 * happen to want to match across the node boundary. The join 12928 * would solve that problem if the join actually happens. But a 12929 * series of more than two nodes in a row each of 127 would cause 12930 * the first join to succeed to get to 254, but then there wouldn't 12931 * be room for the next one, which could at be one of those split 12932 * multi-char folds. I don't know of any fool-proof solution. One 12933 * could back off to end with only a code point that isn't such a 12934 * non-final, but it is possible for there not to be any in the 12935 * entire node. */ 12936 12937 assert( ! UTF /* Is at the beginning of a character */ 12938 || UTF8_IS_INVARIANT(UCHARAT(RExC_parse)) 12939 || UTF8_IS_START(UCHARAT(RExC_parse))); 12940 12941 for (p = RExC_parse; 12942 len < upper_parse && p < RExC_end; 12943 len++) 12944 { 12945 oldp = p; 12946 12947 /* White space has already been ignored */ 12948 assert( (RExC_flags & RXf_PMf_EXTENDED) == 0 12949 || ! is_PATWS_safe((p), RExC_end, UTF)); 12950 12951 switch ((U8)*p) { 12952 case '^': 12953 case '$': 12954 case '.': 12955 case '[': 12956 case '(': 12957 case ')': 12958 case '|': 12959 goto loopdone; 12960 case '\\': 12961 /* Literal Escapes Switch 12962 12963 This switch is meant to handle escape sequences that 12964 resolve to a literal character. 12965 12966 Every escape sequence that represents something 12967 else, like an assertion or a char class, is handled 12968 in the switch marked 'Special Escapes' above in this 12969 routine, but also has an entry here as anything that 12970 isn't explicitly mentioned here will be treated as 12971 an unescaped equivalent literal. 12972 */ 12973 12974 switch ((U8)*++p) { 12975 /* These are all the special escapes. */ 12976 case 'A': /* Start assertion */ 12977 case 'b': case 'B': /* Word-boundary assertion*/ 12978 case 'C': /* Single char !DANGEROUS! */ 12979 case 'd': case 'D': /* digit class */ 12980 case 'g': case 'G': /* generic-backref, pos assertion */ 12981 case 'h': case 'H': /* HORIZWS */ 12982 case 'k': case 'K': /* named backref, keep marker */ 12983 case 'p': case 'P': /* Unicode property */ 12984 case 'R': /* LNBREAK */ 12985 case 's': case 'S': /* space class */ 12986 case 'v': case 'V': /* VERTWS */ 12987 case 'w': case 'W': /* word class */ 12988 case 'X': /* eXtended Unicode "combining 12989 character sequence" */ 12990 case 'z': case 'Z': /* End of line/string assertion */ 12991 --p; 12992 goto loopdone; 12993 12994 /* Anything after here is an escape that resolves to a 12995 literal. (Except digits, which may or may not) 12996 */ 12997 case 'n': 12998 ender = '\n'; 12999 p++; 13000 break; 13001 case 'N': /* Handle a single-code point named character. */ 13002 RExC_parse = p + 1; 13003 if (! grok_bslash_N(pRExC_state, 13004 NULL, /* Fail if evaluates to 13005 anything other than a 13006 single code point */ 13007 &ender, /* The returned single code 13008 point */ 13009 NULL, /* Don't need a count of 13010 how many code points */ 13011 flagp, 13012 RExC_strict, 13013 depth) 13014 ) { 13015 if (*flagp & NEED_UTF8) 13016 FAIL("panic: grok_bslash_N set NEED_UTF8"); 13017 if (*flagp & RESTART_PASS1) 13018 return NULL; 13019 13020 /* Here, it wasn't a single code point. Go close 13021 * up this EXACTish node. The switch() prior to 13022 * this switch handles the other cases */ 13023 RExC_parse = p = oldp; 13024 goto loopdone; 13025 } 13026 p = RExC_parse; 13027 RExC_parse = parse_start; 13028 if (ender > 0xff) { 13029 REQUIRE_UTF8(flagp); 13030 } 13031 break; 13032 case 'r': 13033 ender = '\r'; 13034 p++; 13035 break; 13036 case 't': 13037 ender = '\t'; 13038 p++; 13039 break; 13040 case 'f': 13041 ender = '\f'; 13042 p++; 13043 break; 13044 case 'e': 13045 ender = ESC_NATIVE; 13046 p++; 13047 break; 13048 case 'a': 13049 ender = '\a'; 13050 p++; 13051 break; 13052 case 'o': 13053 { 13054 UV result; 13055 const char* error_msg; 13056 13057 bool valid = grok_bslash_o(&p, 13058 &result, 13059 &error_msg, 13060 PASS2, /* out warnings */ 13061 (bool) RExC_strict, 13062 TRUE, /* Output warnings 13063 for non- 13064 portables */ 13065 UTF); 13066 if (! valid) { 13067 RExC_parse = p; /* going to die anyway; point 13068 to exact spot of failure */ 13069 vFAIL(error_msg); 13070 } 13071 ender = result; 13072 if (IN_ENCODING && ender < 0x100) { 13073 goto recode_encoding; 13074 } 13075 if (ender > 0xff) { 13076 REQUIRE_UTF8(flagp); 13077 } 13078 break; 13079 } 13080 case 'x': 13081 { 13082 UV result = UV_MAX; /* initialize to erroneous 13083 value */ 13084 const char* error_msg; 13085 13086 bool valid = grok_bslash_x(&p, 13087 &result, 13088 &error_msg, 13089 PASS2, /* out warnings */ 13090 (bool) RExC_strict, 13091 TRUE, /* Silence warnings 13092 for non- 13093 portables */ 13094 UTF); 13095 if (! valid) { 13096 RExC_parse = p; /* going to die anyway; point 13097 to exact spot of failure */ 13098 vFAIL(error_msg); 13099 } 13100 ender = result; 13101 13102 if (ender < 0x100) { 13103 #ifdef EBCDIC 13104 if (RExC_recode_x_to_native) { 13105 ender = LATIN1_TO_NATIVE(ender); 13106 } 13107 else 13108 #endif 13109 if (IN_ENCODING) { 13110 goto recode_encoding; 13111 } 13112 } 13113 else { 13114 REQUIRE_UTF8(flagp); 13115 } 13116 break; 13117 } 13118 case 'c': 13119 p++; 13120 ender = grok_bslash_c(*p++, PASS2); 13121 break; 13122 case '8': case '9': /* must be a backreference */ 13123 --p; 13124 /* we have an escape like \8 which cannot be an octal escape 13125 * so we exit the loop, and let the outer loop handle this 13126 * escape which may or may not be a legitimate backref. */ 13127 goto loopdone; 13128 case '1': case '2': case '3':case '4': 13129 case '5': case '6': case '7': 13130 /* When we parse backslash escapes there is ambiguity 13131 * between backreferences and octal escapes. Any escape 13132 * from \1 - \9 is a backreference, any multi-digit 13133 * escape which does not start with 0 and which when 13134 * evaluated as decimal could refer to an already 13135 * parsed capture buffer is a back reference. Anything 13136 * else is octal. 13137 * 13138 * Note this implies that \118 could be interpreted as 13139 * 118 OR as "\11" . "8" depending on whether there 13140 * were 118 capture buffers defined already in the 13141 * pattern. */ 13142 13143 /* NOTE, RExC_npar is 1 more than the actual number of 13144 * parens we have seen so far, hence the < RExC_npar below. */ 13145 13146 if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar) 13147 { /* Not to be treated as an octal constant, go 13148 find backref */ 13149 --p; 13150 goto loopdone; 13151 } 13152 /* FALLTHROUGH */ 13153 case '0': 13154 { 13155 I32 flags = PERL_SCAN_SILENT_ILLDIGIT; 13156 STRLEN numlen = 3; 13157 ender = grok_oct(p, &numlen, &flags, NULL); 13158 if (ender > 0xff) { 13159 REQUIRE_UTF8(flagp); 13160 } 13161 p += numlen; 13162 if (PASS2 /* like \08, \178 */ 13163 && numlen < 3 13164 && isDIGIT(*p) && ckWARN(WARN_REGEXP)) 13165 { 13166 reg_warn_non_literal_string( 13167 p + 1, 13168 form_short_octal_warning(p, numlen)); 13169 } 13170 } 13171 if (IN_ENCODING && ender < 0x100) 13172 goto recode_encoding; 13173 break; 13174 recode_encoding: 13175 if (! RExC_override_recoding) { 13176 SV* enc = _get_encoding(); 13177 ender = reg_recode((U8)ender, &enc); 13178 if (!enc && PASS2) 13179 ckWARNreg(p, "Invalid escape in the specified encoding"); 13180 REQUIRE_UTF8(flagp); 13181 } 13182 break; 13183 case '\0': 13184 if (p >= RExC_end) 13185 FAIL("Trailing \\"); 13186 /* FALLTHROUGH */ 13187 default: 13188 if (!SIZE_ONLY&& isALPHANUMERIC(*p)) { 13189 /* Include any left brace following the alpha to emphasize 13190 * that it could be part of an escape at some point 13191 * in the future */ 13192 int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1; 13193 ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p); 13194 } 13195 goto normal_default; 13196 } /* End of switch on '\' */ 13197 break; 13198 case '{': 13199 /* Currently we don't warn when the lbrace is at the start 13200 * of a construct. This catches it in the middle of a 13201 * literal string, or when it's the first thing after 13202 * something like "\b" */ 13203 if (! SIZE_ONLY 13204 && (len || (p > RExC_start && isALPHA_A(*(p -1))))) 13205 { 13206 ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through"); 13207 } 13208 /*FALLTHROUGH*/ 13209 default: /* A literal character */ 13210 normal_default: 13211 if (! UTF8_IS_INVARIANT(*p) && UTF) { 13212 STRLEN numlen; 13213 ender = utf8n_to_uvchr((U8*)p, RExC_end - p, 13214 &numlen, UTF8_ALLOW_DEFAULT); 13215 p += numlen; 13216 } 13217 else 13218 ender = (U8) *p++; 13219 break; 13220 } /* End of switch on the literal */ 13221 13222 /* Here, have looked at the literal character and <ender> 13223 * contains its ordinal, <p> points to the character after it. 13224 * We need to check if the next non-ignored thing is a 13225 * quantifier. Move <p> to after anything that should be 13226 * ignored, which, as a side effect, positions <p> for the next 13227 * loop iteration */ 13228 skip_to_be_ignored_text(pRExC_state, &p, 13229 FALSE /* Don't force to /x */ ); 13230 13231 /* If the next thing is a quantifier, it applies to this 13232 * character only, which means that this character has to be in 13233 * its own node and can't just be appended to the string in an 13234 * existing node, so if there are already other characters in 13235 * the node, close the node with just them, and set up to do 13236 * this character again next time through, when it will be the 13237 * only thing in its new node */ 13238 if ((next_is_quantifier = ( LIKELY(p < RExC_end) 13239 && UNLIKELY(ISMULT2(p)))) 13240 && LIKELY(len)) 13241 { 13242 p = oldp; 13243 goto loopdone; 13244 } 13245 13246 /* Ready to add 'ender' to the node */ 13247 13248 if (! FOLD) { /* The simple case, just append the literal */ 13249 13250 /* In the sizing pass, we need only the size of the 13251 * character we are appending, hence we can delay getting 13252 * its representation until PASS2. */ 13253 if (SIZE_ONLY) { 13254 if (UTF) { 13255 const STRLEN unilen = UVCHR_SKIP(ender); 13256 s += unilen; 13257 13258 /* We have to subtract 1 just below (and again in 13259 * the corresponding PASS2 code) because the loop 13260 * increments <len> each time, as all but this path 13261 * (and one other) through it add a single byte to 13262 * the EXACTish node. But these paths would change 13263 * len to be the correct final value, so cancel out 13264 * the increment that follows */ 13265 len += unilen - 1; 13266 } 13267 else { 13268 s++; 13269 } 13270 } else { /* PASS2 */ 13271 not_fold_common: 13272 if (UTF) { 13273 U8 * new_s = uvchr_to_utf8((U8*)s, ender); 13274 len += (char *) new_s - s - 1; 13275 s = (char *) new_s; 13276 } 13277 else { 13278 *(s++) = (char) ender; 13279 } 13280 } 13281 } 13282 else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) { 13283 13284 /* Here are folding under /l, and the code point is 13285 * problematic. First, we know we can't simplify things */ 13286 maybe_exact = FALSE; 13287 maybe_exactfu = FALSE; 13288 13289 /* A problematic code point in this context means that its 13290 * fold isn't known until runtime, so we can't fold it now. 13291 * (The non-problematic code points are the above-Latin1 13292 * ones that fold to also all above-Latin1. Their folds 13293 * don't vary no matter what the locale is.) But here we 13294 * have characters whose fold depends on the locale. 13295 * Unlike the non-folding case above, we have to keep track 13296 * of these in the sizing pass, so that we can make sure we 13297 * don't split too-long nodes in the middle of a potential 13298 * multi-char fold. And unlike the regular fold case 13299 * handled in the else clauses below, we don't actually 13300 * fold and don't have special cases to consider. What we 13301 * do for both passes is the PASS2 code for non-folding */ 13302 goto not_fold_common; 13303 } 13304 else /* A regular FOLD code point */ 13305 if (! ( UTF 13306 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \ 13307 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \ 13308 || UNICODE_DOT_DOT_VERSION > 0) 13309 /* See comments for join_exact() as to why we fold 13310 * this non-UTF at compile time */ 13311 || ( node_type == EXACTFU 13312 && ender == LATIN_SMALL_LETTER_SHARP_S) 13313 #endif 13314 )) { 13315 /* Here, are folding and are not UTF-8 encoded; therefore 13316 * the character must be in the range 0-255, and is not /l 13317 * (Not /l because we already handled these under /l in 13318 * is_PROBLEMATIC_LOCALE_FOLD_cp) */ 13319 if (IS_IN_SOME_FOLD_L1(ender)) { 13320 maybe_exact = FALSE; 13321 13322 /* See if the character's fold differs between /d and 13323 * /u. This includes the multi-char fold SHARP S to 13324 * 'ss' */ 13325 if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) { 13326 RExC_seen_unfolded_sharp_s = 1; 13327 maybe_exactfu = FALSE; 13328 } 13329 else if (maybe_exactfu 13330 && (PL_fold[ender] != PL_fold_latin1[ender] 13331 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \ 13332 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \ 13333 || UNICODE_DOT_DOT_VERSION > 0) 13334 || ( len > 0 13335 && isALPHA_FOLD_EQ(ender, 's') 13336 && isALPHA_FOLD_EQ(*(s-1), 's')) 13337 #endif 13338 )) { 13339 maybe_exactfu = FALSE; 13340 } 13341 } 13342 13343 /* Even when folding, we store just the input character, as 13344 * we have an array that finds its fold quickly */ 13345 *(s++) = (char) ender; 13346 } 13347 else { /* FOLD, and UTF (or sharp s) */ 13348 /* Unlike the non-fold case, we do actually have to 13349 * calculate the results here in pass 1. This is for two 13350 * reasons, the folded length may be longer than the 13351 * unfolded, and we have to calculate how many EXACTish 13352 * nodes it will take; and we may run out of room in a node 13353 * in the middle of a potential multi-char fold, and have 13354 * to back off accordingly. */ 13355 13356 UV folded; 13357 if (isASCII_uni(ender)) { 13358 folded = toFOLD(ender); 13359 *(s)++ = (U8) folded; 13360 } 13361 else { 13362 STRLEN foldlen; 13363 13364 folded = _to_uni_fold_flags( 13365 ender, 13366 (U8 *) s, 13367 &foldlen, 13368 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED) 13369 ? FOLD_FLAGS_NOMIX_ASCII 13370 : 0)); 13371 s += foldlen; 13372 13373 /* The loop increments <len> each time, as all but this 13374 * path (and one other) through it add a single byte to 13375 * the EXACTish node. But this one has changed len to 13376 * be the correct final value, so subtract one to 13377 * cancel out the increment that follows */ 13378 len += foldlen - 1; 13379 } 13380 /* If this node only contains non-folding code points so 13381 * far, see if this new one is also non-folding */ 13382 if (maybe_exact) { 13383 if (folded != ender) { 13384 maybe_exact = FALSE; 13385 } 13386 else { 13387 /* Here the fold is the original; we have to check 13388 * further to see if anything folds to it */ 13389 if (_invlist_contains_cp(PL_utf8_foldable, 13390 ender)) 13391 { 13392 maybe_exact = FALSE; 13393 } 13394 } 13395 } 13396 ender = folded; 13397 } 13398 13399 if (next_is_quantifier) { 13400 13401 /* Here, the next input is a quantifier, and to get here, 13402 * the current character is the only one in the node. 13403 * Also, here <len> doesn't include the final byte for this 13404 * character */ 13405 len++; 13406 goto loopdone; 13407 } 13408 13409 } /* End of loop through literal characters */ 13410 13411 /* Here we have either exhausted the input or ran out of room in 13412 * the node. (If we encountered a character that can't be in the 13413 * node, transfer is made directly to <loopdone>, and so we 13414 * wouldn't have fallen off the end of the loop.) In the latter 13415 * case, we artificially have to split the node into two, because 13416 * we just don't have enough space to hold everything. This 13417 * creates a problem if the final character participates in a 13418 * multi-character fold in the non-final position, as a match that 13419 * should have occurred won't, due to the way nodes are matched, 13420 * and our artificial boundary. So back off until we find a non- 13421 * problematic character -- one that isn't at the beginning or 13422 * middle of such a fold. (Either it doesn't participate in any 13423 * folds, or appears only in the final position of all the folds it 13424 * does participate in.) A better solution with far fewer false 13425 * positives, and that would fill the nodes more completely, would 13426 * be to actually have available all the multi-character folds to 13427 * test against, and to back-off only far enough to be sure that 13428 * this node isn't ending with a partial one. <upper_parse> is set 13429 * further below (if we need to reparse the node) to include just 13430 * up through that final non-problematic character that this code 13431 * identifies, so when it is set to less than the full node, we can 13432 * skip the rest of this */ 13433 if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) { 13434 13435 const STRLEN full_len = len; 13436 13437 assert(len >= MAX_NODE_STRING_SIZE); 13438 13439 /* Here, <s> points to the final byte of the final character. 13440 * Look backwards through the string until find a non- 13441 * problematic character */ 13442 13443 if (! UTF) { 13444 13445 /* This has no multi-char folds to non-UTF characters */ 13446 if (ASCII_FOLD_RESTRICTED) { 13447 goto loopdone; 13448 } 13449 13450 while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { } 13451 len = s - s0 + 1; 13452 } 13453 else { 13454 if (! PL_NonL1NonFinalFold) { 13455 PL_NonL1NonFinalFold = _new_invlist_C_array( 13456 NonL1_Perl_Non_Final_Folds_invlist); 13457 } 13458 13459 /* Point to the first byte of the final character */ 13460 s = (char *) utf8_hop((U8 *) s, -1); 13461 13462 while (s >= s0) { /* Search backwards until find 13463 non-problematic char */ 13464 if (UTF8_IS_INVARIANT(*s)) { 13465 13466 /* There are no ascii characters that participate 13467 * in multi-char folds under /aa. In EBCDIC, the 13468 * non-ascii invariants are all control characters, 13469 * so don't ever participate in any folds. */ 13470 if (ASCII_FOLD_RESTRICTED 13471 || ! IS_NON_FINAL_FOLD(*s)) 13472 { 13473 break; 13474 } 13475 } 13476 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) { 13477 if (! IS_NON_FINAL_FOLD(EIGHT_BIT_UTF8_TO_NATIVE( 13478 *s, *(s+1)))) 13479 { 13480 break; 13481 } 13482 } 13483 else if (! _invlist_contains_cp( 13484 PL_NonL1NonFinalFold, 13485 valid_utf8_to_uvchr((U8 *) s, NULL))) 13486 { 13487 break; 13488 } 13489 13490 /* Here, the current character is problematic in that 13491 * it does occur in the non-final position of some 13492 * fold, so try the character before it, but have to 13493 * special case the very first byte in the string, so 13494 * we don't read outside the string */ 13495 s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1); 13496 } /* End of loop backwards through the string */ 13497 13498 /* If there were only problematic characters in the string, 13499 * <s> will point to before s0, in which case the length 13500 * should be 0, otherwise include the length of the 13501 * non-problematic character just found */ 13502 len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s); 13503 } 13504 13505 /* Here, have found the final character, if any, that is 13506 * non-problematic as far as ending the node without splitting 13507 * it across a potential multi-char fold. <len> contains the 13508 * number of bytes in the node up-to and including that 13509 * character, or is 0 if there is no such character, meaning 13510 * the whole node contains only problematic characters. In 13511 * this case, give up and just take the node as-is. We can't 13512 * do any better */ 13513 if (len == 0) { 13514 len = full_len; 13515 13516 /* If the node ends in an 's' we make sure it stays EXACTF, 13517 * as if it turns into an EXACTFU, it could later get 13518 * joined with another 's' that would then wrongly match 13519 * the sharp s */ 13520 if (maybe_exactfu && isALPHA_FOLD_EQ(ender, 's')) 13521 { 13522 maybe_exactfu = FALSE; 13523 } 13524 } else { 13525 13526 /* Here, the node does contain some characters that aren't 13527 * problematic. If one such is the final character in the 13528 * node, we are done */ 13529 if (len == full_len) { 13530 goto loopdone; 13531 } 13532 else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) { 13533 13534 /* If the final character is problematic, but the 13535 * penultimate is not, back-off that last character to 13536 * later start a new node with it */ 13537 p = oldp; 13538 goto loopdone; 13539 } 13540 13541 /* Here, the final non-problematic character is earlier 13542 * in the input than the penultimate character. What we do 13543 * is reparse from the beginning, going up only as far as 13544 * this final ok one, thus guaranteeing that the node ends 13545 * in an acceptable character. The reason we reparse is 13546 * that we know how far in the character is, but we don't 13547 * know how to correlate its position with the input parse. 13548 * An alternate implementation would be to build that 13549 * correlation as we go along during the original parse, 13550 * but that would entail extra work for every node, whereas 13551 * this code gets executed only when the string is too 13552 * large for the node, and the final two characters are 13553 * problematic, an infrequent occurrence. Yet another 13554 * possible strategy would be to save the tail of the 13555 * string, and the next time regatom is called, initialize 13556 * with that. The problem with this is that unless you 13557 * back off one more character, you won't be guaranteed 13558 * regatom will get called again, unless regbranch, 13559 * regpiece ... are also changed. If you do back off that 13560 * extra character, so that there is input guaranteed to 13561 * force calling regatom, you can't handle the case where 13562 * just the first character in the node is acceptable. I 13563 * (khw) decided to try this method which doesn't have that 13564 * pitfall; if performance issues are found, we can do a 13565 * combination of the current approach plus that one */ 13566 upper_parse = len; 13567 len = 0; 13568 s = s0; 13569 goto reparse; 13570 } 13571 } /* End of verifying node ends with an appropriate char */ 13572 13573 loopdone: /* Jumped to when encounters something that shouldn't be 13574 in the node */ 13575 13576 /* I (khw) don't know if you can get here with zero length, but the 13577 * old code handled this situation by creating a zero-length EXACT 13578 * node. Might as well be NOTHING instead */ 13579 if (len == 0) { 13580 OP(ret) = NOTHING; 13581 } 13582 else { 13583 if (FOLD) { 13584 /* If 'maybe_exact' is still set here, means there are no 13585 * code points in the node that participate in folds; 13586 * similarly for 'maybe_exactfu' and code points that match 13587 * differently depending on UTF8ness of the target string 13588 * (for /u), or depending on locale for /l */ 13589 if (maybe_exact) { 13590 OP(ret) = (LOC) 13591 ? EXACTL 13592 : EXACT; 13593 } 13594 else if (maybe_exactfu) { 13595 OP(ret) = (LOC) 13596 ? EXACTFLU8 13597 : EXACTFU; 13598 } 13599 } 13600 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender, 13601 FALSE /* Don't look to see if could 13602 be turned into an EXACT 13603 node, as we have already 13604 computed that */ 13605 ); 13606 } 13607 13608 RExC_parse = p - 1; 13609 Set_Node_Cur_Length(ret, parse_start); 13610 RExC_parse = p; 13611 skip_to_be_ignored_text(pRExC_state, &RExC_parse, 13612 FALSE /* Don't force to /x */ ); 13613 { 13614 /* len is STRLEN which is unsigned, need to copy to signed */ 13615 IV iv = len; 13616 if (iv < 0) 13617 vFAIL("Internal disaster"); 13618 } 13619 13620 } /* End of label 'defchar:' */ 13621 break; 13622 } /* End of giant switch on input character */ 13623 13624 return(ret); 13625 } 13626 13627 13628 STATIC void 13629 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr) 13630 { 13631 /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'. It 13632 * sets up the bitmap and any flags, removing those code points from the 13633 * inversion list, setting it to NULL should it become completely empty */ 13634 13635 PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST; 13636 assert(PL_regkind[OP(node)] == ANYOF); 13637 13638 ANYOF_BITMAP_ZERO(node); 13639 if (*invlist_ptr) { 13640 13641 /* This gets set if we actually need to modify things */ 13642 bool change_invlist = FALSE; 13643 13644 UV start, end; 13645 13646 /* Start looking through *invlist_ptr */ 13647 invlist_iterinit(*invlist_ptr); 13648 while (invlist_iternext(*invlist_ptr, &start, &end)) { 13649 UV high; 13650 int i; 13651 13652 if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) { 13653 ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP; 13654 } 13655 13656 /* Quit if are above what we should change */ 13657 if (start >= NUM_ANYOF_CODE_POINTS) { 13658 break; 13659 } 13660 13661 change_invlist = TRUE; 13662 13663 /* Set all the bits in the range, up to the max that we are doing */ 13664 high = (end < NUM_ANYOF_CODE_POINTS - 1) 13665 ? end 13666 : NUM_ANYOF_CODE_POINTS - 1; 13667 for (i = start; i <= (int) high; i++) { 13668 if (! ANYOF_BITMAP_TEST(node, i)) { 13669 ANYOF_BITMAP_SET(node, i); 13670 } 13671 } 13672 } 13673 invlist_iterfinish(*invlist_ptr); 13674 13675 /* Done with loop; remove any code points that are in the bitmap from 13676 * *invlist_ptr; similarly for code points above the bitmap if we have 13677 * a flag to match all of them anyways */ 13678 if (change_invlist) { 13679 _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr); 13680 } 13681 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) { 13682 _invlist_intersection(*invlist_ptr, PL_InBitmap, invlist_ptr); 13683 } 13684 13685 /* If have completely emptied it, remove it completely */ 13686 if (_invlist_len(*invlist_ptr) == 0) { 13687 SvREFCNT_dec_NN(*invlist_ptr); 13688 *invlist_ptr = NULL; 13689 } 13690 } 13691 } 13692 13693 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]]. 13694 Character classes ([:foo:]) can also be negated ([:^foo:]). 13695 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise. 13696 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed, 13697 but trigger failures because they are currently unimplemented. */ 13698 13699 #define POSIXCC_DONE(c) ((c) == ':') 13700 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.') 13701 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c)) 13702 #define MAYBE_POSIXCC(c) (POSIXCC(c) || (c) == '^' || (c) == ';') 13703 13704 #define WARNING_PREFIX "Assuming NOT a POSIX class since " 13705 #define NO_BLANKS_POSIX_WARNING "no blanks are allowed in one" 13706 #define SEMI_COLON_POSIX_WARNING "a semi-colon was found instead of a colon" 13707 13708 #define NOT_MEANT_TO_BE_A_POSIX_CLASS (OOB_NAMEDCLASS - 1) 13709 13710 /* 'posix_warnings' and 'warn_text' are names of variables in the following 13711 * routine. q.v. */ 13712 #define ADD_POSIX_WARNING(p, text) STMT_START { \ 13713 if (posix_warnings) { \ 13714 if (! RExC_warn_text ) RExC_warn_text = (AV *) sv_2mortal((SV *) newAV()); \ 13715 av_push(RExC_warn_text, Perl_newSVpvf(aTHX_ \ 13716 WARNING_PREFIX \ 13717 text \ 13718 REPORT_LOCATION, \ 13719 REPORT_LOCATION_ARGS(p))); \ 13720 } \ 13721 } STMT_END 13722 13723 STATIC int 13724 S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state, 13725 13726 const char * const s, /* Where the putative posix class begins. 13727 Normally, this is one past the '['. This 13728 parameter exists so it can be somewhere 13729 besides RExC_parse. */ 13730 char ** updated_parse_ptr, /* Where to set the updated parse pointer, or 13731 NULL */ 13732 AV ** posix_warnings, /* Where to place any generated warnings, or 13733 NULL */ 13734 const bool check_only /* Don't die if error */ 13735 ) 13736 { 13737 /* This parses what the caller thinks may be one of the three POSIX 13738 * constructs: 13739 * 1) a character class, like [:blank:] 13740 * 2) a collating symbol, like [. .] 13741 * 3) an equivalence class, like [= =] 13742 * In the latter two cases, it croaks if it finds a syntactically legal 13743 * one, as these are not handled by Perl. 13744 * 13745 * The main purpose is to look for a POSIX character class. It returns: 13746 * a) the class number 13747 * if it is a completely syntactically and semantically legal class. 13748 * 'updated_parse_ptr', if not NULL, is set to point to just after the 13749 * closing ']' of the class 13750 * b) OOB_NAMEDCLASS 13751 * if it appears that one of the three POSIX constructs was meant, but 13752 * its specification was somehow defective. 'updated_parse_ptr', if 13753 * not NULL, is set to point to the character just after the end 13754 * character of the class. See below for handling of warnings. 13755 * c) NOT_MEANT_TO_BE_A_POSIX_CLASS 13756 * if it doesn't appear that a POSIX construct was intended. 13757 * 'updated_parse_ptr' is not changed. No warnings nor errors are 13758 * raised. 13759 * 13760 * In b) there may be errors or warnings generated. If 'check_only' is 13761 * TRUE, then any errors are discarded. Warnings are returned to the 13762 * caller via an AV* created into '*posix_warnings' if it is not NULL. If 13763 * instead it is NULL, warnings are suppressed. This is done in all 13764 * passes. The reason for this is that the rest of the parsing is heavily 13765 * dependent on whether this routine found a valid posix class or not. If 13766 * it did, the closing ']' is absorbed as part of the class. If no class, 13767 * or an invalid one is found, any ']' will be considered the terminator of 13768 * the outer bracketed character class, leading to very different results. 13769 * In particular, a '(?[ ])' construct will likely have a syntax error if 13770 * the class is parsed other than intended, and this will happen in pass1, 13771 * before the warnings would normally be output. This mechanism allows the 13772 * caller to output those warnings in pass1 just before dieing, giving a 13773 * much better clue as to what is wrong. 13774 * 13775 * The reason for this function, and its complexity is that a bracketed 13776 * character class can contain just about anything. But it's easy to 13777 * mistype the very specific posix class syntax but yielding a valid 13778 * regular bracketed class, so it silently gets compiled into something 13779 * quite unintended. 13780 * 13781 * The solution adopted here maintains backward compatibility except that 13782 * it adds a warning if it looks like a posix class was intended but 13783 * improperly specified. The warning is not raised unless what is input 13784 * very closely resembles one of the 14 legal posix classes. To do this, 13785 * it uses fuzzy parsing. It calculates how many single-character edits it 13786 * would take to transform what was input into a legal posix class. Only 13787 * if that number is quite small does it think that the intention was a 13788 * posix class. Obviously these are heuristics, and there will be cases 13789 * where it errs on one side or another, and they can be tweaked as 13790 * experience informs. 13791 * 13792 * The syntax for a legal posix class is: 13793 * 13794 * qr/(?xa: \[ : \^? [:lower:]{4,6} : \] )/ 13795 * 13796 * What this routine considers syntactically to be an intended posix class 13797 * is this (the comments indicate some restrictions that the pattern 13798 * doesn't show): 13799 * 13800 * qr/(?x: \[? # The left bracket, possibly 13801 * # omitted 13802 * \h* # possibly followed by blanks 13803 * (?: \^ \h* )? # possibly a misplaced caret 13804 * [:;]? # The opening class character, 13805 * # possibly omitted. A typo 13806 * # semi-colon can also be used. 13807 * \h* 13808 * \^? # possibly a correctly placed 13809 * # caret, but not if there was also 13810 * # a misplaced one 13811 * \h* 13812 * .{3,15} # The class name. If there are 13813 * # deviations from the legal syntax, 13814 * # its edit distance must be close 13815 * # to a real class name in order 13816 * # for it to be considered to be 13817 * # an intended posix class. 13818 * \h* 13819 * [:punct:]? # The closing class character, 13820 * # possibly omitted. If not a colon 13821 * # nor semi colon, the class name 13822 * # must be even closer to a valid 13823 * # one 13824 * \h* 13825 * \]? # The right bracket, possibly 13826 * # omitted. 13827 * )/ 13828 * 13829 * In the above, \h must be ASCII-only. 13830 * 13831 * These are heuristics, and can be tweaked as field experience dictates. 13832 * There will be cases when someone didn't intend to specify a posix class 13833 * that this warns as being so. The goal is to minimize these, while 13834 * maximizing the catching of things intended to be a posix class that 13835 * aren't parsed as such. 13836 */ 13837 13838 const char* p = s; 13839 const char * const e = RExC_end; 13840 unsigned complement = 0; /* If to complement the class */ 13841 bool found_problem = FALSE; /* Assume OK until proven otherwise */ 13842 bool has_opening_bracket = FALSE; 13843 bool has_opening_colon = FALSE; 13844 int class_number = OOB_NAMEDCLASS; /* Out-of-bounds until find 13845 valid class */ 13846 const char * possible_end = NULL; /* used for a 2nd parse pass */ 13847 const char* name_start; /* ptr to class name first char */ 13848 13849 /* If the number of single-character typos the input name is away from a 13850 * legal name is no more than this number, it is considered to have meant 13851 * the legal name */ 13852 int max_distance = 2; 13853 13854 /* to store the name. The size determines the maximum length before we 13855 * decide that no posix class was intended. Should be at least 13856 * sizeof("alphanumeric") */ 13857 UV input_text[15]; 13858 13859 PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX; 13860 13861 if (posix_warnings && RExC_warn_text) 13862 av_clear(RExC_warn_text); 13863 13864 if (p >= e) { 13865 return NOT_MEANT_TO_BE_A_POSIX_CLASS; 13866 } 13867 13868 if (*(p - 1) != '[') { 13869 ADD_POSIX_WARNING(p, "it doesn't start with a '['"); 13870 found_problem = TRUE; 13871 } 13872 else { 13873 has_opening_bracket = TRUE; 13874 } 13875 13876 /* They could be confused and think you can put spaces between the 13877 * components */ 13878 if (isBLANK(*p)) { 13879 found_problem = TRUE; 13880 13881 do { 13882 p++; 13883 } while (p < e && isBLANK(*p)); 13884 13885 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING); 13886 } 13887 13888 /* For [. .] and [= =]. These are quite different internally from [: :], 13889 * so they are handled separately. */ 13890 if (POSIXCC_NOTYET(*p) && p < e - 3) /* 1 for the close, and 1 for the ']' 13891 and 1 for at least one char in it 13892 */ 13893 { 13894 const char open_char = *p; 13895 const char * temp_ptr = p + 1; 13896 13897 /* These two constructs are not handled by perl, and if we find a 13898 * syntactically valid one, we croak. khw, who wrote this code, finds 13899 * this explanation of them very unclear: 13900 * http://pubs.opengroup.org/onlinepubs/009696899/basedefs/xbd_chap09.html 13901 * And searching the rest of the internet wasn't very helpful either. 13902 * It looks like just about any byte can be in these constructs, 13903 * depending on the locale. But unless the pattern is being compiled 13904 * under /l, which is very rare, Perl runs under the C or POSIX locale. 13905 * In that case, it looks like [= =] isn't allowed at all, and that 13906 * [. .] could be any single code point, but for longer strings the 13907 * constituent characters would have to be the ASCII alphabetics plus 13908 * the minus-hyphen. Any sensible locale definition would limit itself 13909 * to these. And any portable one definitely should. Trying to parse 13910 * the general case is a nightmare (see [perl #127604]). So, this code 13911 * looks only for interiors of these constructs that match: 13912 * qr/.|[-\w]{2,}/ 13913 * Using \w relaxes the apparent rules a little, without adding much 13914 * danger of mistaking something else for one of these constructs. 13915 * 13916 * [. .] in some implementations described on the internet is usable to 13917 * escape a character that otherwise is special in bracketed character 13918 * classes. For example [.].] means a literal right bracket instead of 13919 * the ending of the class 13920 * 13921 * [= =] can legitimately contain a [. .] construct, but we don't 13922 * handle this case, as that [. .] construct will later get parsed 13923 * itself and croak then. And [= =] is checked for even when not under 13924 * /l, as Perl has long done so. 13925 * 13926 * The code below relies on there being a trailing NUL, so it doesn't 13927 * have to keep checking if the parse ptr < e. 13928 */ 13929 if (temp_ptr[1] == open_char) { 13930 temp_ptr++; 13931 } 13932 else while ( temp_ptr < e 13933 && (isWORDCHAR(*temp_ptr) || *temp_ptr == '-')) 13934 { 13935 temp_ptr++; 13936 } 13937 13938 if (*temp_ptr == open_char) { 13939 temp_ptr++; 13940 if (*temp_ptr == ']') { 13941 temp_ptr++; 13942 if (! found_problem && ! check_only) { 13943 RExC_parse = (char *) temp_ptr; 13944 vFAIL3("POSIX syntax [%c %c] is reserved for future " 13945 "extensions", open_char, open_char); 13946 } 13947 13948 /* Here, the syntax wasn't completely valid, or else the call 13949 * is to check-only */ 13950 if (updated_parse_ptr) { 13951 *updated_parse_ptr = (char *) temp_ptr; 13952 } 13953 13954 return OOB_NAMEDCLASS; 13955 } 13956 } 13957 13958 /* If we find something that started out to look like one of these 13959 * constructs, but isn't, we continue below so that it can be checked 13960 * for being a class name with a typo of '.' or '=' instead of a colon. 13961 * */ 13962 } 13963 13964 /* Here, we think there is a possibility that a [: :] class was meant, and 13965 * we have the first real character. It could be they think the '^' comes 13966 * first */ 13967 if (*p == '^') { 13968 found_problem = TRUE; 13969 ADD_POSIX_WARNING(p + 1, "the '^' must come after the colon"); 13970 complement = 1; 13971 p++; 13972 13973 if (isBLANK(*p)) { 13974 found_problem = TRUE; 13975 13976 do { 13977 p++; 13978 } while (p < e && isBLANK(*p)); 13979 13980 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING); 13981 } 13982 } 13983 13984 /* But the first character should be a colon, which they could have easily 13985 * mistyped on a qwerty keyboard as a semi-colon (and which may be hard to 13986 * distinguish from a colon, so treat that as a colon). */ 13987 if (*p == ':') { 13988 p++; 13989 has_opening_colon = TRUE; 13990 } 13991 else if (*p == ';') { 13992 found_problem = TRUE; 13993 p++; 13994 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING); 13995 has_opening_colon = TRUE; 13996 } 13997 else { 13998 found_problem = TRUE; 13999 ADD_POSIX_WARNING(p, "there must be a starting ':'"); 14000 14001 /* Consider an initial punctuation (not one of the recognized ones) to 14002 * be a left terminator */ 14003 if (*p != '^' && *p != ']' && isPUNCT(*p)) { 14004 p++; 14005 } 14006 } 14007 14008 /* They may think that you can put spaces between the components */ 14009 if (isBLANK(*p)) { 14010 found_problem = TRUE; 14011 14012 do { 14013 p++; 14014 } while (p < e && isBLANK(*p)); 14015 14016 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING); 14017 } 14018 14019 if (*p == '^') { 14020 14021 /* We consider something like [^:^alnum:]] to not have been intended to 14022 * be a posix class, but XXX maybe we should */ 14023 if (complement) { 14024 return NOT_MEANT_TO_BE_A_POSIX_CLASS; 14025 } 14026 14027 complement = 1; 14028 p++; 14029 } 14030 14031 /* Again, they may think that you can put spaces between the components */ 14032 if (isBLANK(*p)) { 14033 found_problem = TRUE; 14034 14035 do { 14036 p++; 14037 } while (p < e && isBLANK(*p)); 14038 14039 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING); 14040 } 14041 14042 if (*p == ']') { 14043 14044 /* XXX This ']' may be a typo, and something else was meant. But 14045 * treating it as such creates enough complications, that that 14046 * possibility isn't currently considered here. So we assume that the 14047 * ']' is what is intended, and if we've already found an initial '[', 14048 * this leaves this construct looking like [:] or [:^], which almost 14049 * certainly weren't intended to be posix classes */ 14050 if (has_opening_bracket) { 14051 return NOT_MEANT_TO_BE_A_POSIX_CLASS; 14052 } 14053 14054 /* But this function can be called when we parse the colon for 14055 * something like qr/[alpha:]]/, so we back up to look for the 14056 * beginning */ 14057 p--; 14058 14059 if (*p == ';') { 14060 found_problem = TRUE; 14061 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING); 14062 } 14063 else if (*p != ':') { 14064 14065 /* XXX We are currently very restrictive here, so this code doesn't 14066 * consider the possibility that, say, /[alpha.]]/ was intended to 14067 * be a posix class. */ 14068 return NOT_MEANT_TO_BE_A_POSIX_CLASS; 14069 } 14070 14071 /* Here we have something like 'foo:]'. There was no initial colon, 14072 * and we back up over 'foo. XXX Unlike the going forward case, we 14073 * don't handle typos of non-word chars in the middle */ 14074 has_opening_colon = FALSE; 14075 p--; 14076 14077 while (p > RExC_start && isWORDCHAR(*p)) { 14078 p--; 14079 } 14080 p++; 14081 14082 /* Here, we have positioned ourselves to where we think the first 14083 * character in the potential class is */ 14084 } 14085 14086 /* Now the interior really starts. There are certain key characters that 14087 * can end the interior, or these could just be typos. To catch both 14088 * cases, we may have to do two passes. In the first pass, we keep on 14089 * going unless we come to a sequence that matches 14090 * qr/ [[:punct:]] [[:blank:]]* \] /xa 14091 * This means it takes a sequence to end the pass, so two typos in a row if 14092 * that wasn't what was intended. If the class is perfectly formed, just 14093 * this one pass is needed. We also stop if there are too many characters 14094 * being accumulated, but this number is deliberately set higher than any 14095 * real class. It is set high enough so that someone who thinks that 14096 * 'alphanumeric' is a correct name would get warned that it wasn't. 14097 * While doing the pass, we keep track of where the key characters were in 14098 * it. If we don't find an end to the class, and one of the key characters 14099 * was found, we redo the pass, but stop when we get to that character. 14100 * Thus the key character was considered a typo in the first pass, but a 14101 * terminator in the second. If two key characters are found, we stop at 14102 * the second one in the first pass. Again this can miss two typos, but 14103 * catches a single one 14104 * 14105 * In the first pass, 'possible_end' starts as NULL, and then gets set to 14106 * point to the first key character. For the second pass, it starts as -1. 14107 * */ 14108 14109 name_start = p; 14110 parse_name: 14111 { 14112 bool has_blank = FALSE; 14113 bool has_upper = FALSE; 14114 bool has_terminating_colon = FALSE; 14115 bool has_terminating_bracket = FALSE; 14116 bool has_semi_colon = FALSE; 14117 unsigned int name_len = 0; 14118 int punct_count = 0; 14119 14120 while (p < e) { 14121 14122 /* Squeeze out blanks when looking up the class name below */ 14123 if (isBLANK(*p) ) { 14124 has_blank = TRUE; 14125 found_problem = TRUE; 14126 p++; 14127 continue; 14128 } 14129 14130 /* The name will end with a punctuation */ 14131 if (isPUNCT(*p)) { 14132 const char * peek = p + 1; 14133 14134 /* Treat any non-']' punctuation followed by a ']' (possibly 14135 * with intervening blanks) as trying to terminate the class. 14136 * ']]' is very likely to mean a class was intended (but 14137 * missing the colon), but the warning message that gets 14138 * generated shows the error position better if we exit the 14139 * loop at the bottom (eventually), so skip it here. */ 14140 if (*p != ']') { 14141 if (peek < e && isBLANK(*peek)) { 14142 has_blank = TRUE; 14143 found_problem = TRUE; 14144 do { 14145 peek++; 14146 } while (peek < e && isBLANK(*peek)); 14147 } 14148 14149 if (peek < e && *peek == ']') { 14150 has_terminating_bracket = TRUE; 14151 if (*p == ':') { 14152 has_terminating_colon = TRUE; 14153 } 14154 else if (*p == ';') { 14155 has_semi_colon = TRUE; 14156 has_terminating_colon = TRUE; 14157 } 14158 else { 14159 found_problem = TRUE; 14160 } 14161 p = peek + 1; 14162 goto try_posix; 14163 } 14164 } 14165 14166 /* Here we have punctuation we thought didn't end the class. 14167 * Keep track of the position of the key characters that are 14168 * more likely to have been class-enders */ 14169 if (*p == ']' || *p == '[' || *p == ':' || *p == ';') { 14170 14171 /* Allow just one such possible class-ender not actually 14172 * ending the class. */ 14173 if (possible_end) { 14174 break; 14175 } 14176 possible_end = p; 14177 } 14178 14179 /* If we have too many punctuation characters, no use in 14180 * keeping going */ 14181 if (++punct_count > max_distance) { 14182 break; 14183 } 14184 14185 /* Treat the punctuation as a typo. */ 14186 input_text[name_len++] = *p; 14187 p++; 14188 } 14189 else if (isUPPER(*p)) { /* Use lowercase for lookup */ 14190 input_text[name_len++] = toLOWER(*p); 14191 has_upper = TRUE; 14192 found_problem = TRUE; 14193 p++; 14194 } else if (! UTF || UTF8_IS_INVARIANT(*p)) { 14195 input_text[name_len++] = *p; 14196 p++; 14197 } 14198 else { 14199 input_text[name_len++] = utf8_to_uvchr_buf((U8 *) p, e, NULL); 14200 p+= UTF8SKIP(p); 14201 } 14202 14203 /* The declaration of 'input_text' is how long we allow a potential 14204 * class name to be, before saying they didn't mean a class name at 14205 * all */ 14206 if (name_len >= C_ARRAY_LENGTH(input_text)) { 14207 break; 14208 } 14209 } 14210 14211 /* We get to here when the possible class name hasn't been properly 14212 * terminated before: 14213 * 1) we ran off the end of the pattern; or 14214 * 2) found two characters, each of which might have been intended to 14215 * be the name's terminator 14216 * 3) found so many punctuation characters in the purported name, 14217 * that the edit distance to a valid one is exceeded 14218 * 4) we decided it was more characters than anyone could have 14219 * intended to be one. */ 14220 14221 found_problem = TRUE; 14222 14223 /* In the final two cases, we know that looking up what we've 14224 * accumulated won't lead to a match, even a fuzzy one. */ 14225 if ( name_len >= C_ARRAY_LENGTH(input_text) 14226 || punct_count > max_distance) 14227 { 14228 /* If there was an intermediate key character that could have been 14229 * an intended end, redo the parse, but stop there */ 14230 if (possible_end && possible_end != (char *) -1) { 14231 possible_end = (char *) -1; /* Special signal value to say 14232 we've done a first pass */ 14233 p = name_start; 14234 goto parse_name; 14235 } 14236 14237 /* Otherwise, it can't have meant to have been a class */ 14238 return NOT_MEANT_TO_BE_A_POSIX_CLASS; 14239 } 14240 14241 /* If we ran off the end, and the final character was a punctuation 14242 * one, back up one, to look at that final one just below. Later, we 14243 * will restore the parse pointer if appropriate */ 14244 if (name_len && p == e && isPUNCT(*(p-1))) { 14245 p--; 14246 name_len--; 14247 } 14248 14249 if (p < e && isPUNCT(*p)) { 14250 if (*p == ']') { 14251 has_terminating_bracket = TRUE; 14252 14253 /* If this is a 2nd ']', and the first one is just below this 14254 * one, consider that to be the real terminator. This gives a 14255 * uniform and better positioning for the warning message */ 14256 if ( possible_end 14257 && possible_end != (char *) -1 14258 && *possible_end == ']' 14259 && name_len && input_text[name_len - 1] == ']') 14260 { 14261 name_len--; 14262 p = possible_end; 14263 14264 /* And this is actually equivalent to having done the 2nd 14265 * pass now, so set it to not try again */ 14266 possible_end = (char *) -1; 14267 } 14268 } 14269 else { 14270 if (*p == ':') { 14271 has_terminating_colon = TRUE; 14272 } 14273 else if (*p == ';') { 14274 has_semi_colon = TRUE; 14275 has_terminating_colon = TRUE; 14276 } 14277 p++; 14278 } 14279 } 14280 14281 try_posix: 14282 14283 /* Here, we have a class name to look up. We can short circuit the 14284 * stuff below for short names that can't possibly be meant to be a 14285 * class name. (We can do this on the first pass, as any second pass 14286 * will yield an even shorter name) */ 14287 if (name_len < 3) { 14288 return NOT_MEANT_TO_BE_A_POSIX_CLASS; 14289 } 14290 14291 /* Find which class it is. Initially switch on the length of the name. 14292 * */ 14293 switch (name_len) { 14294 case 4: 14295 if (memEQ(name_start, "word", 4)) { 14296 /* this is not POSIX, this is the Perl \w */ 14297 class_number = ANYOF_WORDCHAR; 14298 } 14299 break; 14300 case 5: 14301 /* Names all of length 5: alnum alpha ascii blank cntrl digit 14302 * graph lower print punct space upper 14303 * Offset 4 gives the best switch position. */ 14304 switch (name_start[4]) { 14305 case 'a': 14306 if (memEQ(name_start, "alph", 4)) /* alpha */ 14307 class_number = ANYOF_ALPHA; 14308 break; 14309 case 'e': 14310 if (memEQ(name_start, "spac", 4)) /* space */ 14311 class_number = ANYOF_SPACE; 14312 break; 14313 case 'h': 14314 if (memEQ(name_start, "grap", 4)) /* graph */ 14315 class_number = ANYOF_GRAPH; 14316 break; 14317 case 'i': 14318 if (memEQ(name_start, "asci", 4)) /* ascii */ 14319 class_number = ANYOF_ASCII; 14320 break; 14321 case 'k': 14322 if (memEQ(name_start, "blan", 4)) /* blank */ 14323 class_number = ANYOF_BLANK; 14324 break; 14325 case 'l': 14326 if (memEQ(name_start, "cntr", 4)) /* cntrl */ 14327 class_number = ANYOF_CNTRL; 14328 break; 14329 case 'm': 14330 if (memEQ(name_start, "alnu", 4)) /* alnum */ 14331 class_number = ANYOF_ALPHANUMERIC; 14332 break; 14333 case 'r': 14334 if (memEQ(name_start, "lowe", 4)) /* lower */ 14335 class_number = (FOLD) ? ANYOF_CASED : ANYOF_LOWER; 14336 else if (memEQ(name_start, "uppe", 4)) /* upper */ 14337 class_number = (FOLD) ? ANYOF_CASED : ANYOF_UPPER; 14338 break; 14339 case 't': 14340 if (memEQ(name_start, "digi", 4)) /* digit */ 14341 class_number = ANYOF_DIGIT; 14342 else if (memEQ(name_start, "prin", 4)) /* print */ 14343 class_number = ANYOF_PRINT; 14344 else if (memEQ(name_start, "punc", 4)) /* punct */ 14345 class_number = ANYOF_PUNCT; 14346 break; 14347 } 14348 break; 14349 case 6: 14350 if (memEQ(name_start, "xdigit", 6)) 14351 class_number = ANYOF_XDIGIT; 14352 break; 14353 } 14354 14355 /* If the name exactly matches a posix class name the class number will 14356 * here be set to it, and the input almost certainly was meant to be a 14357 * posix class, so we can skip further checking. If instead the syntax 14358 * is exactly correct, but the name isn't one of the legal ones, we 14359 * will return that as an error below. But if neither of these apply, 14360 * it could be that no posix class was intended at all, or that one 14361 * was, but there was a typo. We tease these apart by doing fuzzy 14362 * matching on the name */ 14363 if (class_number == OOB_NAMEDCLASS && found_problem) { 14364 const UV posix_names[][6] = { 14365 { 'a', 'l', 'n', 'u', 'm' }, 14366 { 'a', 'l', 'p', 'h', 'a' }, 14367 { 'a', 's', 'c', 'i', 'i' }, 14368 { 'b', 'l', 'a', 'n', 'k' }, 14369 { 'c', 'n', 't', 'r', 'l' }, 14370 { 'd', 'i', 'g', 'i', 't' }, 14371 { 'g', 'r', 'a', 'p', 'h' }, 14372 { 'l', 'o', 'w', 'e', 'r' }, 14373 { 'p', 'r', 'i', 'n', 't' }, 14374 { 'p', 'u', 'n', 'c', 't' }, 14375 { 's', 'p', 'a', 'c', 'e' }, 14376 { 'u', 'p', 'p', 'e', 'r' }, 14377 { 'w', 'o', 'r', 'd' }, 14378 { 'x', 'd', 'i', 'g', 'i', 't' } 14379 }; 14380 /* The names of the above all have added NULs to make them the same 14381 * size, so we need to also have the real lengths */ 14382 const UV posix_name_lengths[] = { 14383 sizeof("alnum") - 1, 14384 sizeof("alpha") - 1, 14385 sizeof("ascii") - 1, 14386 sizeof("blank") - 1, 14387 sizeof("cntrl") - 1, 14388 sizeof("digit") - 1, 14389 sizeof("graph") - 1, 14390 sizeof("lower") - 1, 14391 sizeof("print") - 1, 14392 sizeof("punct") - 1, 14393 sizeof("space") - 1, 14394 sizeof("upper") - 1, 14395 sizeof("word") - 1, 14396 sizeof("xdigit")- 1 14397 }; 14398 unsigned int i; 14399 int temp_max = max_distance; /* Use a temporary, so if we 14400 reparse, we haven't changed the 14401 outer one */ 14402 14403 /* Use a smaller max edit distance if we are missing one of the 14404 * delimiters */ 14405 if ( has_opening_bracket + has_opening_colon < 2 14406 || has_terminating_bracket + has_terminating_colon < 2) 14407 { 14408 temp_max--; 14409 } 14410 14411 /* See if the input name is close to a legal one */ 14412 for (i = 0; i < C_ARRAY_LENGTH(posix_names); i++) { 14413 14414 /* Short circuit call if the lengths are too far apart to be 14415 * able to match */ 14416 if (abs( (int) (name_len - posix_name_lengths[i])) 14417 > temp_max) 14418 { 14419 continue; 14420 } 14421 14422 if (edit_distance(input_text, 14423 posix_names[i], 14424 name_len, 14425 posix_name_lengths[i], 14426 temp_max 14427 ) 14428 > -1) 14429 { /* If it is close, it probably was intended to be a class */ 14430 goto probably_meant_to_be; 14431 } 14432 } 14433 14434 /* Here the input name is not close enough to a valid class name 14435 * for us to consider it to be intended to be a posix class. If 14436 * we haven't already done so, and the parse found a character that 14437 * could have been terminators for the name, but which we absorbed 14438 * as typos during the first pass, repeat the parse, signalling it 14439 * to stop at that character */ 14440 if (possible_end && possible_end != (char *) -1) { 14441 possible_end = (char *) -1; 14442 p = name_start; 14443 goto parse_name; 14444 } 14445 14446 /* Here neither pass found a close-enough class name */ 14447 return NOT_MEANT_TO_BE_A_POSIX_CLASS; 14448 } 14449 14450 probably_meant_to_be: 14451 14452 /* Here we think that a posix specification was intended. Update any 14453 * parse pointer */ 14454 if (updated_parse_ptr) { 14455 *updated_parse_ptr = (char *) p; 14456 } 14457 14458 /* If a posix class name was intended but incorrectly specified, we 14459 * output or return the warnings */ 14460 if (found_problem) { 14461 14462 /* We set flags for these issues in the parse loop above instead of 14463 * adding them to the list of warnings, because we can parse it 14464 * twice, and we only want one warning instance */ 14465 if (has_upper) { 14466 ADD_POSIX_WARNING(p, "the name must be all lowercase letters"); 14467 } 14468 if (has_blank) { 14469 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING); 14470 } 14471 if (has_semi_colon) { 14472 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING); 14473 } 14474 else if (! has_terminating_colon) { 14475 ADD_POSIX_WARNING(p, "there is no terminating ':'"); 14476 } 14477 if (! has_terminating_bracket) { 14478 ADD_POSIX_WARNING(p, "there is no terminating ']'"); 14479 } 14480 14481 if (posix_warnings && RExC_warn_text && av_top_index(RExC_warn_text) > -1) { 14482 *posix_warnings = RExC_warn_text; 14483 } 14484 } 14485 else if (class_number != OOB_NAMEDCLASS) { 14486 /* If it is a known class, return the class. The class number 14487 * #defines are structured so each complement is +1 to the normal 14488 * one */ 14489 return class_number + complement; 14490 } 14491 else if (! check_only) { 14492 14493 /* Here, it is an unrecognized class. This is an error (unless the 14494 * call is to check only, which we've already handled above) */ 14495 const char * const complement_string = (complement) 14496 ? "^" 14497 : ""; 14498 RExC_parse = (char *) p; 14499 vFAIL3utf8f("POSIX class [:%s%"UTF8f":] unknown", 14500 complement_string, 14501 UTF8fARG(UTF, RExC_parse - name_start - 2, name_start)); 14502 } 14503 } 14504 14505 return OOB_NAMEDCLASS; 14506 } 14507 #undef ADD_POSIX_WARNING 14508 14509 STATIC unsigned int 14510 S_regex_set_precedence(const U8 my_operator) { 14511 14512 /* Returns the precedence in the (?[...]) construct of the input operator, 14513 * specified by its character representation. The precedence follows 14514 * general Perl rules, but it extends this so that ')' and ']' have (low) 14515 * precedence even though they aren't really operators */ 14516 14517 switch (my_operator) { 14518 case '!': 14519 return 5; 14520 case '&': 14521 return 4; 14522 case '^': 14523 case '|': 14524 case '+': 14525 case '-': 14526 return 3; 14527 case ')': 14528 return 2; 14529 case ']': 14530 return 1; 14531 } 14532 14533 NOT_REACHED; /* NOTREACHED */ 14534 return 0; /* Silence compiler warning */ 14535 } 14536 14537 STATIC regnode * 14538 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, 14539 I32 *flagp, U32 depth, 14540 char * const oregcomp_parse) 14541 { 14542 /* Handle the (?[...]) construct to do set operations */ 14543 14544 U8 curchar; /* Current character being parsed */ 14545 UV start, end; /* End points of code point ranges */ 14546 SV* final = NULL; /* The end result inversion list */ 14547 SV* result_string; /* 'final' stringified */ 14548 AV* stack; /* stack of operators and operands not yet 14549 resolved */ 14550 AV* fence_stack = NULL; /* A stack containing the positions in 14551 'stack' of where the undealt-with left 14552 parens would be if they were actually 14553 put there */ 14554 /* The 'VOL' (expanding to 'volatile') is a workaround for an optimiser bug 14555 * in Solaris Studio 12.3. See RT #127455 */ 14556 VOL IV fence = 0; /* Position of where most recent undealt- 14557 with left paren in stack is; -1 if none. 14558 */ 14559 STRLEN len; /* Temporary */ 14560 regnode* node; /* Temporary, and final regnode returned by 14561 this function */ 14562 const bool save_fold = FOLD; /* Temporary */ 14563 char *save_end, *save_parse; /* Temporaries */ 14564 const bool in_locale = LOC; /* we turn off /l during processing */ 14565 AV* posix_warnings = NULL; 14566 14567 GET_RE_DEBUG_FLAGS_DECL; 14568 14569 PERL_ARGS_ASSERT_HANDLE_REGEX_SETS; 14570 14571 if (in_locale) { 14572 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); 14573 } 14574 14575 REQUIRE_UNI_RULES(flagp, NULL); /* The use of this operator implies /u. 14576 This is required so that the compile 14577 time values are valid in all runtime 14578 cases */ 14579 14580 /* This will return only an ANYOF regnode, or (unlikely) something smaller 14581 * (such as EXACT). Thus we can skip most everything if just sizing. We 14582 * call regclass to handle '[]' so as to not have to reinvent its parsing 14583 * rules here (throwing away the size it computes each time). And, we exit 14584 * upon an unescaped ']' that isn't one ending a regclass. To do both 14585 * these things, we need to realize that something preceded by a backslash 14586 * is escaped, so we have to keep track of backslashes */ 14587 if (SIZE_ONLY) { 14588 UV depth = 0; /* how many nested (?[...]) constructs */ 14589 14590 while (RExC_parse < RExC_end) { 14591 SV* current = NULL; 14592 14593 skip_to_be_ignored_text(pRExC_state, &RExC_parse, 14594 TRUE /* Force /x */ ); 14595 14596 switch (*RExC_parse) { 14597 case '?': 14598 if (RExC_parse[1] == '[') depth++, RExC_parse++; 14599 /* FALLTHROUGH */ 14600 default: 14601 break; 14602 case '\\': 14603 /* Skip past this, so the next character gets skipped, after 14604 * the switch */ 14605 RExC_parse++; 14606 if (*RExC_parse == 'c') { 14607 /* Skip the \cX notation for control characters */ 14608 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; 14609 } 14610 break; 14611 14612 case '[': 14613 { 14614 /* See if this is a [:posix:] class. */ 14615 bool is_posix_class = (OOB_NAMEDCLASS 14616 < handle_possible_posix(pRExC_state, 14617 RExC_parse + 1, 14618 NULL, 14619 NULL, 14620 TRUE /* checking only */)); 14621 /* If it is a posix class, leave the parse pointer at the 14622 * '[' to fool regclass() into thinking it is part of a 14623 * '[[:posix:]]'. */ 14624 if (! is_posix_class) { 14625 RExC_parse++; 14626 } 14627 14628 /* regclass() can only return RESTART_PASS1 and NEED_UTF8 14629 * if multi-char folds are allowed. */ 14630 if (!regclass(pRExC_state, flagp,depth+1, 14631 is_posix_class, /* parse the whole char 14632 class only if not a 14633 posix class */ 14634 FALSE, /* don't allow multi-char folds */ 14635 TRUE, /* silence non-portable warnings. */ 14636 TRUE, /* strict */ 14637 FALSE, /* Require return to be an ANYOF */ 14638 ¤t, 14639 &posix_warnings 14640 )) 14641 FAIL2("panic: regclass returned NULL to handle_sets, " 14642 "flags=%#"UVxf"", (UV) *flagp); 14643 14644 /* function call leaves parse pointing to the ']', except 14645 * if we faked it */ 14646 if (is_posix_class) { 14647 RExC_parse--; 14648 } 14649 14650 SvREFCNT_dec(current); /* In case it returned something */ 14651 break; 14652 } 14653 14654 case ']': 14655 if (depth--) break; 14656 RExC_parse++; 14657 if (*RExC_parse == ')') { 14658 node = reganode(pRExC_state, ANYOF, 0); 14659 RExC_size += ANYOF_SKIP; 14660 nextchar(pRExC_state); 14661 Set_Node_Length(node, 14662 RExC_parse - oregcomp_parse + 1); /* MJD */ 14663 if (in_locale) { 14664 set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET); 14665 } 14666 14667 return node; 14668 } 14669 goto no_close; 14670 } 14671 14672 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; 14673 } 14674 14675 no_close: 14676 /* We output the messages even if warnings are off, because we'll fail 14677 * the very next thing, and these give a likely diagnosis for that */ 14678 if (posix_warnings && av_tindex_nomg(posix_warnings) >= 0) { 14679 output_or_return_posix_warnings(pRExC_state, posix_warnings, NULL); 14680 } 14681 14682 FAIL("Syntax error in (?[...])"); 14683 } 14684 14685 /* Pass 2 only after this. */ 14686 Perl_ck_warner_d(aTHX_ 14687 packWARN(WARN_EXPERIMENTAL__REGEX_SETS), 14688 "The regex_sets feature is experimental" REPORT_LOCATION, 14689 REPORT_LOCATION_ARGS(RExC_parse)); 14690 14691 /* Everything in this construct is a metacharacter. Operands begin with 14692 * either a '\' (for an escape sequence), or a '[' for a bracketed 14693 * character class. Any other character should be an operator, or 14694 * parenthesis for grouping. Both types of operands are handled by calling 14695 * regclass() to parse them. It is called with a parameter to indicate to 14696 * return the computed inversion list. The parsing here is implemented via 14697 * a stack. Each entry on the stack is a single character representing one 14698 * of the operators; or else a pointer to an operand inversion list. */ 14699 14700 #define IS_OPERATOR(a) SvIOK(a) 14701 #define IS_OPERAND(a) (! IS_OPERATOR(a)) 14702 14703 /* The stack is kept in Łukasiewicz order. (That's pronounced similar 14704 * to luke-a-shave-itch (or -itz), but people who didn't want to bother 14705 * with pronouncing it called it Reverse Polish instead, but now that YOU 14706 * know how to pronounce it you can use the correct term, thus giving due 14707 * credit to the person who invented it, and impressing your geek friends. 14708 * Wikipedia says that the pronounciation of "Ł" has been changing so that 14709 * it is now more like an English initial W (as in wonk) than an L.) 14710 * 14711 * This means that, for example, 'a | b & c' is stored on the stack as 14712 * 14713 * c [4] 14714 * b [3] 14715 * & [2] 14716 * a [1] 14717 * | [0] 14718 * 14719 * where the numbers in brackets give the stack [array] element number. 14720 * In this implementation, parentheses are not stored on the stack. 14721 * Instead a '(' creates a "fence" so that the part of the stack below the 14722 * fence is invisible except to the corresponding ')' (this allows us to 14723 * replace testing for parens, by using instead subtraction of the fence 14724 * position). As new operands are processed they are pushed onto the stack 14725 * (except as noted in the next paragraph). New operators of higher 14726 * precedence than the current final one are inserted on the stack before 14727 * the lhs operand (so that when the rhs is pushed next, everything will be 14728 * in the correct positions shown above. When an operator of equal or 14729 * lower precedence is encountered in parsing, all the stacked operations 14730 * of equal or higher precedence are evaluated, leaving the result as the 14731 * top entry on the stack. This makes higher precedence operations 14732 * evaluate before lower precedence ones, and causes operations of equal 14733 * precedence to left associate. 14734 * 14735 * The only unary operator '!' is immediately pushed onto the stack when 14736 * encountered. When an operand is encountered, if the top of the stack is 14737 * a '!", the complement is immediately performed, and the '!' popped. The 14738 * resulting value is treated as a new operand, and the logic in the 14739 * previous paragraph is executed. Thus in the expression 14740 * [a] + ! [b] 14741 * the stack looks like 14742 * 14743 * ! 14744 * a 14745 * + 14746 * 14747 * as 'b' gets parsed, the latter gets evaluated to '!b', and the stack 14748 * becomes 14749 * 14750 * !b 14751 * a 14752 * + 14753 * 14754 * A ')' is treated as an operator with lower precedence than all the 14755 * aforementioned ones, which causes all operations on the stack above the 14756 * corresponding '(' to be evaluated down to a single resultant operand. 14757 * Then the fence for the '(' is removed, and the operand goes through the 14758 * algorithm above, without the fence. 14759 * 14760 * A separate stack is kept of the fence positions, so that the position of 14761 * the latest so-far unbalanced '(' is at the top of it. 14762 * 14763 * The ']' ending the construct is treated as the lowest operator of all, 14764 * so that everything gets evaluated down to a single operand, which is the 14765 * result */ 14766 14767 sv_2mortal((SV *)(stack = newAV())); 14768 sv_2mortal((SV *)(fence_stack = newAV())); 14769 14770 while (RExC_parse < RExC_end) { 14771 I32 top_index; /* Index of top-most element in 'stack' */ 14772 SV** top_ptr; /* Pointer to top 'stack' element */ 14773 SV* current = NULL; /* To contain the current inversion list 14774 operand */ 14775 SV* only_to_avoid_leaks; 14776 14777 skip_to_be_ignored_text(pRExC_state, &RExC_parse, 14778 TRUE /* Force /x */ ); 14779 if (RExC_parse >= RExC_end) { 14780 Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'"); 14781 } 14782 14783 curchar = UCHARAT(RExC_parse); 14784 14785 redo_curchar: 14786 14787 top_index = av_tindex_nomg(stack); 14788 14789 switch (curchar) { 14790 SV** stacked_ptr; /* Ptr to something already on 'stack' */ 14791 char stacked_operator; /* The topmost operator on the 'stack'. */ 14792 SV* lhs; /* Operand to the left of the operator */ 14793 SV* rhs; /* Operand to the right of the operator */ 14794 SV* fence_ptr; /* Pointer to top element of the fence 14795 stack */ 14796 14797 case '(': 14798 14799 if ( RExC_parse < RExC_end - 1 14800 && (UCHARAT(RExC_parse + 1) == '?')) 14801 { 14802 /* If is a '(?', could be an embedded '(?flags:(?[...])'. 14803 * This happens when we have some thing like 14804 * 14805 * my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/; 14806 * ... 14807 * qr/(?[ \p{Digit} & $thai_or_lao ])/; 14808 * 14809 * Here we would be handling the interpolated 14810 * '$thai_or_lao'. We handle this by a recursive call to 14811 * ourselves which returns the inversion list the 14812 * interpolated expression evaluates to. We use the flags 14813 * from the interpolated pattern. */ 14814 U32 save_flags = RExC_flags; 14815 const char * save_parse; 14816 14817 RExC_parse += 2; /* Skip past the '(?' */ 14818 save_parse = RExC_parse; 14819 14820 /* Parse any flags for the '(?' */ 14821 parse_lparen_question_flags(pRExC_state); 14822 14823 if (RExC_parse == save_parse /* Makes sure there was at 14824 least one flag (or else 14825 this embedding wasn't 14826 compiled) */ 14827 || RExC_parse >= RExC_end - 4 14828 || UCHARAT(RExC_parse) != ':' 14829 || UCHARAT(++RExC_parse) != '(' 14830 || UCHARAT(++RExC_parse) != '?' 14831 || UCHARAT(++RExC_parse) != '[') 14832 { 14833 14834 /* In combination with the above, this moves the 14835 * pointer to the point just after the first erroneous 14836 * character (or if there are no flags, to where they 14837 * should have been) */ 14838 if (RExC_parse >= RExC_end - 4) { 14839 RExC_parse = RExC_end; 14840 } 14841 else if (RExC_parse != save_parse) { 14842 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; 14843 } 14844 vFAIL("Expecting '(?flags:(?[...'"); 14845 } 14846 14847 /* Recurse, with the meat of the embedded expression */ 14848 RExC_parse++; 14849 (void) handle_regex_sets(pRExC_state, ¤t, flagp, 14850 depth+1, oregcomp_parse); 14851 14852 /* Here, 'current' contains the embedded expression's 14853 * inversion list, and RExC_parse points to the trailing 14854 * ']'; the next character should be the ')' */ 14855 RExC_parse++; 14856 assert(UCHARAT(RExC_parse) == ')'); 14857 14858 /* Then the ')' matching the original '(' handled by this 14859 * case: statement */ 14860 RExC_parse++; 14861 assert(UCHARAT(RExC_parse) == ')'); 14862 14863 RExC_parse++; 14864 RExC_flags = save_flags; 14865 goto handle_operand; 14866 } 14867 14868 /* A regular '('. Look behind for illegal syntax */ 14869 if (top_index - fence >= 0) { 14870 /* If the top entry on the stack is an operator, it had 14871 * better be a '!', otherwise the entry below the top 14872 * operand should be an operator */ 14873 if ( ! (top_ptr = av_fetch(stack, top_index, FALSE)) 14874 || (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) != '!') 14875 || ( IS_OPERAND(*top_ptr) 14876 && ( top_index - fence < 1 14877 || ! (stacked_ptr = av_fetch(stack, 14878 top_index - 1, 14879 FALSE)) 14880 || ! IS_OPERATOR(*stacked_ptr)))) 14881 { 14882 RExC_parse++; 14883 vFAIL("Unexpected '(' with no preceding operator"); 14884 } 14885 } 14886 14887 /* Stack the position of this undealt-with left paren */ 14888 av_push(fence_stack, newSViv(fence)); 14889 fence = top_index + 1; 14890 break; 14891 14892 case '\\': 14893 /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if 14894 * multi-char folds are allowed. */ 14895 if (!regclass(pRExC_state, flagp,depth+1, 14896 TRUE, /* means parse just the next thing */ 14897 FALSE, /* don't allow multi-char folds */ 14898 FALSE, /* don't silence non-portable warnings. */ 14899 TRUE, /* strict */ 14900 FALSE, /* Require return to be an ANYOF */ 14901 ¤t, 14902 NULL)) 14903 { 14904 FAIL2("panic: regclass returned NULL to handle_sets, " 14905 "flags=%#"UVxf"", (UV) *flagp); 14906 } 14907 14908 /* regclass() will return with parsing just the \ sequence, 14909 * leaving the parse pointer at the next thing to parse */ 14910 RExC_parse--; 14911 goto handle_operand; 14912 14913 case '[': /* Is a bracketed character class */ 14914 { 14915 /* See if this is a [:posix:] class. */ 14916 bool is_posix_class = (OOB_NAMEDCLASS 14917 < handle_possible_posix(pRExC_state, 14918 RExC_parse + 1, 14919 NULL, 14920 NULL, 14921 TRUE /* checking only */)); 14922 /* If it is a posix class, leave the parse pointer at the '[' 14923 * to fool regclass() into thinking it is part of a 14924 * '[[:posix:]]'. */ 14925 if (! is_posix_class) { 14926 RExC_parse++; 14927 } 14928 14929 /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if 14930 * multi-char folds are allowed. */ 14931 if (!regclass(pRExC_state, flagp,depth+1, 14932 is_posix_class, /* parse the whole char 14933 class only if not a 14934 posix class */ 14935 FALSE, /* don't allow multi-char folds */ 14936 TRUE, /* silence non-portable warnings. */ 14937 TRUE, /* strict */ 14938 FALSE, /* Require return to be an ANYOF */ 14939 ¤t, 14940 NULL 14941 )) 14942 { 14943 FAIL2("panic: regclass returned NULL to handle_sets, " 14944 "flags=%#"UVxf"", (UV) *flagp); 14945 } 14946 14947 /* function call leaves parse pointing to the ']', except if we 14948 * faked it */ 14949 if (is_posix_class) { 14950 RExC_parse--; 14951 } 14952 14953 goto handle_operand; 14954 } 14955 14956 case ']': 14957 if (top_index >= 1) { 14958 goto join_operators; 14959 } 14960 14961 /* Only a single operand on the stack: are done */ 14962 goto done; 14963 14964 case ')': 14965 if (av_tindex_nomg(fence_stack) < 0) { 14966 RExC_parse++; 14967 vFAIL("Unexpected ')'"); 14968 } 14969 14970 /* If nothing after the fence, is missing an operand */ 14971 if (top_index - fence < 0) { 14972 RExC_parse++; 14973 goto bad_syntax; 14974 } 14975 /* If at least two things on the stack, treat this as an 14976 * operator */ 14977 if (top_index - fence >= 1) { 14978 goto join_operators; 14979 } 14980 14981 /* Here only a single thing on the fenced stack, and there is a 14982 * fence. Get rid of it */ 14983 fence_ptr = av_pop(fence_stack); 14984 assert(fence_ptr); 14985 fence = SvIV(fence_ptr) - 1; 14986 SvREFCNT_dec_NN(fence_ptr); 14987 fence_ptr = NULL; 14988 14989 if (fence < 0) { 14990 fence = 0; 14991 } 14992 14993 /* Having gotten rid of the fence, we pop the operand at the 14994 * stack top and process it as a newly encountered operand */ 14995 current = av_pop(stack); 14996 if (IS_OPERAND(current)) { 14997 goto handle_operand; 14998 } 14999 15000 RExC_parse++; 15001 goto bad_syntax; 15002 15003 case '&': 15004 case '|': 15005 case '+': 15006 case '-': 15007 case '^': 15008 15009 /* These binary operators should have a left operand already 15010 * parsed */ 15011 if ( top_index - fence < 0 15012 || top_index - fence == 1 15013 || ( ! (top_ptr = av_fetch(stack, top_index, FALSE))) 15014 || ! IS_OPERAND(*top_ptr)) 15015 { 15016 goto unexpected_binary; 15017 } 15018 15019 /* If only the one operand is on the part of the stack visible 15020 * to us, we just place this operator in the proper position */ 15021 if (top_index - fence < 2) { 15022 15023 /* Place the operator before the operand */ 15024 15025 SV* lhs = av_pop(stack); 15026 av_push(stack, newSVuv(curchar)); 15027 av_push(stack, lhs); 15028 break; 15029 } 15030 15031 /* But if there is something else on the stack, we need to 15032 * process it before this new operator if and only if the 15033 * stacked operation has equal or higher precedence than the 15034 * new one */ 15035 15036 join_operators: 15037 15038 /* The operator on the stack is supposed to be below both its 15039 * operands */ 15040 if ( ! (stacked_ptr = av_fetch(stack, top_index - 2, FALSE)) 15041 || IS_OPERAND(*stacked_ptr)) 15042 { 15043 /* But if not, it's legal and indicates we are completely 15044 * done if and only if we're currently processing a ']', 15045 * which should be the final thing in the expression */ 15046 if (curchar == ']') { 15047 goto done; 15048 } 15049 15050 unexpected_binary: 15051 RExC_parse++; 15052 vFAIL2("Unexpected binary operator '%c' with no " 15053 "preceding operand", curchar); 15054 } 15055 stacked_operator = (char) SvUV(*stacked_ptr); 15056 15057 if (regex_set_precedence(curchar) 15058 > regex_set_precedence(stacked_operator)) 15059 { 15060 /* Here, the new operator has higher precedence than the 15061 * stacked one. This means we need to add the new one to 15062 * the stack to await its rhs operand (and maybe more 15063 * stuff). We put it before the lhs operand, leaving 15064 * untouched the stacked operator and everything below it 15065 * */ 15066 lhs = av_pop(stack); 15067 assert(IS_OPERAND(lhs)); 15068 15069 av_push(stack, newSVuv(curchar)); 15070 av_push(stack, lhs); 15071 break; 15072 } 15073 15074 /* Here, the new operator has equal or lower precedence than 15075 * what's already there. This means the operation already 15076 * there should be performed now, before the new one. */ 15077 15078 rhs = av_pop(stack); 15079 if (! IS_OPERAND(rhs)) { 15080 15081 /* This can happen when a ! is not followed by an operand, 15082 * like in /(?[\t &!])/ */ 15083 goto bad_syntax; 15084 } 15085 15086 lhs = av_pop(stack); 15087 15088 if (! IS_OPERAND(lhs)) { 15089 15090 /* This can happen when there is an empty (), like in 15091 * /(?[[0]+()+])/ */ 15092 goto bad_syntax; 15093 } 15094 15095 switch (stacked_operator) { 15096 case '&': 15097 _invlist_intersection(lhs, rhs, &rhs); 15098 break; 15099 15100 case '|': 15101 case '+': 15102 _invlist_union(lhs, rhs, &rhs); 15103 break; 15104 15105 case '-': 15106 _invlist_subtract(lhs, rhs, &rhs); 15107 break; 15108 15109 case '^': /* The union minus the intersection */ 15110 { 15111 SV* i = NULL; 15112 SV* u = NULL; 15113 SV* element; 15114 15115 _invlist_union(lhs, rhs, &u); 15116 _invlist_intersection(lhs, rhs, &i); 15117 /* _invlist_subtract will overwrite rhs 15118 without freeing what it already contains */ 15119 element = rhs; 15120 _invlist_subtract(u, i, &rhs); 15121 SvREFCNT_dec_NN(i); 15122 SvREFCNT_dec_NN(u); 15123 SvREFCNT_dec_NN(element); 15124 break; 15125 } 15126 } 15127 SvREFCNT_dec(lhs); 15128 15129 /* Here, the higher precedence operation has been done, and the 15130 * result is in 'rhs'. We overwrite the stacked operator with 15131 * the result. Then we redo this code to either push the new 15132 * operator onto the stack or perform any higher precedence 15133 * stacked operation */ 15134 only_to_avoid_leaks = av_pop(stack); 15135 SvREFCNT_dec(only_to_avoid_leaks); 15136 av_push(stack, rhs); 15137 goto redo_curchar; 15138 15139 case '!': /* Highest priority, right associative */ 15140 15141 /* If what's already at the top of the stack is another '!", 15142 * they just cancel each other out */ 15143 if ( (top_ptr = av_fetch(stack, top_index, FALSE)) 15144 && (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) == '!')) 15145 { 15146 only_to_avoid_leaks = av_pop(stack); 15147 SvREFCNT_dec(only_to_avoid_leaks); 15148 } 15149 else { /* Otherwise, since it's right associative, just push 15150 onto the stack */ 15151 av_push(stack, newSVuv(curchar)); 15152 } 15153 break; 15154 15155 default: 15156 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; 15157 vFAIL("Unexpected character"); 15158 15159 handle_operand: 15160 15161 /* Here 'current' is the operand. If something is already on the 15162 * stack, we have to check if it is a !. But first, the code above 15163 * may have altered the stack in the time since we earlier set 15164 * 'top_index'. */ 15165 15166 top_index = av_tindex_nomg(stack); 15167 if (top_index - fence >= 0) { 15168 /* If the top entry on the stack is an operator, it had better 15169 * be a '!', otherwise the entry below the top operand should 15170 * be an operator */ 15171 top_ptr = av_fetch(stack, top_index, FALSE); 15172 assert(top_ptr); 15173 if (IS_OPERATOR(*top_ptr)) { 15174 15175 /* The only permissible operator at the top of the stack is 15176 * '!', which is applied immediately to this operand. */ 15177 curchar = (char) SvUV(*top_ptr); 15178 if (curchar != '!') { 15179 SvREFCNT_dec(current); 15180 vFAIL2("Unexpected binary operator '%c' with no " 15181 "preceding operand", curchar); 15182 } 15183 15184 _invlist_invert(current); 15185 15186 only_to_avoid_leaks = av_pop(stack); 15187 SvREFCNT_dec(only_to_avoid_leaks); 15188 15189 /* And we redo with the inverted operand. This allows 15190 * handling multiple ! in a row */ 15191 goto handle_operand; 15192 } 15193 /* Single operand is ok only for the non-binary ')' 15194 * operator */ 15195 else if ((top_index - fence == 0 && curchar != ')') 15196 || (top_index - fence > 0 15197 && (! (stacked_ptr = av_fetch(stack, 15198 top_index - 1, 15199 FALSE)) 15200 || IS_OPERAND(*stacked_ptr)))) 15201 { 15202 SvREFCNT_dec(current); 15203 vFAIL("Operand with no preceding operator"); 15204 } 15205 } 15206 15207 /* Here there was nothing on the stack or the top element was 15208 * another operand. Just add this new one */ 15209 av_push(stack, current); 15210 15211 } /* End of switch on next parse token */ 15212 15213 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; 15214 } /* End of loop parsing through the construct */ 15215 15216 done: 15217 if (av_tindex_nomg(fence_stack) >= 0) { 15218 vFAIL("Unmatched ("); 15219 } 15220 15221 if (av_tindex_nomg(stack) < 0 /* Was empty */ 15222 || ((final = av_pop(stack)) == NULL) 15223 || ! IS_OPERAND(final) 15224 || SvTYPE(final) != SVt_INVLIST 15225 || av_tindex_nomg(stack) >= 0) /* More left on stack */ 15226 { 15227 bad_syntax: 15228 SvREFCNT_dec(final); 15229 vFAIL("Incomplete expression within '(?[ ])'"); 15230 } 15231 15232 /* Here, 'final' is the resultant inversion list from evaluating the 15233 * expression. Return it if so requested */ 15234 if (return_invlist) { 15235 *return_invlist = final; 15236 return END; 15237 } 15238 15239 /* Otherwise generate a resultant node, based on 'final'. regclass() is 15240 * expecting a string of ranges and individual code points */ 15241 invlist_iterinit(final); 15242 result_string = newSVpvs(""); 15243 while (invlist_iternext(final, &start, &end)) { 15244 if (start == end) { 15245 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start); 15246 } 15247 else { 15248 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}", 15249 start, end); 15250 } 15251 } 15252 15253 /* About to generate an ANYOF (or similar) node from the inversion list we 15254 * have calculated */ 15255 save_parse = RExC_parse; 15256 RExC_parse = SvPV(result_string, len); 15257 save_end = RExC_end; 15258 RExC_end = RExC_parse + len; 15259 15260 /* We turn off folding around the call, as the class we have constructed 15261 * already has all folding taken into consideration, and we don't want 15262 * regclass() to add to that */ 15263 RExC_flags &= ~RXf_PMf_FOLD; 15264 /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if multi-char 15265 * folds are allowed. */ 15266 node = regclass(pRExC_state, flagp,depth+1, 15267 FALSE, /* means parse the whole char class */ 15268 FALSE, /* don't allow multi-char folds */ 15269 TRUE, /* silence non-portable warnings. The above may very 15270 well have generated non-portable code points, but 15271 they're valid on this machine */ 15272 FALSE, /* similarly, no need for strict */ 15273 FALSE, /* Require return to be an ANYOF */ 15274 NULL, 15275 NULL 15276 ); 15277 if (!node) 15278 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf, 15279 PTR2UV(flagp)); 15280 15281 /* Fix up the node type if we are in locale. (We have pretended we are 15282 * under /u for the purposes of regclass(), as this construct will only 15283 * work under UTF-8 locales. But now we change the opcode to be ANYOFL (so 15284 * as to cause any warnings about bad locales to be output in regexec.c), 15285 * and add the flag that indicates to check if not in a UTF-8 locale. The 15286 * reason we above forbid optimization into something other than an ANYOF 15287 * node is simply to minimize the number of code changes in regexec.c. 15288 * Otherwise we would have to create new EXACTish node types and deal with 15289 * them. This decision could be revisited should this construct become 15290 * popular. 15291 * 15292 * (One might think we could look at the resulting ANYOF node and suppress 15293 * the flag if everything is above 255, as those would be UTF-8 only, 15294 * but this isn't true, as the components that led to that result could 15295 * have been locale-affected, and just happen to cancel each other out 15296 * under UTF-8 locales.) */ 15297 if (in_locale) { 15298 set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET); 15299 15300 assert(OP(node) == ANYOF); 15301 15302 OP(node) = ANYOFL; 15303 ANYOF_FLAGS(node) 15304 |= ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD; 15305 } 15306 15307 if (save_fold) { 15308 RExC_flags |= RXf_PMf_FOLD; 15309 } 15310 15311 RExC_parse = save_parse + 1; 15312 RExC_end = save_end; 15313 SvREFCNT_dec_NN(final); 15314 SvREFCNT_dec_NN(result_string); 15315 15316 nextchar(pRExC_state); 15317 Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */ 15318 return node; 15319 } 15320 #undef IS_OPERATOR 15321 #undef IS_OPERAND 15322 15323 STATIC void 15324 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist) 15325 { 15326 /* This hard-codes the Latin1/above-Latin1 folding rules, so that an 15327 * innocent-looking character class, like /[ks]/i won't have to go out to 15328 * disk to find the possible matches. 15329 * 15330 * This should be called only for a Latin1-range code points, cp, which is 15331 * known to be involved in a simple fold with other code points above 15332 * Latin1. It would give false results if /aa has been specified. 15333 * Multi-char folds are outside the scope of this, and must be handled 15334 * specially. 15335 * 15336 * XXX It would be better to generate these via regen, in case a new 15337 * version of the Unicode standard adds new mappings, though that is not 15338 * really likely, and may be caught by the default: case of the switch 15339 * below. */ 15340 15341 PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS; 15342 15343 assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp)); 15344 15345 switch (cp) { 15346 case 'k': 15347 case 'K': 15348 *invlist = 15349 add_cp_to_invlist(*invlist, KELVIN_SIGN); 15350 break; 15351 case 's': 15352 case 'S': 15353 *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S); 15354 break; 15355 case MICRO_SIGN: 15356 *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU); 15357 *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU); 15358 break; 15359 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE: 15360 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE: 15361 *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN); 15362 break; 15363 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS: 15364 *invlist = add_cp_to_invlist(*invlist, 15365 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS); 15366 break; 15367 15368 #ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */ 15369 15370 case LATIN_SMALL_LETTER_SHARP_S: 15371 *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S); 15372 break; 15373 15374 #endif 15375 15376 #if UNICODE_MAJOR_VERSION < 3 \ 15377 || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0) 15378 15379 /* In 3.0 and earlier, U+0130 folded simply to 'i'; and in 3.0.1 so did 15380 * U+0131. */ 15381 case 'i': 15382 case 'I': 15383 *invlist = 15384 add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE); 15385 # if UNICODE_DOT_DOT_VERSION == 1 15386 *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_DOTLESS_I); 15387 # endif 15388 break; 15389 #endif 15390 15391 default: 15392 /* Use deprecated warning to increase the chances of this being 15393 * output */ 15394 if (PASS2) { 15395 ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp); 15396 } 15397 break; 15398 } 15399 } 15400 15401 STATIC void 15402 S_output_or_return_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings, AV** return_posix_warnings) 15403 { 15404 /* If the final parameter is NULL, output the elements of the array given 15405 * by '*posix_warnings' as REGEXP warnings. Otherwise, the elements are 15406 * pushed onto it, (creating if necessary) */ 15407 15408 SV * msg; 15409 const bool first_is_fatal = ! return_posix_warnings 15410 && ckDEAD(packWARN(WARN_REGEXP)); 15411 15412 PERL_ARGS_ASSERT_OUTPUT_OR_RETURN_POSIX_WARNINGS; 15413 15414 while ((msg = av_shift(posix_warnings)) != &PL_sv_undef) { 15415 if (return_posix_warnings) { 15416 if (! *return_posix_warnings) { /* mortalize to not leak if 15417 warnings are fatal */ 15418 *return_posix_warnings = (AV *) sv_2mortal((SV *) newAV()); 15419 } 15420 av_push(*return_posix_warnings, msg); 15421 } 15422 else { 15423 if (first_is_fatal) { /* Avoid leaking this */ 15424 av_undef(posix_warnings); /* This isn't necessary if the 15425 array is mortal, but is a 15426 fail-safe */ 15427 (void) sv_2mortal(msg); 15428 if (PASS2) { 15429 SAVEFREESV(RExC_rx_sv); 15430 } 15431 } 15432 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s", SvPVX(msg)); 15433 SvREFCNT_dec_NN(msg); 15434 } 15435 } 15436 } 15437 15438 STATIC AV * 15439 S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count) 15440 { 15441 /* This adds the string scalar <multi_string> to the array 15442 * <multi_char_matches>. <multi_string> is known to have exactly 15443 * <cp_count> code points in it. This is used when constructing a 15444 * bracketed character class and we find something that needs to match more 15445 * than a single character. 15446 * 15447 * <multi_char_matches> is actually an array of arrays. Each top-level 15448 * element is an array that contains all the strings known so far that are 15449 * the same length. And that length (in number of code points) is the same 15450 * as the index of the top-level array. Hence, the [2] element is an 15451 * array, each element thereof is a string containing TWO code points; 15452 * while element [3] is for strings of THREE characters, and so on. Since 15453 * this is for multi-char strings there can never be a [0] nor [1] element. 15454 * 15455 * When we rewrite the character class below, we will do so such that the 15456 * longest strings are written first, so that it prefers the longest 15457 * matching strings first. This is done even if it turns out that any 15458 * quantifier is non-greedy, out of this programmer's (khw) laziness. Tom 15459 * Christiansen has agreed that this is ok. This makes the test for the 15460 * ligature 'ffi' come before the test for 'ff', for example */ 15461 15462 AV* this_array; 15463 AV** this_array_ptr; 15464 15465 PERL_ARGS_ASSERT_ADD_MULTI_MATCH; 15466 15467 if (! multi_char_matches) { 15468 multi_char_matches = newAV(); 15469 } 15470 15471 if (av_exists(multi_char_matches, cp_count)) { 15472 this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE); 15473 this_array = *this_array_ptr; 15474 } 15475 else { 15476 this_array = newAV(); 15477 av_store(multi_char_matches, cp_count, 15478 (SV*) this_array); 15479 } 15480 av_push(this_array, multi_string); 15481 15482 return multi_char_matches; 15483 } 15484 15485 /* The names of properties whose definitions are not known at compile time are 15486 * stored in this SV, after a constant heading. So if the length has been 15487 * changed since initialization, then there is a run-time definition. */ 15488 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION \ 15489 (SvCUR(listsv) != initial_listsv_len) 15490 15491 /* There is a restricted set of white space characters that are legal when 15492 * ignoring white space in a bracketed character class. This generates the 15493 * code to skip them. 15494 * 15495 * There is a line below that uses the same white space criteria but is outside 15496 * this macro. Both here and there must use the same definition */ 15497 #define SKIP_BRACKETED_WHITE_SPACE(do_skip, p) \ 15498 STMT_START { \ 15499 if (do_skip) { \ 15500 while (isBLANK_A(UCHARAT(p))) \ 15501 { \ 15502 p++; \ 15503 } \ 15504 } \ 15505 } STMT_END 15506 15507 STATIC regnode * 15508 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, 15509 const bool stop_at_1, /* Just parse the next thing, don't 15510 look for a full character class */ 15511 bool allow_multi_folds, 15512 const bool silence_non_portable, /* Don't output warnings 15513 about too large 15514 characters */ 15515 const bool strict, 15516 bool optimizable, /* ? Allow a non-ANYOF return 15517 node */ 15518 SV** ret_invlist, /* Return an inversion list, not a node */ 15519 AV** return_posix_warnings 15520 ) 15521 { 15522 /* parse a bracketed class specification. Most of these will produce an 15523 * ANYOF node; but something like [a] will produce an EXACT node; [aA], an 15524 * EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex 15525 * under /i with multi-character folds: it will be rewritten following the 15526 * paradigm of this example, where the <multi-fold>s are characters which 15527 * fold to multiple character sequences: 15528 * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i 15529 * gets effectively rewritten as: 15530 * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i 15531 * reg() gets called (recursively) on the rewritten version, and this 15532 * function will return what it constructs. (Actually the <multi-fold>s 15533 * aren't physically removed from the [abcdefghi], it's just that they are 15534 * ignored in the recursion by means of a flag: 15535 * <RExC_in_multi_char_class>.) 15536 * 15537 * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS 15538 * characters, with the corresponding bit set if that character is in the 15539 * list. For characters above this, a range list or swash is used. There 15540 * are extra bits for \w, etc. in locale ANYOFs, as what these match is not 15541 * determinable at compile time 15542 * 15543 * Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs 15544 * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded 15545 * to UTF-8. This can only happen if ret_invlist is non-NULL. 15546 */ 15547 15548 UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE; 15549 IV range = 0; 15550 UV value = OOB_UNICODE, save_value = OOB_UNICODE; 15551 regnode *ret; 15552 STRLEN numlen; 15553 int namedclass = OOB_NAMEDCLASS; 15554 char *rangebegin = NULL; 15555 bool need_class = 0; 15556 SV *listsv = NULL; 15557 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more 15558 than just initialized. */ 15559 SV* properties = NULL; /* Code points that match \p{} \P{} */ 15560 SV* posixes = NULL; /* Code points that match classes like [:word:], 15561 extended beyond the Latin1 range. These have to 15562 be kept separate from other code points for much 15563 of this function because their handling is 15564 different under /i, and for most classes under 15565 /d as well */ 15566 SV* nposixes = NULL; /* Similarly for [:^word:]. These are kept 15567 separate for a while from the non-complemented 15568 versions because of complications with /d 15569 matching */ 15570 SV* simple_posixes = NULL; /* But under some conditions, the classes can be 15571 treated more simply than the general case, 15572 leading to less compilation and execution 15573 work */ 15574 UV element_count = 0; /* Number of distinct elements in the class. 15575 Optimizations may be possible if this is tiny */ 15576 AV * multi_char_matches = NULL; /* Code points that fold to more than one 15577 character; used under /i */ 15578 UV n; 15579 char * stop_ptr = RExC_end; /* where to stop parsing */ 15580 const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white 15581 space? */ 15582 15583 /* Unicode properties are stored in a swash; this holds the current one 15584 * being parsed. If this swash is the only above-latin1 component of the 15585 * character class, an optimization is to pass it directly on to the 15586 * execution engine. Otherwise, it is set to NULL to indicate that there 15587 * are other things in the class that have to be dealt with at execution 15588 * time */ 15589 SV* swash = NULL; /* Code points that match \p{} \P{} */ 15590 15591 /* Set if a component of this character class is user-defined; just passed 15592 * on to the engine */ 15593 bool has_user_defined_property = FALSE; 15594 15595 /* inversion list of code points this node matches only when the target 15596 * string is in UTF-8. These are all non-ASCII, < 256. (Because is under 15597 * /d) */ 15598 SV* has_upper_latin1_only_utf8_matches = NULL; 15599 15600 /* Inversion list of code points this node matches regardless of things 15601 * like locale, folding, utf8ness of the target string */ 15602 SV* cp_list = NULL; 15603 15604 /* Like cp_list, but code points on this list need to be checked for things 15605 * that fold to/from them under /i */ 15606 SV* cp_foldable_list = NULL; 15607 15608 /* Like cp_list, but code points on this list are valid only when the 15609 * runtime locale is UTF-8 */ 15610 SV* only_utf8_locale_list = NULL; 15611 15612 /* In a range, if one of the endpoints is non-character-set portable, 15613 * meaning that it hard-codes a code point that may mean a different 15614 * charactger in ASCII vs. EBCDIC, as opposed to, say, a literal 'A' or a 15615 * mnemonic '\t' which each mean the same character no matter which 15616 * character set the platform is on. */ 15617 unsigned int non_portable_endpoint = 0; 15618 15619 /* Is the range unicode? which means on a platform that isn't 1-1 native 15620 * to Unicode (i.e. non-ASCII), each code point in it should be considered 15621 * to be a Unicode value. */ 15622 bool unicode_range = FALSE; 15623 bool invert = FALSE; /* Is this class to be complemented */ 15624 15625 bool warn_super = ALWAYS_WARN_SUPER; 15626 15627 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in 15628 case we need to change the emitted regop to an EXACT. */ 15629 const char * orig_parse = RExC_parse; 15630 const SSize_t orig_size = RExC_size; 15631 bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */ 15632 15633 /* This variable is used to mark where the end in the input is of something 15634 * that looks like a POSIX construct but isn't. During the parse, when 15635 * something looks like it could be such a construct is encountered, it is 15636 * checked for being one, but not if we've already checked this area of the 15637 * input. Only after this position is reached do we check again */ 15638 char *not_posix_region_end = RExC_parse - 1; 15639 15640 AV* posix_warnings = NULL; 15641 const bool do_posix_warnings = return_posix_warnings 15642 || (PASS2 && ckWARN(WARN_REGEXP)); 15643 15644 GET_RE_DEBUG_FLAGS_DECL; 15645 15646 PERL_ARGS_ASSERT_REGCLASS; 15647 #ifndef DEBUGGING 15648 PERL_UNUSED_ARG(depth); 15649 #endif 15650 15651 DEBUG_PARSE("clas"); 15652 15653 #if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */ \ 15654 || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0 \ 15655 && UNICODE_DOT_DOT_VERSION == 0) 15656 allow_multi_folds = FALSE; 15657 #endif 15658 15659 /* Assume we are going to generate an ANYOF node. */ 15660 ret = reganode(pRExC_state, 15661 (LOC) 15662 ? ANYOFL 15663 : ANYOF, 15664 0); 15665 15666 if (SIZE_ONLY) { 15667 RExC_size += ANYOF_SKIP; 15668 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */ 15669 } 15670 else { 15671 ANYOF_FLAGS(ret) = 0; 15672 15673 RExC_emit += ANYOF_SKIP; 15674 listsv = newSVpvs_flags("# comment\n", SVs_TEMP); 15675 initial_listsv_len = SvCUR(listsv); 15676 SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */ 15677 } 15678 15679 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse); 15680 15681 assert(RExC_parse <= RExC_end); 15682 15683 if (UCHARAT(RExC_parse) == '^') { /* Complement the class */ 15684 RExC_parse++; 15685 invert = TRUE; 15686 allow_multi_folds = FALSE; 15687 MARK_NAUGHTY(1); 15688 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse); 15689 } 15690 15691 /* Check that they didn't say [:posix:] instead of [[:posix:]] */ 15692 if (! ret_invlist && MAYBE_POSIXCC(UCHARAT(RExC_parse))) { 15693 int maybe_class = handle_possible_posix(pRExC_state, 15694 RExC_parse, 15695 ¬_posix_region_end, 15696 NULL, 15697 TRUE /* checking only */); 15698 if (PASS2 && maybe_class >= OOB_NAMEDCLASS && do_posix_warnings) { 15699 SAVEFREESV(RExC_rx_sv); 15700 ckWARN4reg(not_posix_region_end, 15701 "POSIX syntax [%c %c] belongs inside character classes%s", 15702 *RExC_parse, *RExC_parse, 15703 (maybe_class == OOB_NAMEDCLASS) 15704 ? ((POSIXCC_NOTYET(*RExC_parse)) 15705 ? " (but this one isn't implemented)" 15706 : " (but this one isn't fully valid)") 15707 : "" 15708 ); 15709 (void)ReREFCNT_inc(RExC_rx_sv); 15710 } 15711 } 15712 15713 /* If the caller wants us to just parse a single element, accomplish this 15714 * by faking the loop ending condition */ 15715 if (stop_at_1 && RExC_end > RExC_parse) { 15716 stop_ptr = RExC_parse + 1; 15717 } 15718 15719 /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */ 15720 if (UCHARAT(RExC_parse) == ']') 15721 goto charclassloop; 15722 15723 while (1) { 15724 15725 if ( posix_warnings 15726 && av_tindex_nomg(posix_warnings) >= 0 15727 && RExC_parse > not_posix_region_end) 15728 { 15729 /* Warnings about posix class issues are considered tentative until 15730 * we are far enough along in the parse that we can no longer 15731 * change our mind, at which point we either output them or add 15732 * them, if it has so specified, to what gets returned to the 15733 * caller. This is done each time through the loop so that a later 15734 * class won't zap them before they have been dealt with. */ 15735 output_or_return_posix_warnings(pRExC_state, posix_warnings, 15736 return_posix_warnings); 15737 } 15738 15739 if (RExC_parse >= stop_ptr) { 15740 break; 15741 } 15742 15743 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse); 15744 15745 if (UCHARAT(RExC_parse) == ']') { 15746 break; 15747 } 15748 15749 charclassloop: 15750 15751 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */ 15752 save_value = value; 15753 save_prevvalue = prevvalue; 15754 15755 if (!range) { 15756 rangebegin = RExC_parse; 15757 element_count++; 15758 non_portable_endpoint = 0; 15759 } 15760 if (UTF && ! UTF8_IS_INVARIANT(* RExC_parse)) { 15761 value = utf8n_to_uvchr((U8*)RExC_parse, 15762 RExC_end - RExC_parse, 15763 &numlen, UTF8_ALLOW_DEFAULT); 15764 RExC_parse += numlen; 15765 } 15766 else 15767 value = UCHARAT(RExC_parse++); 15768 15769 if (value == '[') { 15770 char * posix_class_end; 15771 namedclass = handle_possible_posix(pRExC_state, 15772 RExC_parse, 15773 &posix_class_end, 15774 do_posix_warnings ? &posix_warnings : NULL, 15775 FALSE /* die if error */); 15776 if (namedclass > OOB_NAMEDCLASS) { 15777 15778 /* If there was an earlier attempt to parse this particular 15779 * posix class, and it failed, it was a false alarm, as this 15780 * successful one proves */ 15781 if ( posix_warnings 15782 && av_tindex_nomg(posix_warnings) >= 0 15783 && not_posix_region_end >= RExC_parse 15784 && not_posix_region_end <= posix_class_end) 15785 { 15786 av_undef(posix_warnings); 15787 } 15788 15789 RExC_parse = posix_class_end; 15790 } 15791 else if (namedclass == OOB_NAMEDCLASS) { 15792 not_posix_region_end = posix_class_end; 15793 } 15794 else { 15795 namedclass = OOB_NAMEDCLASS; 15796 } 15797 } 15798 else if ( RExC_parse - 1 > not_posix_region_end 15799 && MAYBE_POSIXCC(value)) 15800 { 15801 (void) handle_possible_posix( 15802 pRExC_state, 15803 RExC_parse - 1, /* -1 because parse has already been 15804 advanced */ 15805 ¬_posix_region_end, 15806 do_posix_warnings ? &posix_warnings : NULL, 15807 TRUE /* checking only */); 15808 } 15809 else if (value == '\\') { 15810 /* Is a backslash; get the code point of the char after it */ 15811 15812 if (RExC_parse >= RExC_end) { 15813 vFAIL("Unmatched ["); 15814 } 15815 15816 if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) { 15817 value = utf8n_to_uvchr((U8*)RExC_parse, 15818 RExC_end - RExC_parse, 15819 &numlen, UTF8_ALLOW_DEFAULT); 15820 RExC_parse += numlen; 15821 } 15822 else 15823 value = UCHARAT(RExC_parse++); 15824 15825 /* Some compilers cannot handle switching on 64-bit integer 15826 * values, therefore value cannot be an UV. Yes, this will 15827 * be a problem later if we want switch on Unicode. 15828 * A similar issue a little bit later when switching on 15829 * namedclass. --jhi */ 15830 15831 /* If the \ is escaping white space when white space is being 15832 * skipped, it means that that white space is wanted literally, and 15833 * is already in 'value'. Otherwise, need to translate the escape 15834 * into what it signifies. */ 15835 if (! skip_white || ! isBLANK_A(value)) switch ((I32)value) { 15836 15837 case 'w': namedclass = ANYOF_WORDCHAR; break; 15838 case 'W': namedclass = ANYOF_NWORDCHAR; break; 15839 case 's': namedclass = ANYOF_SPACE; break; 15840 case 'S': namedclass = ANYOF_NSPACE; break; 15841 case 'd': namedclass = ANYOF_DIGIT; break; 15842 case 'D': namedclass = ANYOF_NDIGIT; break; 15843 case 'v': namedclass = ANYOF_VERTWS; break; 15844 case 'V': namedclass = ANYOF_NVERTWS; break; 15845 case 'h': namedclass = ANYOF_HORIZWS; break; 15846 case 'H': namedclass = ANYOF_NHORIZWS; break; 15847 case 'N': /* Handle \N{NAME} in class */ 15848 { 15849 const char * const backslash_N_beg = RExC_parse - 2; 15850 int cp_count; 15851 15852 if (! grok_bslash_N(pRExC_state, 15853 NULL, /* No regnode */ 15854 &value, /* Yes single value */ 15855 &cp_count, /* Multiple code pt count */ 15856 flagp, 15857 strict, 15858 depth) 15859 ) { 15860 15861 if (*flagp & NEED_UTF8) 15862 FAIL("panic: grok_bslash_N set NEED_UTF8"); 15863 if (*flagp & RESTART_PASS1) 15864 return NULL; 15865 15866 if (cp_count < 0) { 15867 vFAIL("\\N in a character class must be a named character: \\N{...}"); 15868 } 15869 else if (cp_count == 0) { 15870 if (PASS2) { 15871 ckWARNreg(RExC_parse, 15872 "Ignoring zero length \\N{} in character class"); 15873 } 15874 } 15875 else { /* cp_count > 1 */ 15876 if (! RExC_in_multi_char_class) { 15877 if (invert || range || *RExC_parse == '-') { 15878 if (strict) { 15879 RExC_parse--; 15880 vFAIL("\\N{} in inverted character class or as a range end-point is restricted to one character"); 15881 } 15882 else if (PASS2) { 15883 ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class"); 15884 } 15885 break; /* <value> contains the first code 15886 point. Drop out of the switch to 15887 process it */ 15888 } 15889 else { 15890 SV * multi_char_N = newSVpvn(backslash_N_beg, 15891 RExC_parse - backslash_N_beg); 15892 multi_char_matches 15893 = add_multi_match(multi_char_matches, 15894 multi_char_N, 15895 cp_count); 15896 } 15897 } 15898 } /* End of cp_count != 1 */ 15899 15900 /* This element should not be processed further in this 15901 * class */ 15902 element_count--; 15903 value = save_value; 15904 prevvalue = save_prevvalue; 15905 continue; /* Back to top of loop to get next char */ 15906 } 15907 15908 /* Here, is a single code point, and <value> contains it */ 15909 unicode_range = TRUE; /* \N{} are Unicode */ 15910 } 15911 break; 15912 case 'p': 15913 case 'P': 15914 { 15915 char *e; 15916 15917 /* We will handle any undefined properties ourselves */ 15918 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF 15919 /* And we actually would prefer to get 15920 * the straight inversion list of the 15921 * swash, since we will be accessing it 15922 * anyway, to save a little time */ 15923 |_CORE_SWASH_INIT_ACCEPT_INVLIST; 15924 15925 if (RExC_parse >= RExC_end) 15926 vFAIL2("Empty \\%c", (U8)value); 15927 if (*RExC_parse == '{') { 15928 const U8 c = (U8)value; 15929 e = strchr(RExC_parse, '}'); 15930 if (!e) { 15931 RExC_parse++; 15932 vFAIL2("Missing right brace on \\%c{}", c); 15933 } 15934 15935 RExC_parse++; 15936 while (isSPACE(*RExC_parse)) { 15937 RExC_parse++; 15938 } 15939 15940 if (UCHARAT(RExC_parse) == '^') { 15941 15942 /* toggle. (The rhs xor gets the single bit that 15943 * differs between P and p; the other xor inverts just 15944 * that bit) */ 15945 value ^= 'P' ^ 'p'; 15946 15947 RExC_parse++; 15948 while (isSPACE(*RExC_parse)) { 15949 RExC_parse++; 15950 } 15951 } 15952 15953 if (e == RExC_parse) 15954 vFAIL2("Empty \\%c{}", c); 15955 15956 n = e - RExC_parse; 15957 while (isSPACE(*(RExC_parse + n - 1))) 15958 n--; 15959 } /* The \p isn't immediately followed by a '{' */ 15960 else if (! isALPHA(*RExC_parse)) { 15961 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; 15962 vFAIL2("Character following \\%c must be '{' or a " 15963 "single-character Unicode property name", 15964 (U8) value); 15965 } 15966 else { 15967 e = RExC_parse; 15968 n = 1; 15969 } 15970 if (!SIZE_ONLY) { 15971 SV* invlist; 15972 char* name; 15973 char* base_name; /* name after any packages are stripped */ 15974 char* lookup_name = NULL; 15975 const char * const colon_colon = "::"; 15976 15977 /* Try to get the definition of the property into 15978 * <invlist>. If /i is in effect, the effective property 15979 * will have its name be <__NAME_i>. The design is 15980 * discussed in commit 15981 * 2f833f5208e26b208886e51e09e2c072b5eabb46 */ 15982 name = savepv(Perl_form(aTHX_ "%.*s", (int)n, RExC_parse)); 15983 SAVEFREEPV(name); 15984 if (FOLD) { 15985 lookup_name = savepv(Perl_form(aTHX_ "__%s_i", name)); 15986 15987 /* The function call just below that uses this can fail 15988 * to return, leaking memory if we don't do this */ 15989 SAVEFREEPV(lookup_name); 15990 } 15991 15992 /* Look up the property name, and get its swash and 15993 * inversion list, if the property is found */ 15994 SvREFCNT_dec(swash); /* Free any left-overs */ 15995 swash = _core_swash_init("utf8", 15996 (lookup_name) 15997 ? lookup_name 15998 : name, 15999 &PL_sv_undef, 16000 1, /* binary */ 16001 0, /* not tr/// */ 16002 NULL, /* No inversion list */ 16003 &swash_init_flags 16004 ); 16005 if (! swash || ! (invlist = _get_swash_invlist(swash))) { 16006 HV* curpkg = (IN_PERL_COMPILETIME) 16007 ? PL_curstash 16008 : CopSTASH(PL_curcop); 16009 UV final_n = n; 16010 bool has_pkg; 16011 16012 if (swash) { /* Got a swash but no inversion list. 16013 Something is likely wrong that will 16014 be sorted-out later */ 16015 SvREFCNT_dec_NN(swash); 16016 swash = NULL; 16017 } 16018 16019 /* Here didn't find it. It could be a an error (like a 16020 * typo) in specifying a Unicode property, or it could 16021 * be a user-defined property that will be available at 16022 * run-time. The names of these must begin with 'In' 16023 * or 'Is' (after any packages are stripped off). So 16024 * if not one of those, or if we accept only 16025 * compile-time properties, is an error; otherwise add 16026 * it to the list for run-time look up. */ 16027 if ((base_name = rninstr(name, name + n, 16028 colon_colon, colon_colon + 2))) 16029 { /* Has ::. We know this must be a user-defined 16030 property */ 16031 base_name += 2; 16032 final_n -= base_name - name; 16033 has_pkg = TRUE; 16034 } 16035 else { 16036 base_name = name; 16037 has_pkg = FALSE; 16038 } 16039 16040 if ( final_n < 3 16041 || base_name[0] != 'I' 16042 || (base_name[1] != 's' && base_name[1] != 'n') 16043 || ret_invlist) 16044 { 16045 const char * const msg 16046 = (has_pkg) 16047 ? "Illegal user-defined property name" 16048 : "Can't find Unicode property definition"; 16049 RExC_parse = e + 1; 16050 16051 /* diag_listed_as: Can't find Unicode property definition "%s" */ 16052 vFAIL3utf8f("%s \"%"UTF8f"\"", 16053 msg, UTF8fARG(UTF, n, name)); 16054 } 16055 16056 /* If the property name doesn't already have a package 16057 * name, add the current one to it so that it can be 16058 * referred to outside it. [perl #121777] */ 16059 if (! has_pkg && curpkg) { 16060 char* pkgname = HvNAME(curpkg); 16061 if (strNE(pkgname, "main")) { 16062 char* full_name = Perl_form(aTHX_ 16063 "%s::%s", 16064 pkgname, 16065 name); 16066 n = strlen(full_name); 16067 name = savepvn(full_name, n); 16068 SAVEFREEPV(name); 16069 } 16070 } 16071 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s%"UTF8f"%s\n", 16072 (value == 'p' ? '+' : '!'), 16073 (FOLD) ? "__" : "", 16074 UTF8fARG(UTF, n, name), 16075 (FOLD) ? "_i" : ""); 16076 has_user_defined_property = TRUE; 16077 optimizable = FALSE; /* Will have to leave this an 16078 ANYOF node */ 16079 16080 /* We don't know yet what this matches, so have to flag 16081 * it */ 16082 ANYOF_FLAGS(ret) |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP; 16083 } 16084 else { 16085 16086 /* Here, did get the swash and its inversion list. If 16087 * the swash is from a user-defined property, then this 16088 * whole character class should be regarded as such */ 16089 if (swash_init_flags 16090 & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY) 16091 { 16092 has_user_defined_property = TRUE; 16093 } 16094 else if 16095 /* We warn on matching an above-Unicode code point 16096 * if the match would return true, except don't 16097 * warn for \p{All}, which has exactly one element 16098 * = 0 */ 16099 (_invlist_contains_cp(invlist, 0x110000) 16100 && (! (_invlist_len(invlist) == 1 16101 && *invlist_array(invlist) == 0))) 16102 { 16103 warn_super = TRUE; 16104 } 16105 16106 16107 /* Invert if asking for the complement */ 16108 if (value == 'P') { 16109 _invlist_union_complement_2nd(properties, 16110 invlist, 16111 &properties); 16112 16113 /* The swash can't be used as-is, because we've 16114 * inverted things; delay removing it to here after 16115 * have copied its invlist above */ 16116 SvREFCNT_dec_NN(swash); 16117 swash = NULL; 16118 } 16119 else { 16120 _invlist_union(properties, invlist, &properties); 16121 } 16122 } 16123 } 16124 RExC_parse = e + 1; 16125 namedclass = ANYOF_UNIPROP; /* no official name, but it's 16126 named */ 16127 16128 /* \p means they want Unicode semantics */ 16129 REQUIRE_UNI_RULES(flagp, NULL); 16130 } 16131 break; 16132 case 'n': value = '\n'; break; 16133 case 'r': value = '\r'; break; 16134 case 't': value = '\t'; break; 16135 case 'f': value = '\f'; break; 16136 case 'b': value = '\b'; break; 16137 case 'e': value = ESC_NATIVE; break; 16138 case 'a': value = '\a'; break; 16139 case 'o': 16140 RExC_parse--; /* function expects to be pointed at the 'o' */ 16141 { 16142 const char* error_msg; 16143 bool valid = grok_bslash_o(&RExC_parse, 16144 &value, 16145 &error_msg, 16146 PASS2, /* warnings only in 16147 pass 2 */ 16148 strict, 16149 silence_non_portable, 16150 UTF); 16151 if (! valid) { 16152 vFAIL(error_msg); 16153 } 16154 } 16155 non_portable_endpoint++; 16156 if (IN_ENCODING && value < 0x100) { 16157 goto recode_encoding; 16158 } 16159 break; 16160 case 'x': 16161 RExC_parse--; /* function expects to be pointed at the 'x' */ 16162 { 16163 const char* error_msg; 16164 bool valid = grok_bslash_x(&RExC_parse, 16165 &value, 16166 &error_msg, 16167 PASS2, /* Output warnings */ 16168 strict, 16169 silence_non_portable, 16170 UTF); 16171 if (! valid) { 16172 vFAIL(error_msg); 16173 } 16174 } 16175 non_portable_endpoint++; 16176 if (IN_ENCODING && value < 0x100) 16177 goto recode_encoding; 16178 break; 16179 case 'c': 16180 value = grok_bslash_c(*RExC_parse++, PASS2); 16181 non_portable_endpoint++; 16182 break; 16183 case '0': case '1': case '2': case '3': case '4': 16184 case '5': case '6': case '7': 16185 { 16186 /* Take 1-3 octal digits */ 16187 I32 flags = PERL_SCAN_SILENT_ILLDIGIT; 16188 numlen = (strict) ? 4 : 3; 16189 value = grok_oct(--RExC_parse, &numlen, &flags, NULL); 16190 RExC_parse += numlen; 16191 if (numlen != 3) { 16192 if (strict) { 16193 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; 16194 vFAIL("Need exactly 3 octal digits"); 16195 } 16196 else if (! SIZE_ONLY /* like \08, \178 */ 16197 && numlen < 3 16198 && RExC_parse < RExC_end 16199 && isDIGIT(*RExC_parse) 16200 && ckWARN(WARN_REGEXP)) 16201 { 16202 SAVEFREESV(RExC_rx_sv); 16203 reg_warn_non_literal_string( 16204 RExC_parse + 1, 16205 form_short_octal_warning(RExC_parse, numlen)); 16206 (void)ReREFCNT_inc(RExC_rx_sv); 16207 } 16208 } 16209 non_portable_endpoint++; 16210 if (IN_ENCODING && value < 0x100) 16211 goto recode_encoding; 16212 break; 16213 } 16214 recode_encoding: 16215 if (! RExC_override_recoding) { 16216 SV* enc = _get_encoding(); 16217 value = reg_recode((U8)value, &enc); 16218 if (!enc) { 16219 if (strict) { 16220 vFAIL("Invalid escape in the specified encoding"); 16221 } 16222 else if (PASS2) { 16223 ckWARNreg(RExC_parse, 16224 "Invalid escape in the specified encoding"); 16225 } 16226 } 16227 break; 16228 } 16229 default: 16230 /* Allow \_ to not give an error */ 16231 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') { 16232 if (strict) { 16233 vFAIL2("Unrecognized escape \\%c in character class", 16234 (int)value); 16235 } 16236 else { 16237 SAVEFREESV(RExC_rx_sv); 16238 ckWARN2reg(RExC_parse, 16239 "Unrecognized escape \\%c in character class passed through", 16240 (int)value); 16241 (void)ReREFCNT_inc(RExC_rx_sv); 16242 } 16243 } 16244 break; 16245 } /* End of switch on char following backslash */ 16246 } /* end of handling backslash escape sequences */ 16247 16248 /* Here, we have the current token in 'value' */ 16249 16250 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */ 16251 U8 classnum; 16252 16253 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a 16254 * literal, as is the character that began the false range, i.e. 16255 * the 'a' in the examples */ 16256 if (range) { 16257 if (!SIZE_ONLY) { 16258 const int w = (RExC_parse >= rangebegin) 16259 ? RExC_parse - rangebegin 16260 : 0; 16261 if (strict) { 16262 vFAIL2utf8f( 16263 "False [] range \"%"UTF8f"\"", 16264 UTF8fARG(UTF, w, rangebegin)); 16265 } 16266 else { 16267 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */ 16268 ckWARN2reg(RExC_parse, 16269 "False [] range \"%"UTF8f"\"", 16270 UTF8fARG(UTF, w, rangebegin)); 16271 (void)ReREFCNT_inc(RExC_rx_sv); 16272 cp_list = add_cp_to_invlist(cp_list, '-'); 16273 cp_foldable_list = add_cp_to_invlist(cp_foldable_list, 16274 prevvalue); 16275 } 16276 } 16277 16278 range = 0; /* this was not a true range */ 16279 element_count += 2; /* So counts for three values */ 16280 } 16281 16282 classnum = namedclass_to_classnum(namedclass); 16283 16284 if (LOC && namedclass < ANYOF_POSIXL_MAX 16285 #ifndef HAS_ISASCII 16286 && classnum != _CC_ASCII 16287 #endif 16288 ) { 16289 /* What the Posix classes (like \w, [:space:]) match in locale 16290 * isn't knowable under locale until actual match time. Room 16291 * must be reserved (one time per outer bracketed class) to 16292 * store such classes. The space will contain a bit for each 16293 * named class that is to be matched against. This isn't 16294 * needed for \p{} and pseudo-classes, as they are not affected 16295 * by locale, and hence are dealt with separately */ 16296 if (! need_class) { 16297 need_class = 1; 16298 if (SIZE_ONLY) { 16299 RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP; 16300 } 16301 else { 16302 RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP; 16303 } 16304 ANYOF_FLAGS(ret) |= ANYOF_MATCHES_POSIXL; 16305 ANYOF_POSIXL_ZERO(ret); 16306 16307 /* We can't change this into some other type of node 16308 * (unless this is the only element, in which case there 16309 * are nodes that mean exactly this) as has runtime 16310 * dependencies */ 16311 optimizable = FALSE; 16312 } 16313 16314 /* Coverity thinks it is possible for this to be negative; both 16315 * jhi and khw think it's not, but be safer */ 16316 assert(! (ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL) 16317 || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0); 16318 16319 /* See if it already matches the complement of this POSIX 16320 * class */ 16321 if ((ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL) 16322 && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2) 16323 ? -1 16324 : 1))) 16325 { 16326 posixl_matches_all = TRUE; 16327 break; /* No need to continue. Since it matches both 16328 e.g., \w and \W, it matches everything, and the 16329 bracketed class can be optimized into qr/./s */ 16330 } 16331 16332 /* Add this class to those that should be checked at runtime */ 16333 ANYOF_POSIXL_SET(ret, namedclass); 16334 16335 /* The above-Latin1 characters are not subject to locale rules. 16336 * Just add them, in the second pass, to the 16337 * unconditionally-matched list */ 16338 if (! SIZE_ONLY) { 16339 SV* scratch_list = NULL; 16340 16341 /* Get the list of the above-Latin1 code points this 16342 * matches */ 16343 _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1, 16344 PL_XPosix_ptrs[classnum], 16345 16346 /* Odd numbers are complements, like 16347 * NDIGIT, NASCII, ... */ 16348 namedclass % 2 != 0, 16349 &scratch_list); 16350 /* Checking if 'cp_list' is NULL first saves an extra 16351 * clone. Its reference count will be decremented at the 16352 * next union, etc, or if this is the only instance, at the 16353 * end of the routine */ 16354 if (! cp_list) { 16355 cp_list = scratch_list; 16356 } 16357 else { 16358 _invlist_union(cp_list, scratch_list, &cp_list); 16359 SvREFCNT_dec_NN(scratch_list); 16360 } 16361 continue; /* Go get next character */ 16362 } 16363 } 16364 else if (! SIZE_ONLY) { 16365 16366 /* Here, not in pass1 (in that pass we skip calculating the 16367 * contents of this class), and is /l, or is a POSIX class for 16368 * which /l doesn't matter (or is a Unicode property, which is 16369 * skipped here). */ 16370 if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */ 16371 if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */ 16372 16373 /* Here, should be \h, \H, \v, or \V. None of /d, /i 16374 * nor /l make a difference in what these match, 16375 * therefore we just add what they match to cp_list. */ 16376 if (classnum != _CC_VERTSPACE) { 16377 assert( namedclass == ANYOF_HORIZWS 16378 || namedclass == ANYOF_NHORIZWS); 16379 16380 /* It turns out that \h is just a synonym for 16381 * XPosixBlank */ 16382 classnum = _CC_BLANK; 16383 } 16384 16385 _invlist_union_maybe_complement_2nd( 16386 cp_list, 16387 PL_XPosix_ptrs[classnum], 16388 namedclass % 2 != 0, /* Complement if odd 16389 (NHORIZWS, NVERTWS) 16390 */ 16391 &cp_list); 16392 } 16393 } 16394 else if (UNI_SEMANTICS 16395 || classnum == _CC_ASCII 16396 || (DEPENDS_SEMANTICS && (classnum == _CC_DIGIT 16397 || classnum == _CC_XDIGIT))) 16398 { 16399 /* We usually have to worry about /d and /a affecting what 16400 * POSIX classes match, with special code needed for /d 16401 * because we won't know until runtime what all matches. 16402 * But there is no extra work needed under /u, and 16403 * [:ascii:] is unaffected by /a and /d; and :digit: and 16404 * :xdigit: don't have runtime differences under /d. So we 16405 * can special case these, and avoid some extra work below, 16406 * and at runtime. */ 16407 _invlist_union_maybe_complement_2nd( 16408 simple_posixes, 16409 PL_XPosix_ptrs[classnum], 16410 namedclass % 2 != 0, 16411 &simple_posixes); 16412 } 16413 else { /* Garden variety class. If is NUPPER, NALPHA, ... 16414 complement and use nposixes */ 16415 SV** posixes_ptr = namedclass % 2 == 0 16416 ? &posixes 16417 : &nposixes; 16418 _invlist_union_maybe_complement_2nd( 16419 *posixes_ptr, 16420 PL_XPosix_ptrs[classnum], 16421 namedclass % 2 != 0, 16422 posixes_ptr); 16423 } 16424 } 16425 } /* end of namedclass \blah */ 16426 16427 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse); 16428 16429 /* If 'range' is set, 'value' is the ending of a range--check its 16430 * validity. (If value isn't a single code point in the case of a 16431 * range, we should have figured that out above in the code that 16432 * catches false ranges). Later, we will handle each individual code 16433 * point in the range. If 'range' isn't set, this could be the 16434 * beginning of a range, so check for that by looking ahead to see if 16435 * the next real character to be processed is the range indicator--the 16436 * minus sign */ 16437 16438 if (range) { 16439 #ifdef EBCDIC 16440 /* For unicode ranges, we have to test that the Unicode as opposed 16441 * to the native values are not decreasing. (Above 255, there is 16442 * no difference between native and Unicode) */ 16443 if (unicode_range && prevvalue < 255 && value < 255) { 16444 if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) { 16445 goto backwards_range; 16446 } 16447 } 16448 else 16449 #endif 16450 if (prevvalue > value) /* b-a */ { 16451 int w; 16452 #ifdef EBCDIC 16453 backwards_range: 16454 #endif 16455 w = RExC_parse - rangebegin; 16456 vFAIL2utf8f( 16457 "Invalid [] range \"%"UTF8f"\"", 16458 UTF8fARG(UTF, w, rangebegin)); 16459 NOT_REACHED; /* NOTREACHED */ 16460 } 16461 } 16462 else { 16463 prevvalue = value; /* save the beginning of the potential range */ 16464 if (! stop_at_1 /* Can't be a range if parsing just one thing */ 16465 && *RExC_parse == '-') 16466 { 16467 char* next_char_ptr = RExC_parse + 1; 16468 16469 /* Get the next real char after the '-' */ 16470 SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr); 16471 16472 /* If the '-' is at the end of the class (just before the ']', 16473 * it is a literal minus; otherwise it is a range */ 16474 if (next_char_ptr < RExC_end && *next_char_ptr != ']') { 16475 RExC_parse = next_char_ptr; 16476 16477 /* a bad range like \w-, [:word:]- ? */ 16478 if (namedclass > OOB_NAMEDCLASS) { 16479 if (strict || (PASS2 && ckWARN(WARN_REGEXP))) { 16480 const int w = RExC_parse >= rangebegin 16481 ? RExC_parse - rangebegin 16482 : 0; 16483 if (strict) { 16484 vFAIL4("False [] range \"%*.*s\"", 16485 w, w, rangebegin); 16486 } 16487 else if (PASS2) { 16488 vWARN4(RExC_parse, 16489 "False [] range \"%*.*s\"", 16490 w, w, rangebegin); 16491 } 16492 } 16493 if (!SIZE_ONLY) { 16494 cp_list = add_cp_to_invlist(cp_list, '-'); 16495 } 16496 element_count++; 16497 } else 16498 range = 1; /* yeah, it's a range! */ 16499 continue; /* but do it the next time */ 16500 } 16501 } 16502 } 16503 16504 if (namedclass > OOB_NAMEDCLASS) { 16505 continue; 16506 } 16507 16508 /* Here, we have a single value this time through the loop, and 16509 * <prevvalue> is the beginning of the range, if any; or <value> if 16510 * not. */ 16511 16512 /* non-Latin1 code point implies unicode semantics. Must be set in 16513 * pass1 so is there for the whole of pass 2 */ 16514 if (value > 255) { 16515 REQUIRE_UNI_RULES(flagp, NULL); 16516 } 16517 16518 /* Ready to process either the single value, or the completed range. 16519 * For single-valued non-inverted ranges, we consider the possibility 16520 * of multi-char folds. (We made a conscious decision to not do this 16521 * for the other cases because it can often lead to non-intuitive 16522 * results. For example, you have the peculiar case that: 16523 * "s s" =~ /^[^\xDF]+$/i => Y 16524 * "ss" =~ /^[^\xDF]+$/i => N 16525 * 16526 * See [perl #89750] */ 16527 if (FOLD && allow_multi_folds && value == prevvalue) { 16528 if (value == LATIN_SMALL_LETTER_SHARP_S 16529 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold, 16530 value))) 16531 { 16532 /* Here <value> is indeed a multi-char fold. Get what it is */ 16533 16534 U8 foldbuf[UTF8_MAXBYTES_CASE]; 16535 STRLEN foldlen; 16536 16537 UV folded = _to_uni_fold_flags( 16538 value, 16539 foldbuf, 16540 &foldlen, 16541 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED 16542 ? FOLD_FLAGS_NOMIX_ASCII 16543 : 0) 16544 ); 16545 16546 /* Here, <folded> should be the first character of the 16547 * multi-char fold of <value>, with <foldbuf> containing the 16548 * whole thing. But, if this fold is not allowed (because of 16549 * the flags), <fold> will be the same as <value>, and should 16550 * be processed like any other character, so skip the special 16551 * handling */ 16552 if (folded != value) { 16553 16554 /* Skip if we are recursed, currently parsing the class 16555 * again. Otherwise add this character to the list of 16556 * multi-char folds. */ 16557 if (! RExC_in_multi_char_class) { 16558 STRLEN cp_count = utf8_length(foldbuf, 16559 foldbuf + foldlen); 16560 SV* multi_fold = sv_2mortal(newSVpvs("")); 16561 16562 Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value); 16563 16564 multi_char_matches 16565 = add_multi_match(multi_char_matches, 16566 multi_fold, 16567 cp_count); 16568 16569 } 16570 16571 /* This element should not be processed further in this 16572 * class */ 16573 element_count--; 16574 value = save_value; 16575 prevvalue = save_prevvalue; 16576 continue; 16577 } 16578 } 16579 } 16580 16581 if (strict && PASS2 && ckWARN(WARN_REGEXP)) { 16582 if (range) { 16583 16584 /* If the range starts above 255, everything is portable and 16585 * likely to be so for any forseeable character set, so don't 16586 * warn. */ 16587 if (unicode_range && non_portable_endpoint && prevvalue < 256) { 16588 vWARN(RExC_parse, "Both or neither range ends should be Unicode"); 16589 } 16590 else if (prevvalue != value) { 16591 16592 /* Under strict, ranges that stop and/or end in an ASCII 16593 * printable should have each end point be a portable value 16594 * for it (preferably like 'A', but we don't warn if it is 16595 * a (portable) Unicode name or code point), and the range 16596 * must be be all digits or all letters of the same case. 16597 * Otherwise, the range is non-portable and unclear as to 16598 * what it contains */ 16599 if ((isPRINT_A(prevvalue) || isPRINT_A(value)) 16600 && (non_portable_endpoint 16601 || ! ((isDIGIT_A(prevvalue) && isDIGIT_A(value)) 16602 || (isLOWER_A(prevvalue) && isLOWER_A(value)) 16603 || (isUPPER_A(prevvalue) && isUPPER_A(value))))) 16604 { 16605 vWARN(RExC_parse, "Ranges of ASCII printables should be some subset of \"0-9\", \"A-Z\", or \"a-z\""); 16606 } 16607 else if (prevvalue >= 0x660) { /* ARABIC_INDIC_DIGIT_ZERO */ 16608 16609 /* But the nature of Unicode and languages mean we 16610 * can't do the same checks for above-ASCII ranges, 16611 * except in the case of digit ones. These should 16612 * contain only digits from the same group of 10. The 16613 * ASCII case is handled just above. 0x660 is the 16614 * first digit character beyond ASCII. Hence here, the 16615 * range could be a range of digits. Find out. */ 16616 IV index_start = _invlist_search(PL_XPosix_ptrs[_CC_DIGIT], 16617 prevvalue); 16618 IV index_final = _invlist_search(PL_XPosix_ptrs[_CC_DIGIT], 16619 value); 16620 16621 /* If the range start and final points are in the same 16622 * inversion list element, it means that either both 16623 * are not digits, or both are digits in a consecutive 16624 * sequence of digits. (So far, Unicode has kept all 16625 * such sequences as distinct groups of 10, but assert 16626 * to make sure). If the end points are not in the 16627 * same element, neither should be a digit. */ 16628 if (index_start == index_final) { 16629 assert(! ELEMENT_RANGE_MATCHES_INVLIST(index_start) 16630 || (invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start+1] 16631 - invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start] 16632 == 10) 16633 /* But actually Unicode did have one group of 11 16634 * 'digits' in 5.2, so in case we are operating 16635 * on that version, let that pass */ 16636 || (invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start+1] 16637 - invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start] 16638 == 11 16639 && invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start] 16640 == 0x19D0) 16641 ); 16642 } 16643 else if ((index_start >= 0 16644 && ELEMENT_RANGE_MATCHES_INVLIST(index_start)) 16645 || (index_final >= 0 16646 && ELEMENT_RANGE_MATCHES_INVLIST(index_final))) 16647 { 16648 vWARN(RExC_parse, "Ranges of digits should be from the same group of 10"); 16649 } 16650 } 16651 } 16652 } 16653 if ((! range || prevvalue == value) && non_portable_endpoint) { 16654 if (isPRINT_A(value)) { 16655 char literal[3]; 16656 unsigned d = 0; 16657 if (isBACKSLASHED_PUNCT(value)) { 16658 literal[d++] = '\\'; 16659 } 16660 literal[d++] = (char) value; 16661 literal[d++] = '\0'; 16662 16663 vWARN4(RExC_parse, 16664 "\"%.*s\" is more clearly written simply as \"%s\"", 16665 (int) (RExC_parse - rangebegin), 16666 rangebegin, 16667 literal 16668 ); 16669 } 16670 else if isMNEMONIC_CNTRL(value) { 16671 vWARN4(RExC_parse, 16672 "\"%.*s\" is more clearly written simply as \"%s\"", 16673 (int) (RExC_parse - rangebegin), 16674 rangebegin, 16675 cntrl_to_mnemonic((U8) value) 16676 ); 16677 } 16678 } 16679 } 16680 16681 /* Deal with this element of the class */ 16682 if (! SIZE_ONLY) { 16683 16684 #ifndef EBCDIC 16685 cp_foldable_list = _add_range_to_invlist(cp_foldable_list, 16686 prevvalue, value); 16687 #else 16688 /* On non-ASCII platforms, for ranges that span all of 0..255, and 16689 * ones that don't require special handling, we can just add the 16690 * range like we do for ASCII platforms */ 16691 if ((UNLIKELY(prevvalue == 0) && value >= 255) 16692 || ! (prevvalue < 256 16693 && (unicode_range 16694 || (! non_portable_endpoint 16695 && ((isLOWER_A(prevvalue) && isLOWER_A(value)) 16696 || (isUPPER_A(prevvalue) 16697 && isUPPER_A(value))))))) 16698 { 16699 cp_foldable_list = _add_range_to_invlist(cp_foldable_list, 16700 prevvalue, value); 16701 } 16702 else { 16703 /* Here, requires special handling. This can be because it is 16704 * a range whose code points are considered to be Unicode, and 16705 * so must be individually translated into native, or because 16706 * its a subrange of 'A-Z' or 'a-z' which each aren't 16707 * contiguous in EBCDIC, but we have defined them to include 16708 * only the "expected" upper or lower case ASCII alphabetics. 16709 * Subranges above 255 are the same in native and Unicode, so 16710 * can be added as a range */ 16711 U8 start = NATIVE_TO_LATIN1(prevvalue); 16712 unsigned j; 16713 U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255; 16714 for (j = start; j <= end; j++) { 16715 cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j)); 16716 } 16717 if (value > 255) { 16718 cp_foldable_list = _add_range_to_invlist(cp_foldable_list, 16719 256, value); 16720 } 16721 } 16722 #endif 16723 } 16724 16725 range = 0; /* this range (if it was one) is done now */ 16726 } /* End of loop through all the text within the brackets */ 16727 16728 16729 if ( posix_warnings && av_tindex_nomg(posix_warnings) >= 0) { 16730 output_or_return_posix_warnings(pRExC_state, posix_warnings, 16731 return_posix_warnings); 16732 } 16733 16734 /* If anything in the class expands to more than one character, we have to 16735 * deal with them by building up a substitute parse string, and recursively 16736 * calling reg() on it, instead of proceeding */ 16737 if (multi_char_matches) { 16738 SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP); 16739 I32 cp_count; 16740 STRLEN len; 16741 char *save_end = RExC_end; 16742 char *save_parse = RExC_parse; 16743 char *save_start = RExC_start; 16744 STRLEN prefix_end = 0; /* We copy the character class after a 16745 prefix supplied here. This is the size 16746 + 1 of that prefix */ 16747 bool first_time = TRUE; /* First multi-char occurrence doesn't get 16748 a "|" */ 16749 I32 reg_flags; 16750 16751 assert(! invert); 16752 assert(RExC_precomp_adj == 0); /* Only one level of recursion allowed */ 16753 16754 #if 0 /* Have decided not to deal with multi-char folds in inverted classes, 16755 because too confusing */ 16756 if (invert) { 16757 sv_catpv(substitute_parse, "(?:"); 16758 } 16759 #endif 16760 16761 /* Look at the longest folds first */ 16762 for (cp_count = av_tindex_nomg(multi_char_matches); 16763 cp_count > 0; 16764 cp_count--) 16765 { 16766 16767 if (av_exists(multi_char_matches, cp_count)) { 16768 AV** this_array_ptr; 16769 SV* this_sequence; 16770 16771 this_array_ptr = (AV**) av_fetch(multi_char_matches, 16772 cp_count, FALSE); 16773 while ((this_sequence = av_pop(*this_array_ptr)) != 16774 &PL_sv_undef) 16775 { 16776 if (! first_time) { 16777 sv_catpv(substitute_parse, "|"); 16778 } 16779 first_time = FALSE; 16780 16781 sv_catpv(substitute_parse, SvPVX(this_sequence)); 16782 } 16783 } 16784 } 16785 16786 /* If the character class contains anything else besides these 16787 * multi-character folds, have to include it in recursive parsing */ 16788 if (element_count) { 16789 sv_catpv(substitute_parse, "|["); 16790 prefix_end = SvCUR(substitute_parse); 16791 sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse); 16792 16793 /* Put in a closing ']' only if not going off the end, as otherwise 16794 * we are adding something that really isn't there */ 16795 if (RExC_parse < RExC_end) { 16796 sv_catpv(substitute_parse, "]"); 16797 } 16798 } 16799 16800 sv_catpv(substitute_parse, ")"); 16801 #if 0 16802 if (invert) { 16803 /* This is a way to get the parse to skip forward a whole named 16804 * sequence instead of matching the 2nd character when it fails the 16805 * first */ 16806 sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)"); 16807 } 16808 #endif 16809 16810 /* Set up the data structure so that any errors will be properly 16811 * reported. See the comments at the definition of 16812 * REPORT_LOCATION_ARGS for details */ 16813 RExC_precomp_adj = orig_parse - RExC_precomp; 16814 RExC_start = RExC_parse = SvPV(substitute_parse, len); 16815 RExC_adjusted_start = RExC_start + prefix_end; 16816 RExC_end = RExC_parse + len; 16817 RExC_in_multi_char_class = 1; 16818 RExC_override_recoding = 1; 16819 RExC_emit = (regnode *)orig_emit; 16820 16821 ret = reg(pRExC_state, 1, ®_flags, depth+1); 16822 16823 *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_PASS1|NEED_UTF8); 16824 16825 /* And restore so can parse the rest of the pattern */ 16826 RExC_parse = save_parse; 16827 RExC_start = RExC_adjusted_start = save_start; 16828 RExC_precomp_adj = 0; 16829 RExC_end = save_end; 16830 RExC_in_multi_char_class = 0; 16831 RExC_override_recoding = 0; 16832 SvREFCNT_dec_NN(multi_char_matches); 16833 return ret; 16834 } 16835 16836 /* Here, we've gone through the entire class and dealt with multi-char 16837 * folds. We are now in a position that we can do some checks to see if we 16838 * can optimize this ANYOF node into a simpler one, even in Pass 1. 16839 * Currently we only do two checks: 16840 * 1) is in the unlikely event that the user has specified both, eg. \w and 16841 * \W under /l, then the class matches everything. (This optimization 16842 * is done only to make the optimizer code run later work.) 16843 * 2) if the character class contains only a single element (including a 16844 * single range), we see if there is an equivalent node for it. 16845 * Other checks are possible */ 16846 if ( optimizable 16847 && ! ret_invlist /* Can't optimize if returning the constructed 16848 inversion list */ 16849 && (UNLIKELY(posixl_matches_all) || element_count == 1)) 16850 { 16851 U8 op = END; 16852 U8 arg = 0; 16853 16854 if (UNLIKELY(posixl_matches_all)) { 16855 op = SANY; 16856 } 16857 else if (namedclass > OOB_NAMEDCLASS) { /* this is a single named 16858 class, like \w or [:digit:] 16859 or \p{foo} */ 16860 16861 /* All named classes are mapped into POSIXish nodes, with its FLAG 16862 * argument giving which class it is */ 16863 switch ((I32)namedclass) { 16864 case ANYOF_UNIPROP: 16865 break; 16866 16867 /* These don't depend on the charset modifiers. They always 16868 * match under /u rules */ 16869 case ANYOF_NHORIZWS: 16870 case ANYOF_HORIZWS: 16871 namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS; 16872 /* FALLTHROUGH */ 16873 16874 case ANYOF_NVERTWS: 16875 case ANYOF_VERTWS: 16876 op = POSIXU; 16877 goto join_posix; 16878 16879 /* The actual POSIXish node for all the rest depends on the 16880 * charset modifier. The ones in the first set depend only on 16881 * ASCII or, if available on this platform, also locale */ 16882 case ANYOF_ASCII: 16883 case ANYOF_NASCII: 16884 #ifdef HAS_ISASCII 16885 op = (LOC) ? POSIXL : POSIXA; 16886 #else 16887 op = POSIXA; 16888 #endif 16889 goto join_posix; 16890 16891 /* The following don't have any matches in the upper Latin1 16892 * range, hence /d is equivalent to /u for them. Making it /u 16893 * saves some branches at runtime */ 16894 case ANYOF_DIGIT: 16895 case ANYOF_NDIGIT: 16896 case ANYOF_XDIGIT: 16897 case ANYOF_NXDIGIT: 16898 if (! DEPENDS_SEMANTICS) { 16899 goto treat_as_default; 16900 } 16901 16902 op = POSIXU; 16903 goto join_posix; 16904 16905 /* The following change to CASED under /i */ 16906 case ANYOF_LOWER: 16907 case ANYOF_NLOWER: 16908 case ANYOF_UPPER: 16909 case ANYOF_NUPPER: 16910 if (FOLD) { 16911 namedclass = ANYOF_CASED + (namedclass % 2); 16912 } 16913 /* FALLTHROUGH */ 16914 16915 /* The rest have more possibilities depending on the charset. 16916 * We take advantage of the enum ordering of the charset 16917 * modifiers to get the exact node type, */ 16918 default: 16919 treat_as_default: 16920 op = POSIXD + get_regex_charset(RExC_flags); 16921 if (op > POSIXA) { /* /aa is same as /a */ 16922 op = POSIXA; 16923 } 16924 16925 join_posix: 16926 /* The odd numbered ones are the complements of the 16927 * next-lower even number one */ 16928 if (namedclass % 2 == 1) { 16929 invert = ! invert; 16930 namedclass--; 16931 } 16932 arg = namedclass_to_classnum(namedclass); 16933 break; 16934 } 16935 } 16936 else if (value == prevvalue) { 16937 16938 /* Here, the class consists of just a single code point */ 16939 16940 if (invert) { 16941 if (! LOC && value == '\n') { 16942 op = REG_ANY; /* Optimize [^\n] */ 16943 *flagp |= HASWIDTH|SIMPLE; 16944 MARK_NAUGHTY(1); 16945 } 16946 } 16947 else if (value < 256 || UTF) { 16948 16949 /* Optimize a single value into an EXACTish node, but not if it 16950 * would require converting the pattern to UTF-8. */ 16951 op = compute_EXACTish(pRExC_state); 16952 } 16953 } /* Otherwise is a range */ 16954 else if (! LOC) { /* locale could vary these */ 16955 if (prevvalue == '0') { 16956 if (value == '9') { 16957 arg = _CC_DIGIT; 16958 op = POSIXA; 16959 } 16960 } 16961 else if (! FOLD || ASCII_FOLD_RESTRICTED) { 16962 /* We can optimize A-Z or a-z, but not if they could match 16963 * something like the KELVIN SIGN under /i. */ 16964 if (prevvalue == 'A') { 16965 if (value == 'Z' 16966 #ifdef EBCDIC 16967 && ! non_portable_endpoint 16968 #endif 16969 ) { 16970 arg = (FOLD) ? _CC_ALPHA : _CC_UPPER; 16971 op = POSIXA; 16972 } 16973 } 16974 else if (prevvalue == 'a') { 16975 if (value == 'z' 16976 #ifdef EBCDIC 16977 && ! non_portable_endpoint 16978 #endif 16979 ) { 16980 arg = (FOLD) ? _CC_ALPHA : _CC_LOWER; 16981 op = POSIXA; 16982 } 16983 } 16984 } 16985 } 16986 16987 /* Here, we have changed <op> away from its initial value iff we found 16988 * an optimization */ 16989 if (op != END) { 16990 16991 /* Throw away this ANYOF regnode, and emit the calculated one, 16992 * which should correspond to the beginning, not current, state of 16993 * the parse */ 16994 const char * cur_parse = RExC_parse; 16995 RExC_parse = (char *)orig_parse; 16996 if ( SIZE_ONLY) { 16997 if (! LOC) { 16998 16999 /* To get locale nodes to not use the full ANYOF size would 17000 * require moving the code above that writes the portions 17001 * of it that aren't in other nodes to after this point. 17002 * e.g. ANYOF_POSIXL_SET */ 17003 RExC_size = orig_size; 17004 } 17005 } 17006 else { 17007 RExC_emit = (regnode *)orig_emit; 17008 if (PL_regkind[op] == POSIXD) { 17009 if (op == POSIXL) { 17010 RExC_contains_locale = 1; 17011 } 17012 if (invert) { 17013 op += NPOSIXD - POSIXD; 17014 } 17015 } 17016 } 17017 17018 ret = reg_node(pRExC_state, op); 17019 17020 if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) { 17021 if (! SIZE_ONLY) { 17022 FLAGS(ret) = arg; 17023 } 17024 *flagp |= HASWIDTH|SIMPLE; 17025 } 17026 else if (PL_regkind[op] == EXACT) { 17027 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value, 17028 TRUE /* downgradable to EXACT */ 17029 ); 17030 } 17031 17032 RExC_parse = (char *) cur_parse; 17033 17034 SvREFCNT_dec(posixes); 17035 SvREFCNT_dec(nposixes); 17036 SvREFCNT_dec(simple_posixes); 17037 SvREFCNT_dec(cp_list); 17038 SvREFCNT_dec(cp_foldable_list); 17039 return ret; 17040 } 17041 } 17042 17043 if (SIZE_ONLY) 17044 return ret; 17045 /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/ 17046 17047 /* If folding, we calculate all characters that could fold to or from the 17048 * ones already on the list */ 17049 if (cp_foldable_list) { 17050 if (FOLD) { 17051 UV start, end; /* End points of code point ranges */ 17052 17053 SV* fold_intersection = NULL; 17054 SV** use_list; 17055 17056 /* Our calculated list will be for Unicode rules. For locale 17057 * matching, we have to keep a separate list that is consulted at 17058 * runtime only when the locale indicates Unicode rules. For 17059 * non-locale, we just use the general list */ 17060 if (LOC) { 17061 use_list = &only_utf8_locale_list; 17062 } 17063 else { 17064 use_list = &cp_list; 17065 } 17066 17067 /* Only the characters in this class that participate in folds need 17068 * be checked. Get the intersection of this class and all the 17069 * possible characters that are foldable. This can quickly narrow 17070 * down a large class */ 17071 _invlist_intersection(PL_utf8_foldable, cp_foldable_list, 17072 &fold_intersection); 17073 17074 /* The folds for all the Latin1 characters are hard-coded into this 17075 * program, but we have to go out to disk to get the others. */ 17076 if (invlist_highest(cp_foldable_list) >= 256) { 17077 17078 /* This is a hash that for a particular fold gives all 17079 * characters that are involved in it */ 17080 if (! PL_utf8_foldclosures) { 17081 _load_PL_utf8_foldclosures(); 17082 } 17083 } 17084 17085 /* Now look at the foldable characters in this class individually */ 17086 invlist_iterinit(fold_intersection); 17087 while (invlist_iternext(fold_intersection, &start, &end)) { 17088 UV j; 17089 17090 /* Look at every character in the range */ 17091 for (j = start; j <= end; j++) { 17092 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; 17093 STRLEN foldlen; 17094 SV** listp; 17095 17096 if (j < 256) { 17097 17098 if (IS_IN_SOME_FOLD_L1(j)) { 17099 17100 /* ASCII is always matched; non-ASCII is matched 17101 * only under Unicode rules (which could happen 17102 * under /l if the locale is a UTF-8 one */ 17103 if (isASCII(j) || ! DEPENDS_SEMANTICS) { 17104 *use_list = add_cp_to_invlist(*use_list, 17105 PL_fold_latin1[j]); 17106 } 17107 else { 17108 has_upper_latin1_only_utf8_matches 17109 = add_cp_to_invlist( 17110 has_upper_latin1_only_utf8_matches, 17111 PL_fold_latin1[j]); 17112 } 17113 } 17114 17115 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j) 17116 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED)) 17117 { 17118 add_above_Latin1_folds(pRExC_state, 17119 (U8) j, 17120 use_list); 17121 } 17122 continue; 17123 } 17124 17125 /* Here is an above Latin1 character. We don't have the 17126 * rules hard-coded for it. First, get its fold. This is 17127 * the simple fold, as the multi-character folds have been 17128 * handled earlier and separated out */ 17129 _to_uni_fold_flags(j, foldbuf, &foldlen, 17130 (ASCII_FOLD_RESTRICTED) 17131 ? FOLD_FLAGS_NOMIX_ASCII 17132 : 0); 17133 17134 /* Single character fold of above Latin1. Add everything in 17135 * its fold closure to the list that this node should match. 17136 * The fold closures data structure is a hash with the keys 17137 * being the UTF-8 of every character that is folded to, like 17138 * 'k', and the values each an array of all code points that 17139 * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ]. 17140 * Multi-character folds are not included */ 17141 if ((listp = hv_fetch(PL_utf8_foldclosures, 17142 (char *) foldbuf, foldlen, FALSE))) 17143 { 17144 AV* list = (AV*) *listp; 17145 IV k; 17146 for (k = 0; k <= av_tindex_nomg(list); k++) { 17147 SV** c_p = av_fetch(list, k, FALSE); 17148 UV c; 17149 assert(c_p); 17150 17151 c = SvUV(*c_p); 17152 17153 /* /aa doesn't allow folds between ASCII and non- */ 17154 if ((ASCII_FOLD_RESTRICTED 17155 && (isASCII(c) != isASCII(j)))) 17156 { 17157 continue; 17158 } 17159 17160 /* Folds under /l which cross the 255/256 boundary 17161 * are added to a separate list. (These are valid 17162 * only when the locale is UTF-8.) */ 17163 if (c < 256 && LOC) { 17164 *use_list = add_cp_to_invlist(*use_list, c); 17165 continue; 17166 } 17167 17168 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS) 17169 { 17170 cp_list = add_cp_to_invlist(cp_list, c); 17171 } 17172 else { 17173 /* Similarly folds involving non-ascii Latin1 17174 * characters under /d are added to their list */ 17175 has_upper_latin1_only_utf8_matches 17176 = add_cp_to_invlist( 17177 has_upper_latin1_only_utf8_matches, 17178 c); 17179 } 17180 } 17181 } 17182 } 17183 } 17184 SvREFCNT_dec_NN(fold_intersection); 17185 } 17186 17187 /* Now that we have finished adding all the folds, there is no reason 17188 * to keep the foldable list separate */ 17189 _invlist_union(cp_list, cp_foldable_list, &cp_list); 17190 SvREFCNT_dec_NN(cp_foldable_list); 17191 } 17192 17193 /* And combine the result (if any) with any inversion list from posix 17194 * classes. The lists are kept separate up to now because we don't want to 17195 * fold the classes (folding of those is automatically handled by the swash 17196 * fetching code) */ 17197 if (simple_posixes) { 17198 _invlist_union(cp_list, simple_posixes, &cp_list); 17199 SvREFCNT_dec_NN(simple_posixes); 17200 } 17201 if (posixes || nposixes) { 17202 if (posixes && AT_LEAST_ASCII_RESTRICTED) { 17203 /* Under /a and /aa, nothing above ASCII matches these */ 17204 _invlist_intersection(posixes, 17205 PL_XPosix_ptrs[_CC_ASCII], 17206 &posixes); 17207 } 17208 if (nposixes) { 17209 if (DEPENDS_SEMANTICS) { 17210 /* Under /d, everything in the upper half of the Latin1 range 17211 * matches these complements */ 17212 ANYOF_FLAGS(ret) |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER; 17213 } 17214 else if (AT_LEAST_ASCII_RESTRICTED) { 17215 /* Under /a and /aa, everything above ASCII matches these 17216 * complements */ 17217 _invlist_union_complement_2nd(nposixes, 17218 PL_XPosix_ptrs[_CC_ASCII], 17219 &nposixes); 17220 } 17221 if (posixes) { 17222 _invlist_union(posixes, nposixes, &posixes); 17223 SvREFCNT_dec_NN(nposixes); 17224 } 17225 else { 17226 posixes = nposixes; 17227 } 17228 } 17229 if (! DEPENDS_SEMANTICS) { 17230 if (cp_list) { 17231 _invlist_union(cp_list, posixes, &cp_list); 17232 SvREFCNT_dec_NN(posixes); 17233 } 17234 else { 17235 cp_list = posixes; 17236 } 17237 } 17238 else { 17239 /* Under /d, we put into a separate list the Latin1 things that 17240 * match only when the target string is utf8 */ 17241 SV* nonascii_but_latin1_properties = NULL; 17242 _invlist_intersection(posixes, PL_UpperLatin1, 17243 &nonascii_but_latin1_properties); 17244 _invlist_subtract(posixes, nonascii_but_latin1_properties, 17245 &posixes); 17246 if (cp_list) { 17247 _invlist_union(cp_list, posixes, &cp_list); 17248 SvREFCNT_dec_NN(posixes); 17249 } 17250 else { 17251 cp_list = posixes; 17252 } 17253 17254 if (has_upper_latin1_only_utf8_matches) { 17255 _invlist_union(has_upper_latin1_only_utf8_matches, 17256 nonascii_but_latin1_properties, 17257 &has_upper_latin1_only_utf8_matches); 17258 SvREFCNT_dec_NN(nonascii_but_latin1_properties); 17259 } 17260 else { 17261 has_upper_latin1_only_utf8_matches 17262 = nonascii_but_latin1_properties; 17263 } 17264 } 17265 } 17266 17267 /* And combine the result (if any) with any inversion list from properties. 17268 * The lists are kept separate up to now so that we can distinguish the two 17269 * in regards to matching above-Unicode. A run-time warning is generated 17270 * if a Unicode property is matched against a non-Unicode code point. But, 17271 * we allow user-defined properties to match anything, without any warning, 17272 * and we also suppress the warning if there is a portion of the character 17273 * class that isn't a Unicode property, and which matches above Unicode, \W 17274 * or [\x{110000}] for example. 17275 * (Note that in this case, unlike the Posix one above, there is no 17276 * <has_upper_latin1_only_utf8_matches>, because having a Unicode property 17277 * forces Unicode semantics */ 17278 if (properties) { 17279 if (cp_list) { 17280 17281 /* If it matters to the final outcome, see if a non-property 17282 * component of the class matches above Unicode. If so, the 17283 * warning gets suppressed. This is true even if just a single 17284 * such code point is specified, as, though not strictly correct if 17285 * another such code point is matched against, the fact that they 17286 * are using above-Unicode code points indicates they should know 17287 * the issues involved */ 17288 if (warn_super) { 17289 warn_super = ! (invert 17290 ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX)); 17291 } 17292 17293 _invlist_union(properties, cp_list, &cp_list); 17294 SvREFCNT_dec_NN(properties); 17295 } 17296 else { 17297 cp_list = properties; 17298 } 17299 17300 if (warn_super) { 17301 ANYOF_FLAGS(ret) 17302 |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER; 17303 17304 /* Because an ANYOF node is the only one that warns, this node 17305 * can't be optimized into something else */ 17306 optimizable = FALSE; 17307 } 17308 } 17309 17310 /* Here, we have calculated what code points should be in the character 17311 * class. 17312 * 17313 * Now we can see about various optimizations. Fold calculation (which we 17314 * did above) needs to take place before inversion. Otherwise /[^k]/i 17315 * would invert to include K, which under /i would match k, which it 17316 * shouldn't. Therefore we can't invert folded locale now, as it won't be 17317 * folded until runtime */ 17318 17319 /* If we didn't do folding, it's because some information isn't available 17320 * until runtime; set the run-time fold flag for these. (We don't have to 17321 * worry about properties folding, as that is taken care of by the swash 17322 * fetching). We know to set the flag if we have a non-NULL list for UTF-8 17323 * locales, or the class matches at least one 0-255 range code point */ 17324 if (LOC && FOLD) { 17325 17326 /* Some things on the list might be unconditionally included because of 17327 * other components. Remove them, and clean up the list if it goes to 17328 * 0 elements */ 17329 if (only_utf8_locale_list && cp_list) { 17330 _invlist_subtract(only_utf8_locale_list, cp_list, 17331 &only_utf8_locale_list); 17332 17333 if (_invlist_len(only_utf8_locale_list) == 0) { 17334 SvREFCNT_dec_NN(only_utf8_locale_list); 17335 only_utf8_locale_list = NULL; 17336 } 17337 } 17338 if (only_utf8_locale_list) { 17339 ANYOF_FLAGS(ret) 17340 |= ANYOFL_FOLD 17341 |ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD; 17342 } 17343 else if (cp_list) { /* Look to see if a 0-255 code point is in list */ 17344 UV start, end; 17345 invlist_iterinit(cp_list); 17346 if (invlist_iternext(cp_list, &start, &end) && start < 256) { 17347 ANYOF_FLAGS(ret) |= ANYOFL_FOLD; 17348 } 17349 invlist_iterfinish(cp_list); 17350 } 17351 } 17352 17353 #define MATCHES_ALL_NON_UTF8_NON_ASCII(ret) \ 17354 ( DEPENDS_SEMANTICS \ 17355 && (ANYOF_FLAGS(ret) \ 17356 & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)) 17357 17358 /* See if we can simplify things under /d */ 17359 if ( has_upper_latin1_only_utf8_matches 17360 || MATCHES_ALL_NON_UTF8_NON_ASCII(ret)) 17361 { 17362 /* But not if we are inverting, as that screws it up */ 17363 if (! invert) { 17364 if (has_upper_latin1_only_utf8_matches) { 17365 if (MATCHES_ALL_NON_UTF8_NON_ASCII(ret)) { 17366 17367 /* Here, we have both the flag and inversion list. Any 17368 * character in 'has_upper_latin1_only_utf8_matches' 17369 * matches when UTF-8 is in effect, but it also matches 17370 * when UTF-8 is not in effect because of 17371 * MATCHES_ALL_NON_UTF8_NON_ASCII. Therefore it matches 17372 * unconditionally, so can be added to the regular list, 17373 * and 'has_upper_latin1_only_utf8_matches' cleared */ 17374 _invlist_union(cp_list, 17375 has_upper_latin1_only_utf8_matches, 17376 &cp_list); 17377 SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches); 17378 has_upper_latin1_only_utf8_matches = NULL; 17379 } 17380 else if (cp_list) { 17381 17382 /* Here, 'cp_list' gives chars that always match, and 17383 * 'has_upper_latin1_only_utf8_matches' gives chars that 17384 * were specified to match only if the target string is in 17385 * UTF-8. It may be that these overlap, so we can subtract 17386 * the unconditionally matching from the conditional ones, 17387 * to make the conditional list as small as possible, 17388 * perhaps even clearing it, in which case more 17389 * optimizations are possible later */ 17390 _invlist_subtract(has_upper_latin1_only_utf8_matches, 17391 cp_list, 17392 &has_upper_latin1_only_utf8_matches); 17393 if (_invlist_len(has_upper_latin1_only_utf8_matches) == 0) { 17394 SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches); 17395 has_upper_latin1_only_utf8_matches = NULL; 17396 } 17397 } 17398 } 17399 17400 /* Similarly, if the unconditional matches include every upper 17401 * latin1 character, we can clear that flag to permit later 17402 * optimizations */ 17403 if (cp_list && MATCHES_ALL_NON_UTF8_NON_ASCII(ret)) { 17404 SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1); 17405 _invlist_subtract(only_non_utf8_list, cp_list, 17406 &only_non_utf8_list); 17407 if (_invlist_len(only_non_utf8_list) == 0) { 17408 ANYOF_FLAGS(ret) &= ~ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER; 17409 } 17410 SvREFCNT_dec_NN(only_non_utf8_list); 17411 only_non_utf8_list = NULL;; 17412 } 17413 } 17414 17415 /* If we haven't gotten rid of all conditional matching, we change the 17416 * regnode type to indicate that */ 17417 if ( has_upper_latin1_only_utf8_matches 17418 || MATCHES_ALL_NON_UTF8_NON_ASCII(ret)) 17419 { 17420 OP(ret) = ANYOFD; 17421 optimizable = FALSE; 17422 } 17423 } 17424 #undef MATCHES_ALL_NON_UTF8_NON_ASCII 17425 17426 /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known 17427 * at compile time. Besides not inverting folded locale now, we can't 17428 * invert if there are things such as \w, which aren't known until runtime 17429 * */ 17430 if (cp_list 17431 && invert 17432 && OP(ret) != ANYOFD 17433 && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS)) 17434 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) 17435 { 17436 _invlist_invert(cp_list); 17437 17438 /* Any swash can't be used as-is, because we've inverted things */ 17439 if (swash) { 17440 SvREFCNT_dec_NN(swash); 17441 swash = NULL; 17442 } 17443 17444 /* Clear the invert flag since have just done it here */ 17445 invert = FALSE; 17446 } 17447 17448 if (ret_invlist) { 17449 assert(cp_list); 17450 17451 *ret_invlist = cp_list; 17452 SvREFCNT_dec(swash); 17453 17454 /* Discard the generated node */ 17455 if (SIZE_ONLY) { 17456 RExC_size = orig_size; 17457 } 17458 else { 17459 RExC_emit = orig_emit; 17460 } 17461 return orig_emit; 17462 } 17463 17464 /* Some character classes are equivalent to other nodes. Such nodes take 17465 * up less room and generally fewer operations to execute than ANYOF nodes. 17466 * Above, we checked for and optimized into some such equivalents for 17467 * certain common classes that are easy to test. Getting to this point in 17468 * the code means that the class didn't get optimized there. Since this 17469 * code is only executed in Pass 2, it is too late to save space--it has 17470 * been allocated in Pass 1, and currently isn't given back. But turning 17471 * things into an EXACTish node can allow the optimizer to join it to any 17472 * adjacent such nodes. And if the class is equivalent to things like /./, 17473 * expensive run-time swashes can be avoided. Now that we have more 17474 * complete information, we can find things necessarily missed by the 17475 * earlier code. Another possible "optimization" that isn't done is that 17476 * something like [Ee] could be changed into an EXACTFU. khw tried this 17477 * and found that the ANYOF is faster, including for code points not in the 17478 * bitmap. This still might make sense to do, provided it got joined with 17479 * an adjacent node(s) to create a longer EXACTFU one. This could be 17480 * accomplished by creating a pseudo ANYOF_EXACTFU node type that the join 17481 * routine would know is joinable. If that didn't happen, the node type 17482 * could then be made a straight ANYOF */ 17483 17484 if (optimizable && cp_list && ! invert) { 17485 UV start, end; 17486 U8 op = END; /* The optimzation node-type */ 17487 int posix_class = -1; /* Illegal value */ 17488 const char * cur_parse= RExC_parse; 17489 17490 invlist_iterinit(cp_list); 17491 if (! invlist_iternext(cp_list, &start, &end)) { 17492 17493 /* Here, the list is empty. This happens, for example, when a 17494 * Unicode property that doesn't match anything is the only element 17495 * in the character class (perluniprops.pod notes such properties). 17496 * */ 17497 op = OPFAIL; 17498 *flagp |= HASWIDTH|SIMPLE; 17499 } 17500 else if (start == end) { /* The range is a single code point */ 17501 if (! invlist_iternext(cp_list, &start, &end) 17502 17503 /* Don't do this optimization if it would require changing 17504 * the pattern to UTF-8 */ 17505 && (start < 256 || UTF)) 17506 { 17507 /* Here, the list contains a single code point. Can optimize 17508 * into an EXACTish node */ 17509 17510 value = start; 17511 17512 if (! FOLD) { 17513 op = (LOC) 17514 ? EXACTL 17515 : EXACT; 17516 } 17517 else if (LOC) { 17518 17519 /* A locale node under folding with one code point can be 17520 * an EXACTFL, as its fold won't be calculated until 17521 * runtime */ 17522 op = EXACTFL; 17523 } 17524 else { 17525 17526 /* Here, we are generally folding, but there is only one 17527 * code point to match. If we have to, we use an EXACT 17528 * node, but it would be better for joining with adjacent 17529 * nodes in the optimization pass if we used the same 17530 * EXACTFish node that any such are likely to be. We can 17531 * do this iff the code point doesn't participate in any 17532 * folds. For example, an EXACTF of a colon is the same as 17533 * an EXACT one, since nothing folds to or from a colon. */ 17534 if (value < 256) { 17535 if (IS_IN_SOME_FOLD_L1(value)) { 17536 op = EXACT; 17537 } 17538 } 17539 else { 17540 if (_invlist_contains_cp(PL_utf8_foldable, value)) { 17541 op = EXACT; 17542 } 17543 } 17544 17545 /* If we haven't found the node type, above, it means we 17546 * can use the prevailing one */ 17547 if (op == END) { 17548 op = compute_EXACTish(pRExC_state); 17549 } 17550 } 17551 } 17552 } /* End of first range contains just a single code point */ 17553 else if (start == 0) { 17554 if (end == UV_MAX) { 17555 op = SANY; 17556 *flagp |= HASWIDTH|SIMPLE; 17557 MARK_NAUGHTY(1); 17558 } 17559 else if (end == '\n' - 1 17560 && invlist_iternext(cp_list, &start, &end) 17561 && start == '\n' + 1 && end == UV_MAX) 17562 { 17563 op = REG_ANY; 17564 *flagp |= HASWIDTH|SIMPLE; 17565 MARK_NAUGHTY(1); 17566 } 17567 } 17568 invlist_iterfinish(cp_list); 17569 17570 if (op == END) { 17571 const UV cp_list_len = _invlist_len(cp_list); 17572 const UV* cp_list_array = invlist_array(cp_list); 17573 17574 /* Here, didn't find an optimization. See if this matches any of 17575 * the POSIX classes. These run slightly faster for above-Unicode 17576 * code points, so don't bother with POSIXA ones nor the 2 that 17577 * have no above-Unicode matches. We can avoid these checks unless 17578 * the ANYOF matches at least as high as the lowest POSIX one 17579 * (which was manually found to be \v. The actual code point may 17580 * increase in later Unicode releases, if a higher code point is 17581 * assigned to be \v, but this code will never break. It would 17582 * just mean we could execute the checks for posix optimizations 17583 * unnecessarily) */ 17584 17585 if (cp_list_array[cp_list_len-1] > 0x2029) { 17586 for (posix_class = 0; 17587 posix_class <= _HIGHEST_REGCOMP_DOT_H_SYNC; 17588 posix_class++) 17589 { 17590 int try_inverted; 17591 if (posix_class == _CC_ASCII || posix_class == _CC_CNTRL) { 17592 continue; 17593 } 17594 for (try_inverted = 0; try_inverted < 2; try_inverted++) { 17595 17596 /* Check if matches normal or inverted */ 17597 if (_invlistEQ(cp_list, 17598 PL_XPosix_ptrs[posix_class], 17599 try_inverted)) 17600 { 17601 op = (try_inverted) 17602 ? NPOSIXU 17603 : POSIXU; 17604 *flagp |= HASWIDTH|SIMPLE; 17605 goto found_posix; 17606 } 17607 } 17608 } 17609 found_posix: ; 17610 } 17611 } 17612 17613 if (op != END) { 17614 RExC_parse = (char *)orig_parse; 17615 RExC_emit = (regnode *)orig_emit; 17616 17617 if (regarglen[op]) { 17618 ret = reganode(pRExC_state, op, 0); 17619 } else { 17620 ret = reg_node(pRExC_state, op); 17621 } 17622 17623 RExC_parse = (char *)cur_parse; 17624 17625 if (PL_regkind[op] == EXACT) { 17626 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value, 17627 TRUE /* downgradable to EXACT */ 17628 ); 17629 } 17630 else if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) { 17631 FLAGS(ret) = posix_class; 17632 } 17633 17634 SvREFCNT_dec_NN(cp_list); 17635 return ret; 17636 } 17637 } 17638 17639 /* Here, <cp_list> contains all the code points we can determine at 17640 * compile time that match under all conditions. Go through it, and 17641 * for things that belong in the bitmap, put them there, and delete from 17642 * <cp_list>. While we are at it, see if everything above 255 is in the 17643 * list, and if so, set a flag to speed up execution */ 17644 17645 populate_ANYOF_from_invlist(ret, &cp_list); 17646 17647 if (invert) { 17648 ANYOF_FLAGS(ret) |= ANYOF_INVERT; 17649 } 17650 17651 /* Here, the bitmap has been populated with all the Latin1 code points that 17652 * always match. Can now add to the overall list those that match only 17653 * when the target string is UTF-8 (<has_upper_latin1_only_utf8_matches>). 17654 * */ 17655 if (has_upper_latin1_only_utf8_matches) { 17656 if (cp_list) { 17657 _invlist_union(cp_list, 17658 has_upper_latin1_only_utf8_matches, 17659 &cp_list); 17660 SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches); 17661 } 17662 else { 17663 cp_list = has_upper_latin1_only_utf8_matches; 17664 } 17665 ANYOF_FLAGS(ret) |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP; 17666 } 17667 17668 /* If there is a swash and more than one element, we can't use the swash in 17669 * the optimization below. */ 17670 if (swash && element_count > 1) { 17671 SvREFCNT_dec_NN(swash); 17672 swash = NULL; 17673 } 17674 17675 /* Note that the optimization of using 'swash' if it is the only thing in 17676 * the class doesn't have us change swash at all, so it can include things 17677 * that are also in the bitmap; otherwise we have purposely deleted that 17678 * duplicate information */ 17679 set_ANYOF_arg(pRExC_state, ret, cp_list, 17680 (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) 17681 ? listsv : NULL, 17682 only_utf8_locale_list, 17683 swash, has_user_defined_property); 17684 17685 *flagp |= HASWIDTH|SIMPLE; 17686 17687 if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) { 17688 RExC_contains_locale = 1; 17689 } 17690 17691 return ret; 17692 } 17693 17694 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION 17695 17696 STATIC void 17697 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state, 17698 regnode* const node, 17699 SV* const cp_list, 17700 SV* const runtime_defns, 17701 SV* const only_utf8_locale_list, 17702 SV* const swash, 17703 const bool has_user_defined_property) 17704 { 17705 /* Sets the arg field of an ANYOF-type node 'node', using information about 17706 * the node passed-in. If there is nothing outside the node's bitmap, the 17707 * arg is set to ANYOF_ONLY_HAS_BITMAP. Otherwise, it sets the argument to 17708 * the count returned by add_data(), having allocated and stored an array, 17709 * av, that that count references, as follows: 17710 * av[0] stores the character class description in its textual form. 17711 * This is used later (regexec.c:Perl_regclass_swash()) to 17712 * initialize the appropriate swash, and is also useful for dumping 17713 * the regnode. This is set to &PL_sv_undef if the textual 17714 * description is not needed at run-time (as happens if the other 17715 * elements completely define the class) 17716 * av[1] if &PL_sv_undef, is a placeholder to later contain the swash 17717 * computed from av[0]. But if no further computation need be done, 17718 * the swash is stored here now (and av[0] is &PL_sv_undef). 17719 * av[2] stores the inversion list of code points that match only if the 17720 * current locale is UTF-8 17721 * av[3] stores the cp_list inversion list for use in addition or instead 17722 * of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef. 17723 * (Otherwise everything needed is already in av[0] and av[1]) 17724 * av[4] is set if any component of the class is from a user-defined 17725 * property; used only if av[3] exists */ 17726 17727 UV n; 17728 17729 PERL_ARGS_ASSERT_SET_ANYOF_ARG; 17730 17731 if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) { 17732 assert(! (ANYOF_FLAGS(node) 17733 & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)); 17734 ARG_SET(node, ANYOF_ONLY_HAS_BITMAP); 17735 } 17736 else { 17737 AV * const av = newAV(); 17738 SV *rv; 17739 17740 av_store(av, 0, (runtime_defns) 17741 ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef); 17742 if (swash) { 17743 assert(cp_list); 17744 av_store(av, 1, swash); 17745 SvREFCNT_dec_NN(cp_list); 17746 } 17747 else { 17748 av_store(av, 1, &PL_sv_undef); 17749 if (cp_list) { 17750 av_store(av, 3, cp_list); 17751 av_store(av, 4, newSVuv(has_user_defined_property)); 17752 } 17753 } 17754 17755 if (only_utf8_locale_list) { 17756 av_store(av, 2, only_utf8_locale_list); 17757 } 17758 else { 17759 av_store(av, 2, &PL_sv_undef); 17760 } 17761 17762 rv = newRV_noinc(MUTABLE_SV(av)); 17763 n = add_data(pRExC_state, STR_WITH_LEN("s")); 17764 RExC_rxi->data->data[n] = (void*)rv; 17765 ARG_SET(node, n); 17766 } 17767 } 17768 17769 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) 17770 SV * 17771 Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog, 17772 const regnode* node, 17773 bool doinit, 17774 SV** listsvp, 17775 SV** only_utf8_locale_ptr, 17776 SV** output_invlist) 17777 17778 { 17779 /* For internal core use only. 17780 * Returns the swash for the input 'node' in the regex 'prog'. 17781 * If <doinit> is 'true', will attempt to create the swash if not already 17782 * done. 17783 * If <listsvp> is non-null, will return the printable contents of the 17784 * swash. This can be used to get debugging information even before the 17785 * swash exists, by calling this function with 'doinit' set to false, in 17786 * which case the components that will be used to eventually create the 17787 * swash are returned (in a printable form). 17788 * If <only_utf8_locale_ptr> is not NULL, it is where this routine is to 17789 * store an inversion list of code points that should match only if the 17790 * execution-time locale is a UTF-8 one. 17791 * If <output_invlist> is not NULL, it is where this routine is to store an 17792 * inversion list of the code points that would be instead returned in 17793 * <listsvp> if this were NULL. Thus, what gets output in <listsvp> 17794 * when this parameter is used, is just the non-code point data that 17795 * will go into creating the swash. This currently should be just 17796 * user-defined properties whose definitions were not known at compile 17797 * time. Using this parameter allows for easier manipulation of the 17798 * swash's data by the caller. It is illegal to call this function with 17799 * this parameter set, but not <listsvp> 17800 * 17801 * Tied intimately to how S_set_ANYOF_arg sets up the data structure. Note 17802 * that, in spite of this function's name, the swash it returns may include 17803 * the bitmap data as well */ 17804 17805 SV *sw = NULL; 17806 SV *si = NULL; /* Input swash initialization string */ 17807 SV* invlist = NULL; 17808 17809 RXi_GET_DECL(prog,progi); 17810 const struct reg_data * const data = prog ? progi->data : NULL; 17811 17812 PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA; 17813 assert(! output_invlist || listsvp); 17814 17815 if (data && data->count) { 17816 const U32 n = ARG(node); 17817 17818 if (data->what[n] == 's') { 17819 SV * const rv = MUTABLE_SV(data->data[n]); 17820 AV * const av = MUTABLE_AV(SvRV(rv)); 17821 SV **const ary = AvARRAY(av); 17822 U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; 17823 17824 si = *ary; /* ary[0] = the string to initialize the swash with */ 17825 17826 if (av_tindex_nomg(av) >= 2) { 17827 if (only_utf8_locale_ptr 17828 && ary[2] 17829 && ary[2] != &PL_sv_undef) 17830 { 17831 *only_utf8_locale_ptr = ary[2]; 17832 } 17833 else { 17834 assert(only_utf8_locale_ptr); 17835 *only_utf8_locale_ptr = NULL; 17836 } 17837 17838 /* Elements 3 and 4 are either both present or both absent. [3] 17839 * is any inversion list generated at compile time; [4] 17840 * indicates if that inversion list has any user-defined 17841 * properties in it. */ 17842 if (av_tindex_nomg(av) >= 3) { 17843 invlist = ary[3]; 17844 if (SvUV(ary[4])) { 17845 swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY; 17846 } 17847 } 17848 else { 17849 invlist = NULL; 17850 } 17851 } 17852 17853 /* Element [1] is reserved for the set-up swash. If already there, 17854 * return it; if not, create it and store it there */ 17855 if (ary[1] && SvROK(ary[1])) { 17856 sw = ary[1]; 17857 } 17858 else if (doinit && ((si && si != &PL_sv_undef) 17859 || (invlist && invlist != &PL_sv_undef))) { 17860 assert(si); 17861 sw = _core_swash_init("utf8", /* the utf8 package */ 17862 "", /* nameless */ 17863 si, 17864 1, /* binary */ 17865 0, /* not from tr/// */ 17866 invlist, 17867 &swash_init_flags); 17868 (void)av_store(av, 1, sw); 17869 } 17870 } 17871 } 17872 17873 /* If requested, return a printable version of what this swash matches */ 17874 if (listsvp) { 17875 SV* matches_string = NULL; 17876 17877 /* The swash should be used, if possible, to get the data, as it 17878 * contains the resolved data. But this function can be called at 17879 * compile-time, before everything gets resolved, in which case we 17880 * return the currently best available information, which is the string 17881 * that will eventually be used to do that resolving, 'si' */ 17882 if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL) 17883 && (si && si != &PL_sv_undef)) 17884 { 17885 /* Here, we only have 'si' (and possibly some passed-in data in 17886 * 'invlist', which is handled below) If the caller only wants 17887 * 'si', use that. */ 17888 if (! output_invlist) { 17889 matches_string = newSVsv(si); 17890 } 17891 else { 17892 /* But if the caller wants an inversion list of the node, we 17893 * need to parse 'si' and place as much as possible in the 17894 * desired output inversion list, making 'matches_string' only 17895 * contain the currently unresolvable things */ 17896 const char *si_string = SvPVX(si); 17897 STRLEN remaining = SvCUR(si); 17898 UV prev_cp = 0; 17899 U8 count = 0; 17900 17901 /* Ignore everything before the first new-line */ 17902 while (*si_string != '\n' && remaining > 0) { 17903 si_string++; 17904 remaining--; 17905 } 17906 assert(remaining > 0); 17907 17908 si_string++; 17909 remaining--; 17910 17911 while (remaining > 0) { 17912 17913 /* The data consists of just strings defining user-defined 17914 * property names, but in prior incarnations, and perhaps 17915 * somehow from pluggable regex engines, it could still 17916 * hold hex code point definitions. Each component of a 17917 * range would be separated by a tab, and each range by a 17918 * new-line. If these are found, instead add them to the 17919 * inversion list */ 17920 I32 grok_flags = PERL_SCAN_SILENT_ILLDIGIT 17921 |PERL_SCAN_SILENT_NON_PORTABLE; 17922 STRLEN len = remaining; 17923 UV cp = grok_hex(si_string, &len, &grok_flags, NULL); 17924 17925 /* If the hex decode routine found something, it should go 17926 * up to the next \n */ 17927 if ( *(si_string + len) == '\n') { 17928 if (count) { /* 2nd code point on line */ 17929 *output_invlist = _add_range_to_invlist(*output_invlist, prev_cp, cp); 17930 } 17931 else { 17932 *output_invlist = add_cp_to_invlist(*output_invlist, cp); 17933 } 17934 count = 0; 17935 goto prepare_for_next_iteration; 17936 } 17937 17938 /* If the hex decode was instead for the lower range limit, 17939 * save it, and go parse the upper range limit */ 17940 if (*(si_string + len) == '\t') { 17941 assert(count == 0); 17942 17943 prev_cp = cp; 17944 count = 1; 17945 prepare_for_next_iteration: 17946 si_string += len + 1; 17947 remaining -= len + 1; 17948 continue; 17949 } 17950 17951 /* Here, didn't find a legal hex number. Just add it from 17952 * here to the next \n */ 17953 17954 remaining -= len; 17955 while (*(si_string + len) != '\n' && remaining > 0) { 17956 remaining--; 17957 len++; 17958 } 17959 if (*(si_string + len) == '\n') { 17960 len++; 17961 remaining--; 17962 } 17963 if (matches_string) { 17964 sv_catpvn(matches_string, si_string, len - 1); 17965 } 17966 else { 17967 matches_string = newSVpvn(si_string, len - 1); 17968 } 17969 si_string += len; 17970 sv_catpvs(matches_string, " "); 17971 } /* end of loop through the text */ 17972 17973 assert(matches_string); 17974 if (SvCUR(matches_string)) { /* Get rid of trailing blank */ 17975 SvCUR_set(matches_string, SvCUR(matches_string) - 1); 17976 } 17977 } /* end of has an 'si' but no swash */ 17978 } 17979 17980 /* If we have a swash in place, its equivalent inversion list was above 17981 * placed into 'invlist'. If not, this variable may contain a stored 17982 * inversion list which is information beyond what is in 'si' */ 17983 if (invlist) { 17984 17985 /* Again, if the caller doesn't want the output inversion list, put 17986 * everything in 'matches-string' */ 17987 if (! output_invlist) { 17988 if ( ! matches_string) { 17989 matches_string = newSVpvs("\n"); 17990 } 17991 sv_catsv(matches_string, invlist_contents(invlist, 17992 TRUE /* traditional style */ 17993 )); 17994 } 17995 else if (! *output_invlist) { 17996 *output_invlist = invlist_clone(invlist); 17997 } 17998 else { 17999 _invlist_union(*output_invlist, invlist, output_invlist); 18000 } 18001 } 18002 18003 *listsvp = matches_string; 18004 } 18005 18006 return sw; 18007 } 18008 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */ 18009 18010 /* reg_skipcomment() 18011 18012 Absorbs an /x style # comment from the input stream, 18013 returning a pointer to the first character beyond the comment, or if the 18014 comment terminates the pattern without anything following it, this returns 18015 one past the final character of the pattern (in other words, RExC_end) and 18016 sets the REG_RUN_ON_COMMENT_SEEN flag. 18017 18018 Note it's the callers responsibility to ensure that we are 18019 actually in /x mode 18020 18021 */ 18022 18023 PERL_STATIC_INLINE char* 18024 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p) 18025 { 18026 PERL_ARGS_ASSERT_REG_SKIPCOMMENT; 18027 18028 assert(*p == '#'); 18029 18030 while (p < RExC_end) { 18031 if (*(++p) == '\n') { 18032 return p+1; 18033 } 18034 } 18035 18036 /* we ran off the end of the pattern without ending the comment, so we have 18037 * to add an \n when wrapping */ 18038 RExC_seen |= REG_RUN_ON_COMMENT_SEEN; 18039 return p; 18040 } 18041 18042 STATIC void 18043 S_skip_to_be_ignored_text(pTHX_ RExC_state_t *pRExC_state, 18044 char ** p, 18045 const bool force_to_xmod 18046 ) 18047 { 18048 /* If the text at the current parse position '*p' is a '(?#...)' comment, 18049 * or if we are under /x or 'force_to_xmod' is TRUE, and the text at '*p' 18050 * is /x whitespace, advance '*p' so that on exit it points to the first 18051 * byte past all such white space and comments */ 18052 18053 const bool use_xmod = force_to_xmod || (RExC_flags & RXf_PMf_EXTENDED); 18054 18055 PERL_ARGS_ASSERT_SKIP_TO_BE_IGNORED_TEXT; 18056 18057 assert( ! UTF || UTF8_IS_INVARIANT(**p) || UTF8_IS_START(**p)); 18058 18059 for (;;) { 18060 if (RExC_end - (*p) >= 3 18061 && *(*p) == '(' 18062 && *(*p + 1) == '?' 18063 && *(*p + 2) == '#') 18064 { 18065 while (*(*p) != ')') { 18066 if ((*p) == RExC_end) 18067 FAIL("Sequence (?#... not terminated"); 18068 (*p)++; 18069 } 18070 (*p)++; 18071 continue; 18072 } 18073 18074 if (use_xmod) { 18075 const char * save_p = *p; 18076 while ((*p) < RExC_end) { 18077 STRLEN len; 18078 if ((len = is_PATWS_safe((*p), RExC_end, UTF))) { 18079 (*p) += len; 18080 } 18081 else if (*(*p) == '#') { 18082 (*p) = reg_skipcomment(pRExC_state, (*p)); 18083 } 18084 else { 18085 break; 18086 } 18087 } 18088 if (*p != save_p) { 18089 continue; 18090 } 18091 } 18092 18093 break; 18094 } 18095 18096 return; 18097 } 18098 18099 /* nextchar() 18100 18101 Advances the parse position by one byte, unless that byte is the beginning 18102 of a '(?#...)' style comment, or is /x whitespace and /x is in effect. In 18103 those two cases, the parse position is advanced beyond all such comments and 18104 white space. 18105 18106 This is the UTF, (?#...), and /x friendly way of saying RExC_parse++. 18107 */ 18108 18109 STATIC void 18110 S_nextchar(pTHX_ RExC_state_t *pRExC_state) 18111 { 18112 PERL_ARGS_ASSERT_NEXTCHAR; 18113 18114 if (RExC_parse < RExC_end) { 18115 assert( ! UTF 18116 || UTF8_IS_INVARIANT(*RExC_parse) 18117 || UTF8_IS_START(*RExC_parse)); 18118 18119 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; 18120 18121 skip_to_be_ignored_text(pRExC_state, &RExC_parse, 18122 FALSE /* Don't assume /x */ ); 18123 } 18124 } 18125 18126 STATIC regnode * 18127 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name) 18128 { 18129 /* Allocate a regnode for 'op' and returns it, with 'extra_size' extra 18130 * space. In pass1, it aligns and increments RExC_size; in pass2, 18131 * RExC_emit */ 18132 18133 regnode * const ret = RExC_emit; 18134 GET_RE_DEBUG_FLAGS_DECL; 18135 18136 PERL_ARGS_ASSERT_REGNODE_GUTS; 18137 18138 assert(extra_size >= regarglen[op]); 18139 18140 if (SIZE_ONLY) { 18141 SIZE_ALIGN(RExC_size); 18142 RExC_size += 1 + extra_size; 18143 return(ret); 18144 } 18145 if (RExC_emit >= RExC_emit_bound) 18146 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p", 18147 op, (void*)RExC_emit, (void*)RExC_emit_bound); 18148 18149 NODE_ALIGN_FILL(ret); 18150 #ifndef RE_TRACK_PATTERN_OFFSETS 18151 PERL_UNUSED_ARG(name); 18152 #else 18153 if (RExC_offsets) { /* MJD */ 18154 MJD_OFFSET_DEBUG( 18155 ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n", 18156 name, __LINE__, 18157 PL_reg_name[op], 18158 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] 18159 ? "Overwriting end of array!\n" : "OK", 18160 (UV)(RExC_emit - RExC_emit_start), 18161 (UV)(RExC_parse - RExC_start), 18162 (UV)RExC_offsets[0])); 18163 Set_Node_Offset(RExC_emit, RExC_parse + (op == END)); 18164 } 18165 #endif 18166 return(ret); 18167 } 18168 18169 /* 18170 - reg_node - emit a node 18171 */ 18172 STATIC regnode * /* Location. */ 18173 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) 18174 { 18175 regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg_node"); 18176 18177 PERL_ARGS_ASSERT_REG_NODE; 18178 18179 assert(regarglen[op] == 0); 18180 18181 if (PASS2) { 18182 regnode *ptr = ret; 18183 FILL_ADVANCE_NODE(ptr, op); 18184 RExC_emit = ptr; 18185 } 18186 return(ret); 18187 } 18188 18189 /* 18190 - reganode - emit a node with an argument 18191 */ 18192 STATIC regnode * /* Location. */ 18193 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) 18194 { 18195 regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reganode"); 18196 18197 PERL_ARGS_ASSERT_REGANODE; 18198 18199 assert(regarglen[op] == 1); 18200 18201 if (PASS2) { 18202 regnode *ptr = ret; 18203 FILL_ADVANCE_NODE_ARG(ptr, op, arg); 18204 RExC_emit = ptr; 18205 } 18206 return(ret); 18207 } 18208 18209 STATIC regnode * 18210 S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2) 18211 { 18212 /* emit a node with U32 and I32 arguments */ 18213 18214 regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg2Lanode"); 18215 18216 PERL_ARGS_ASSERT_REG2LANODE; 18217 18218 assert(regarglen[op] == 2); 18219 18220 if (PASS2) { 18221 regnode *ptr = ret; 18222 FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2); 18223 RExC_emit = ptr; 18224 } 18225 return(ret); 18226 } 18227 18228 /* 18229 - reginsert - insert an operator in front of already-emitted operand 18230 * 18231 * Means relocating the operand. 18232 */ 18233 STATIC void 18234 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) 18235 { 18236 regnode *src; 18237 regnode *dst; 18238 regnode *place; 18239 const int offset = regarglen[(U8)op]; 18240 const int size = NODE_STEP_REGNODE + offset; 18241 GET_RE_DEBUG_FLAGS_DECL; 18242 18243 PERL_ARGS_ASSERT_REGINSERT; 18244 PERL_UNUSED_CONTEXT; 18245 PERL_UNUSED_ARG(depth); 18246 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */ 18247 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]); 18248 if (SIZE_ONLY) { 18249 RExC_size += size; 18250 return; 18251 } 18252 assert(!RExC_study_started); /* I believe we should never use reginsert once we have started 18253 studying. If this is wrong then we need to adjust RExC_recurse 18254 below like we do with RExC_open_parens/RExC_close_parens. */ 18255 src = RExC_emit; 18256 RExC_emit += size; 18257 dst = RExC_emit; 18258 if (RExC_open_parens) { 18259 int paren; 18260 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/ 18261 /* remember that RExC_npar is rex->nparens + 1, 18262 * iow it is 1 more than the number of parens seen in 18263 * the pattern so far. */ 18264 for ( paren=0 ; paren < RExC_npar ; paren++ ) { 18265 /* note, RExC_open_parens[0] is the start of the 18266 * regex, it can't move. RExC_close_parens[0] is the end 18267 * of the regex, it *can* move. */ 18268 if ( paren && RExC_open_parens[paren] >= opnd ) { 18269 /*DEBUG_PARSE_FMT("open"," - %d",size);*/ 18270 RExC_open_parens[paren] += size; 18271 } else { 18272 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/ 18273 } 18274 if ( RExC_close_parens[paren] >= opnd ) { 18275 /*DEBUG_PARSE_FMT("close"," - %d",size);*/ 18276 RExC_close_parens[paren] += size; 18277 } else { 18278 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/ 18279 } 18280 } 18281 } 18282 if (RExC_end_op) 18283 RExC_end_op += size; 18284 18285 while (src > opnd) { 18286 StructCopy(--src, --dst, regnode); 18287 #ifdef RE_TRACK_PATTERN_OFFSETS 18288 if (RExC_offsets) { /* MJD 20010112 */ 18289 MJD_OFFSET_DEBUG( 18290 ("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n", 18291 "reg_insert", 18292 __LINE__, 18293 PL_reg_name[op], 18294 (UV)(dst - RExC_emit_start) > RExC_offsets[0] 18295 ? "Overwriting end of array!\n" : "OK", 18296 (UV)(src - RExC_emit_start), 18297 (UV)(dst - RExC_emit_start), 18298 (UV)RExC_offsets[0])); 18299 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src)); 18300 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src)); 18301 } 18302 #endif 18303 } 18304 18305 18306 place = opnd; /* Op node, where operand used to be. */ 18307 #ifdef RE_TRACK_PATTERN_OFFSETS 18308 if (RExC_offsets) { /* MJD */ 18309 MJD_OFFSET_DEBUG( 18310 ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n", 18311 "reginsert", 18312 __LINE__, 18313 PL_reg_name[op], 18314 (UV)(place - RExC_emit_start) > RExC_offsets[0] 18315 ? "Overwriting end of array!\n" : "OK", 18316 (UV)(place - RExC_emit_start), 18317 (UV)(RExC_parse - RExC_start), 18318 (UV)RExC_offsets[0])); 18319 Set_Node_Offset(place, RExC_parse); 18320 Set_Node_Length(place, 1); 18321 } 18322 #endif 18323 src = NEXTOPER(place); 18324 FILL_ADVANCE_NODE(place, op); 18325 Zero(src, offset, regnode); 18326 } 18327 18328 /* 18329 - regtail - set the next-pointer at the end of a node chain of p to val. 18330 - SEE ALSO: regtail_study 18331 */ 18332 STATIC void 18333 S_regtail(pTHX_ RExC_state_t * pRExC_state, 18334 const regnode * const p, 18335 const regnode * const val, 18336 const U32 depth) 18337 { 18338 regnode *scan; 18339 GET_RE_DEBUG_FLAGS_DECL; 18340 18341 PERL_ARGS_ASSERT_REGTAIL; 18342 #ifndef DEBUGGING 18343 PERL_UNUSED_ARG(depth); 18344 #endif 18345 18346 if (SIZE_ONLY) 18347 return; 18348 18349 /* Find last node. */ 18350 scan = (regnode *) p; 18351 for (;;) { 18352 regnode * const temp = regnext(scan); 18353 DEBUG_PARSE_r({ 18354 DEBUG_PARSE_MSG((scan==p ? "tail" : "")); 18355 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state); 18356 Perl_re_printf( aTHX_ "~ %s (%d) %s %s\n", 18357 SvPV_nolen_const(RExC_mysv), REG_NODE_NUM(scan), 18358 (temp == NULL ? "->" : ""), 18359 (temp == NULL ? PL_reg_name[OP(val)] : "") 18360 ); 18361 }); 18362 if (temp == NULL) 18363 break; 18364 scan = temp; 18365 } 18366 18367 if (reg_off_by_arg[OP(scan)]) { 18368 ARG_SET(scan, val - scan); 18369 } 18370 else { 18371 NEXT_OFF(scan) = val - scan; 18372 } 18373 } 18374 18375 #ifdef DEBUGGING 18376 /* 18377 - regtail_study - set the next-pointer at the end of a node chain of p to val. 18378 - Look for optimizable sequences at the same time. 18379 - currently only looks for EXACT chains. 18380 18381 This is experimental code. The idea is to use this routine to perform 18382 in place optimizations on branches and groups as they are constructed, 18383 with the long term intention of removing optimization from study_chunk so 18384 that it is purely analytical. 18385 18386 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used 18387 to control which is which. 18388 18389 */ 18390 /* TODO: All four parms should be const */ 18391 18392 STATIC U8 18393 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, 18394 const regnode *val,U32 depth) 18395 { 18396 regnode *scan; 18397 U8 exact = PSEUDO; 18398 #ifdef EXPERIMENTAL_INPLACESCAN 18399 I32 min = 0; 18400 #endif 18401 GET_RE_DEBUG_FLAGS_DECL; 18402 18403 PERL_ARGS_ASSERT_REGTAIL_STUDY; 18404 18405 18406 if (SIZE_ONLY) 18407 return exact; 18408 18409 /* Find last node. */ 18410 18411 scan = p; 18412 for (;;) { 18413 regnode * const temp = regnext(scan); 18414 #ifdef EXPERIMENTAL_INPLACESCAN 18415 if (PL_regkind[OP(scan)] == EXACT) { 18416 bool unfolded_multi_char; /* Unexamined in this routine */ 18417 if (join_exact(pRExC_state, scan, &min, 18418 &unfolded_multi_char, 1, val, depth+1)) 18419 return EXACT; 18420 } 18421 #endif 18422 if ( exact ) { 18423 switch (OP(scan)) { 18424 case EXACT: 18425 case EXACTL: 18426 case EXACTF: 18427 case EXACTFA_NO_TRIE: 18428 case EXACTFA: 18429 case EXACTFU: 18430 case EXACTFLU8: 18431 case EXACTFU_SS: 18432 case EXACTFL: 18433 if( exact == PSEUDO ) 18434 exact= OP(scan); 18435 else if ( exact != OP(scan) ) 18436 exact= 0; 18437 case NOTHING: 18438 break; 18439 default: 18440 exact= 0; 18441 } 18442 } 18443 DEBUG_PARSE_r({ 18444 DEBUG_PARSE_MSG((scan==p ? "tsdy" : "")); 18445 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state); 18446 Perl_re_printf( aTHX_ "~ %s (%d) -> %s\n", 18447 SvPV_nolen_const(RExC_mysv), 18448 REG_NODE_NUM(scan), 18449 PL_reg_name[exact]); 18450 }); 18451 if (temp == NULL) 18452 break; 18453 scan = temp; 18454 } 18455 DEBUG_PARSE_r({ 18456 DEBUG_PARSE_MSG(""); 18457 regprop(RExC_rx, RExC_mysv, val, NULL, pRExC_state); 18458 Perl_re_printf( aTHX_ 18459 "~ attach to %s (%"IVdf") offset to %"IVdf"\n", 18460 SvPV_nolen_const(RExC_mysv), 18461 (IV)REG_NODE_NUM(val), 18462 (IV)(val - scan) 18463 ); 18464 }); 18465 if (reg_off_by_arg[OP(scan)]) { 18466 ARG_SET(scan, val - scan); 18467 } 18468 else { 18469 NEXT_OFF(scan) = val - scan; 18470 } 18471 18472 return exact; 18473 } 18474 #endif 18475 18476 /* 18477 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form 18478 */ 18479 #ifdef DEBUGGING 18480 18481 static void 18482 S_regdump_intflags(pTHX_ const char *lead, const U32 flags) 18483 { 18484 int bit; 18485 int set=0; 18486 18487 ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8); 18488 18489 for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) { 18490 if (flags & (1<<bit)) { 18491 if (!set++ && lead) 18492 Perl_re_printf( aTHX_ "%s",lead); 18493 Perl_re_printf( aTHX_ "%s ",PL_reg_intflags_name[bit]); 18494 } 18495 } 18496 if (lead) { 18497 if (set) 18498 Perl_re_printf( aTHX_ "\n"); 18499 else 18500 Perl_re_printf( aTHX_ "%s[none-set]\n",lead); 18501 } 18502 } 18503 18504 static void 18505 S_regdump_extflags(pTHX_ const char *lead, const U32 flags) 18506 { 18507 int bit; 18508 int set=0; 18509 regex_charset cs; 18510 18511 ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8); 18512 18513 for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) { 18514 if (flags & (1<<bit)) { 18515 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */ 18516 continue; 18517 } 18518 if (!set++ && lead) 18519 Perl_re_printf( aTHX_ "%s",lead); 18520 Perl_re_printf( aTHX_ "%s ",PL_reg_extflags_name[bit]); 18521 } 18522 } 18523 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) { 18524 if (!set++ && lead) { 18525 Perl_re_printf( aTHX_ "%s",lead); 18526 } 18527 switch (cs) { 18528 case REGEX_UNICODE_CHARSET: 18529 Perl_re_printf( aTHX_ "UNICODE"); 18530 break; 18531 case REGEX_LOCALE_CHARSET: 18532 Perl_re_printf( aTHX_ "LOCALE"); 18533 break; 18534 case REGEX_ASCII_RESTRICTED_CHARSET: 18535 Perl_re_printf( aTHX_ "ASCII-RESTRICTED"); 18536 break; 18537 case REGEX_ASCII_MORE_RESTRICTED_CHARSET: 18538 Perl_re_printf( aTHX_ "ASCII-MORE_RESTRICTED"); 18539 break; 18540 default: 18541 Perl_re_printf( aTHX_ "UNKNOWN CHARACTER SET"); 18542 break; 18543 } 18544 } 18545 if (lead) { 18546 if (set) 18547 Perl_re_printf( aTHX_ "\n"); 18548 else 18549 Perl_re_printf( aTHX_ "%s[none-set]\n",lead); 18550 } 18551 } 18552 #endif 18553 18554 void 18555 Perl_regdump(pTHX_ const regexp *r) 18556 { 18557 #ifdef DEBUGGING 18558 SV * const sv = sv_newmortal(); 18559 SV *dsv= sv_newmortal(); 18560 RXi_GET_DECL(r,ri); 18561 GET_RE_DEBUG_FLAGS_DECL; 18562 18563 PERL_ARGS_ASSERT_REGDUMP; 18564 18565 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0); 18566 18567 /* Header fields of interest. */ 18568 if (r->anchored_substr) { 18569 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr), 18570 RE_SV_DUMPLEN(r->anchored_substr), 30); 18571 Perl_re_printf( aTHX_ 18572 "anchored %s%s at %"IVdf" ", 18573 s, RE_SV_TAIL(r->anchored_substr), 18574 (IV)r->anchored_offset); 18575 } else if (r->anchored_utf8) { 18576 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8), 18577 RE_SV_DUMPLEN(r->anchored_utf8), 30); 18578 Perl_re_printf( aTHX_ 18579 "anchored utf8 %s%s at %"IVdf" ", 18580 s, RE_SV_TAIL(r->anchored_utf8), 18581 (IV)r->anchored_offset); 18582 } 18583 if (r->float_substr) { 18584 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr), 18585 RE_SV_DUMPLEN(r->float_substr), 30); 18586 Perl_re_printf( aTHX_ 18587 "floating %s%s at %"IVdf"..%"UVuf" ", 18588 s, RE_SV_TAIL(r->float_substr), 18589 (IV)r->float_min_offset, (UV)r->float_max_offset); 18590 } else if (r->float_utf8) { 18591 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8), 18592 RE_SV_DUMPLEN(r->float_utf8), 30); 18593 Perl_re_printf( aTHX_ 18594 "floating utf8 %s%s at %"IVdf"..%"UVuf" ", 18595 s, RE_SV_TAIL(r->float_utf8), 18596 (IV)r->float_min_offset, (UV)r->float_max_offset); 18597 } 18598 if (r->check_substr || r->check_utf8) 18599 Perl_re_printf( aTHX_ 18600 (const char *) 18601 (r->check_substr == r->float_substr 18602 && r->check_utf8 == r->float_utf8 18603 ? "(checking floating" : "(checking anchored")); 18604 if (r->intflags & PREGf_NOSCAN) 18605 Perl_re_printf( aTHX_ " noscan"); 18606 if (r->extflags & RXf_CHECK_ALL) 18607 Perl_re_printf( aTHX_ " isall"); 18608 if (r->check_substr || r->check_utf8) 18609 Perl_re_printf( aTHX_ ") "); 18610 18611 if (ri->regstclass) { 18612 regprop(r, sv, ri->regstclass, NULL, NULL); 18613 Perl_re_printf( aTHX_ "stclass %s ", SvPVX_const(sv)); 18614 } 18615 if (r->intflags & PREGf_ANCH) { 18616 Perl_re_printf( aTHX_ "anchored"); 18617 if (r->intflags & PREGf_ANCH_MBOL) 18618 Perl_re_printf( aTHX_ "(MBOL)"); 18619 if (r->intflags & PREGf_ANCH_SBOL) 18620 Perl_re_printf( aTHX_ "(SBOL)"); 18621 if (r->intflags & PREGf_ANCH_GPOS) 18622 Perl_re_printf( aTHX_ "(GPOS)"); 18623 Perl_re_printf( aTHX_ " "); 18624 } 18625 if (r->intflags & PREGf_GPOS_SEEN) 18626 Perl_re_printf( aTHX_ "GPOS:%"UVuf" ", (UV)r->gofs); 18627 if (r->intflags & PREGf_SKIP) 18628 Perl_re_printf( aTHX_ "plus "); 18629 if (r->intflags & PREGf_IMPLICIT) 18630 Perl_re_printf( aTHX_ "implicit "); 18631 Perl_re_printf( aTHX_ "minlen %"IVdf" ", (IV)r->minlen); 18632 if (r->extflags & RXf_EVAL_SEEN) 18633 Perl_re_printf( aTHX_ "with eval "); 18634 Perl_re_printf( aTHX_ "\n"); 18635 DEBUG_FLAGS_r({ 18636 regdump_extflags("r->extflags: ",r->extflags); 18637 regdump_intflags("r->intflags: ",r->intflags); 18638 }); 18639 #else 18640 PERL_ARGS_ASSERT_REGDUMP; 18641 PERL_UNUSED_CONTEXT; 18642 PERL_UNUSED_ARG(r); 18643 #endif /* DEBUGGING */ 18644 } 18645 18646 /* Should be synchronized with ANYOF_ #defines in regcomp.h */ 18647 #ifdef DEBUGGING 18648 18649 # if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 \ 18650 || _CC_LOWER != 3 || _CC_UPPER != 4 || _CC_PUNCT != 5 \ 18651 || _CC_PRINT != 6 || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 \ 18652 || _CC_CASED != 9 || _CC_SPACE != 10 || _CC_BLANK != 11 \ 18653 || _CC_XDIGIT != 12 || _CC_CNTRL != 13 || _CC_ASCII != 14 \ 18654 || _CC_VERTSPACE != 15 18655 # error Need to adjust order of anyofs[] 18656 # endif 18657 static const char * const anyofs[] = { 18658 "\\w", 18659 "\\W", 18660 "\\d", 18661 "\\D", 18662 "[:alpha:]", 18663 "[:^alpha:]", 18664 "[:lower:]", 18665 "[:^lower:]", 18666 "[:upper:]", 18667 "[:^upper:]", 18668 "[:punct:]", 18669 "[:^punct:]", 18670 "[:print:]", 18671 "[:^print:]", 18672 "[:alnum:]", 18673 "[:^alnum:]", 18674 "[:graph:]", 18675 "[:^graph:]", 18676 "[:cased:]", 18677 "[:^cased:]", 18678 "\\s", 18679 "\\S", 18680 "[:blank:]", 18681 "[:^blank:]", 18682 "[:xdigit:]", 18683 "[:^xdigit:]", 18684 "[:cntrl:]", 18685 "[:^cntrl:]", 18686 "[:ascii:]", 18687 "[:^ascii:]", 18688 "\\v", 18689 "\\V" 18690 }; 18691 #endif 18692 18693 /* 18694 - regprop - printable representation of opcode, with run time support 18695 */ 18696 18697 void 18698 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state) 18699 { 18700 #ifdef DEBUGGING 18701 int k; 18702 RXi_GET_DECL(prog,progi); 18703 GET_RE_DEBUG_FLAGS_DECL; 18704 18705 PERL_ARGS_ASSERT_REGPROP; 18706 18707 sv_setpvn(sv, "", 0); 18708 18709 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */ 18710 /* It would be nice to FAIL() here, but this may be called from 18711 regexec.c, and it would be hard to supply pRExC_state. */ 18712 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", 18713 (int)OP(o), (int)REGNODE_MAX); 18714 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */ 18715 18716 k = PL_regkind[OP(o)]; 18717 18718 if (k == EXACT) { 18719 sv_catpvs(sv, " "); 18720 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT) 18721 * is a crude hack but it may be the best for now since 18722 * we have no flag "this EXACTish node was UTF-8" 18723 * --jhi */ 18724 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1], 18725 PERL_PV_ESCAPE_UNI_DETECT | 18726 PERL_PV_ESCAPE_NONASCII | 18727 PERL_PV_PRETTY_ELLIPSES | 18728 PERL_PV_PRETTY_LTGT | 18729 PERL_PV_PRETTY_NOCLEAR 18730 ); 18731 } else if (k == TRIE) { 18732 /* print the details of the trie in dumpuntil instead, as 18733 * progi->data isn't available here */ 18734 const char op = OP(o); 18735 const U32 n = ARG(o); 18736 const reg_ac_data * const ac = IS_TRIE_AC(op) ? 18737 (reg_ac_data *)progi->data->data[n] : 18738 NULL; 18739 const reg_trie_data * const trie 18740 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie]; 18741 18742 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]); 18743 DEBUG_TRIE_COMPILE_r( 18744 Perl_sv_catpvf(aTHX_ sv, 18745 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">", 18746 (UV)trie->startstate, 18747 (IV)trie->statecount-1, /* -1 because of the unused 0 element */ 18748 (UV)trie->wordcount, 18749 (UV)trie->minlen, 18750 (UV)trie->maxlen, 18751 (UV)TRIE_CHARCOUNT(trie), 18752 (UV)trie->uniquecharcount 18753 ); 18754 ); 18755 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) { 18756 sv_catpvs(sv, "["); 18757 (void) put_charclass_bitmap_innards(sv, 18758 ((IS_ANYOF_TRIE(op)) 18759 ? ANYOF_BITMAP(o) 18760 : TRIE_BITMAP(trie)), 18761 NULL, 18762 NULL, 18763 NULL, 18764 FALSE 18765 ); 18766 sv_catpvs(sv, "]"); 18767 } 18768 18769 } else if (k == CURLY) { 18770 U32 lo = ARG1(o), hi = ARG2(o); 18771 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX) 18772 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */ 18773 Perl_sv_catpvf(aTHX_ sv, "{%u,", (unsigned) lo); 18774 if (hi == REG_INFTY) 18775 sv_catpvs(sv, "INFTY"); 18776 else 18777 Perl_sv_catpvf(aTHX_ sv, "%u", (unsigned) hi); 18778 sv_catpvs(sv, "}"); 18779 } 18780 else if (k == WHILEM && o->flags) /* Ordinal/of */ 18781 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4); 18782 else if (k == REF || k == OPEN || k == CLOSE 18783 || k == GROUPP || OP(o)==ACCEPT) 18784 { 18785 AV *name_list= NULL; 18786 U32 parno= OP(o) == ACCEPT ? (U32)ARG2L(o) : ARG(o); 18787 Perl_sv_catpvf(aTHX_ sv, "%"UVuf, (UV)parno); /* Parenth number */ 18788 if ( RXp_PAREN_NAMES(prog) ) { 18789 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]); 18790 } else if ( pRExC_state ) { 18791 name_list= RExC_paren_name_list; 18792 } 18793 if (name_list) { 18794 if ( k != REF || (OP(o) < NREF)) { 18795 SV **name= av_fetch(name_list, parno, 0 ); 18796 if (name) 18797 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name)); 18798 } 18799 else { 18800 SV *sv_dat= MUTABLE_SV(progi->data->data[ parno ]); 18801 I32 *nums=(I32*)SvPVX(sv_dat); 18802 SV **name= av_fetch(name_list, nums[0], 0 ); 18803 I32 n; 18804 if (name) { 18805 for ( n=0; n<SvIVX(sv_dat); n++ ) { 18806 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf, 18807 (n ? "," : ""), (IV)nums[n]); 18808 } 18809 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name)); 18810 } 18811 } 18812 } 18813 if ( k == REF && reginfo) { 18814 U32 n = ARG(o); /* which paren pair */ 18815 I32 ln = prog->offs[n].start; 18816 if (prog->lastparen < n || ln == -1) 18817 Perl_sv_catpvf(aTHX_ sv, ": FAIL"); 18818 else if (ln == prog->offs[n].end) 18819 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING"); 18820 else { 18821 const char *s = reginfo->strbeg + ln; 18822 Perl_sv_catpvf(aTHX_ sv, ": "); 18823 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0, 18824 PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE ); 18825 } 18826 } 18827 } else if (k == GOSUB) { 18828 AV *name_list= NULL; 18829 if ( RXp_PAREN_NAMES(prog) ) { 18830 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]); 18831 } else if ( pRExC_state ) { 18832 name_list= RExC_paren_name_list; 18833 } 18834 18835 /* Paren and offset */ 18836 Perl_sv_catpvf(aTHX_ sv, "%d[%+d:%d]", (int)ARG(o),(int)ARG2L(o), 18837 (int)((o + (int)ARG2L(o)) - progi->program) ); 18838 if (name_list) { 18839 SV **name= av_fetch(name_list, ARG(o), 0 ); 18840 if (name) 18841 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name)); 18842 } 18843 } 18844 else if (k == LOGICAL) 18845 /* 2: embedded, otherwise 1 */ 18846 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); 18847 else if (k == ANYOF) { 18848 const U8 flags = ANYOF_FLAGS(o); 18849 bool do_sep = FALSE; /* Do we need to separate various components of 18850 the output? */ 18851 /* Set if there is still an unresolved user-defined property */ 18852 SV *unresolved = NULL; 18853 18854 /* Things that are ignored except when the runtime locale is UTF-8 */ 18855 SV *only_utf8_locale_invlist = NULL; 18856 18857 /* Code points that don't fit in the bitmap */ 18858 SV *nonbitmap_invlist = NULL; 18859 18860 /* And things that aren't in the bitmap, but are small enough to be */ 18861 SV* bitmap_range_not_in_bitmap = NULL; 18862 18863 const bool inverted = flags & ANYOF_INVERT; 18864 18865 if (OP(o) == ANYOFL) { 18866 if (ANYOFL_UTF8_LOCALE_REQD(flags)) { 18867 sv_catpvs(sv, "{utf8-locale-reqd}"); 18868 } 18869 if (flags & ANYOFL_FOLD) { 18870 sv_catpvs(sv, "{i}"); 18871 } 18872 } 18873 18874 /* If there is stuff outside the bitmap, get it */ 18875 if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) { 18876 (void) _get_regclass_nonbitmap_data(prog, o, FALSE, 18877 &unresolved, 18878 &only_utf8_locale_invlist, 18879 &nonbitmap_invlist); 18880 /* The non-bitmap data may contain stuff that could fit in the 18881 * bitmap. This could come from a user-defined property being 18882 * finally resolved when this call was done; or much more likely 18883 * because there are matches that require UTF-8 to be valid, and so 18884 * aren't in the bitmap. This is teased apart later */ 18885 _invlist_intersection(nonbitmap_invlist, 18886 PL_InBitmap, 18887 &bitmap_range_not_in_bitmap); 18888 /* Leave just the things that don't fit into the bitmap */ 18889 _invlist_subtract(nonbitmap_invlist, 18890 PL_InBitmap, 18891 &nonbitmap_invlist); 18892 } 18893 18894 /* Obey this flag to add all above-the-bitmap code points */ 18895 if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) { 18896 nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist, 18897 NUM_ANYOF_CODE_POINTS, 18898 UV_MAX); 18899 } 18900 18901 /* Ready to start outputting. First, the initial left bracket */ 18902 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); 18903 18904 /* Then all the things that could fit in the bitmap */ 18905 do_sep = put_charclass_bitmap_innards(sv, 18906 ANYOF_BITMAP(o), 18907 bitmap_range_not_in_bitmap, 18908 only_utf8_locale_invlist, 18909 o, 18910 18911 /* Can't try inverting for a 18912 * better display if there are 18913 * things that haven't been 18914 * resolved */ 18915 unresolved != NULL); 18916 SvREFCNT_dec(bitmap_range_not_in_bitmap); 18917 18918 /* If there are user-defined properties which haven't been defined yet, 18919 * output them. If the result is not to be inverted, it is clearest to 18920 * output them in a separate [] from the bitmap range stuff. If the 18921 * result is to be complemented, we have to show everything in one [], 18922 * as the inversion applies to the whole thing. Use {braces} to 18923 * separate them from anything in the bitmap and anything above the 18924 * bitmap. */ 18925 if (unresolved) { 18926 if (inverted) { 18927 if (! do_sep) { /* If didn't output anything in the bitmap */ 18928 sv_catpvs(sv, "^"); 18929 } 18930 sv_catpvs(sv, "{"); 18931 } 18932 else if (do_sep) { 18933 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); 18934 } 18935 sv_catsv(sv, unresolved); 18936 if (inverted) { 18937 sv_catpvs(sv, "}"); 18938 } 18939 do_sep = ! inverted; 18940 } 18941 18942 /* And, finally, add the above-the-bitmap stuff */ 18943 if (nonbitmap_invlist && _invlist_len(nonbitmap_invlist)) { 18944 SV* contents; 18945 18946 /* See if truncation size is overridden */ 18947 const STRLEN dump_len = (PL_dump_re_max_len) 18948 ? PL_dump_re_max_len 18949 : 256; 18950 18951 /* This is output in a separate [] */ 18952 if (do_sep) { 18953 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); 18954 } 18955 18956 /* And, for easy of understanding, it is shown in the 18957 * uncomplemented form if possible. The one exception being if 18958 * there are unresolved items, where the inversion has to be 18959 * delayed until runtime */ 18960 if (inverted && ! unresolved) { 18961 _invlist_invert(nonbitmap_invlist); 18962 _invlist_subtract(nonbitmap_invlist, PL_InBitmap, &nonbitmap_invlist); 18963 } 18964 18965 contents = invlist_contents(nonbitmap_invlist, 18966 FALSE /* output suitable for catsv */ 18967 ); 18968 18969 /* If the output is shorter than the permissible maximum, just do it. */ 18970 if (SvCUR(contents) <= dump_len) { 18971 sv_catsv(sv, contents); 18972 } 18973 else { 18974 const char * contents_string = SvPVX(contents); 18975 STRLEN i = dump_len; 18976 18977 /* Otherwise, start at the permissible max and work back to the 18978 * first break possibility */ 18979 while (i > 0 && contents_string[i] != ' ') { 18980 i--; 18981 } 18982 if (i == 0) { /* Fail-safe. Use the max if we couldn't 18983 find a legal break */ 18984 i = dump_len; 18985 } 18986 18987 sv_catpvn(sv, contents_string, i); 18988 sv_catpvs(sv, "..."); 18989 } 18990 18991 SvREFCNT_dec_NN(contents); 18992 SvREFCNT_dec_NN(nonbitmap_invlist); 18993 } 18994 18995 /* And finally the matching, closing ']' */ 18996 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]); 18997 18998 SvREFCNT_dec(unresolved); 18999 } 19000 else if (k == POSIXD || k == NPOSIXD) { 19001 U8 index = FLAGS(o) * 2; 19002 if (index < C_ARRAY_LENGTH(anyofs)) { 19003 if (*anyofs[index] != '[') { 19004 sv_catpv(sv, "["); 19005 } 19006 sv_catpv(sv, anyofs[index]); 19007 if (*anyofs[index] != '[') { 19008 sv_catpv(sv, "]"); 19009 } 19010 } 19011 else { 19012 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index); 19013 } 19014 } 19015 else if (k == BOUND || k == NBOUND) { 19016 /* Must be synced with order of 'bound_type' in regcomp.h */ 19017 const char * const bounds[] = { 19018 "", /* Traditional */ 19019 "{gcb}", 19020 "{lb}", 19021 "{sb}", 19022 "{wb}" 19023 }; 19024 assert(FLAGS(o) < C_ARRAY_LENGTH(bounds)); 19025 sv_catpv(sv, bounds[FLAGS(o)]); 19026 } 19027 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) 19028 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags)); 19029 else if (OP(o) == SBOL) 19030 Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^"); 19031 19032 /* add on the verb argument if there is one */ 19033 if ( ( k == VERB || OP(o) == ACCEPT || OP(o) == OPFAIL ) && o->flags) { 19034 Perl_sv_catpvf(aTHX_ sv, ":%"SVf, 19035 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ])))); 19036 } 19037 #else 19038 PERL_UNUSED_CONTEXT; 19039 PERL_UNUSED_ARG(sv); 19040 PERL_UNUSED_ARG(o); 19041 PERL_UNUSED_ARG(prog); 19042 PERL_UNUSED_ARG(reginfo); 19043 PERL_UNUSED_ARG(pRExC_state); 19044 #endif /* DEBUGGING */ 19045 } 19046 19047 19048 19049 SV * 19050 Perl_re_intuit_string(pTHX_ REGEXP * const r) 19051 { /* Assume that RE_INTUIT is set */ 19052 struct regexp *const prog = ReANY(r); 19053 GET_RE_DEBUG_FLAGS_DECL; 19054 19055 PERL_ARGS_ASSERT_RE_INTUIT_STRING; 19056 PERL_UNUSED_CONTEXT; 19057 19058 DEBUG_COMPILE_r( 19059 { 19060 const char * const s = SvPV_nolen_const(RX_UTF8(r) 19061 ? prog->check_utf8 : prog->check_substr); 19062 19063 if (!PL_colorset) reginitcolors(); 19064 Perl_re_printf( aTHX_ 19065 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n", 19066 PL_colors[4], 19067 RX_UTF8(r) ? "utf8 " : "", 19068 PL_colors[5],PL_colors[0], 19069 s, 19070 PL_colors[1], 19071 (strlen(s) > 60 ? "..." : "")); 19072 } ); 19073 19074 /* use UTF8 check substring if regexp pattern itself is in UTF8 */ 19075 return RX_UTF8(r) ? prog->check_utf8 : prog->check_substr; 19076 } 19077 19078 /* 19079 pregfree() 19080 19081 handles refcounting and freeing the perl core regexp structure. When 19082 it is necessary to actually free the structure the first thing it 19083 does is call the 'free' method of the regexp_engine associated to 19084 the regexp, allowing the handling of the void *pprivate; member 19085 first. (This routine is not overridable by extensions, which is why 19086 the extensions free is called first.) 19087 19088 See regdupe and regdupe_internal if you change anything here. 19089 */ 19090 #ifndef PERL_IN_XSUB_RE 19091 void 19092 Perl_pregfree(pTHX_ REGEXP *r) 19093 { 19094 SvREFCNT_dec(r); 19095 } 19096 19097 void 19098 Perl_pregfree2(pTHX_ REGEXP *rx) 19099 { 19100 struct regexp *const r = ReANY(rx); 19101 GET_RE_DEBUG_FLAGS_DECL; 19102 19103 PERL_ARGS_ASSERT_PREGFREE2; 19104 19105 if (r->mother_re) { 19106 ReREFCNT_dec(r->mother_re); 19107 } else { 19108 CALLREGFREE_PVT(rx); /* free the private data */ 19109 SvREFCNT_dec(RXp_PAREN_NAMES(r)); 19110 Safefree(r->xpv_len_u.xpvlenu_pv); 19111 } 19112 if (r->substrs) { 19113 SvREFCNT_dec(r->anchored_substr); 19114 SvREFCNT_dec(r->anchored_utf8); 19115 SvREFCNT_dec(r->float_substr); 19116 SvREFCNT_dec(r->float_utf8); 19117 Safefree(r->substrs); 19118 } 19119 RX_MATCH_COPY_FREE(rx); 19120 #ifdef PERL_ANY_COW 19121 SvREFCNT_dec(r->saved_copy); 19122 #endif 19123 Safefree(r->offs); 19124 SvREFCNT_dec(r->qr_anoncv); 19125 if (r->recurse_locinput) 19126 Safefree(r->recurse_locinput); 19127 rx->sv_u.svu_rx = 0; 19128 } 19129 19130 /* reg_temp_copy() 19131 19132 This is a hacky workaround to the structural issue of match results 19133 being stored in the regexp structure which is in turn stored in 19134 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern 19135 could be PL_curpm in multiple contexts, and could require multiple 19136 result sets being associated with the pattern simultaneously, such 19137 as when doing a recursive match with (??{$qr}) 19138 19139 The solution is to make a lightweight copy of the regexp structure 19140 when a qr// is returned from the code executed by (??{$qr}) this 19141 lightweight copy doesn't actually own any of its data except for 19142 the starp/end and the actual regexp structure itself. 19143 19144 */ 19145 19146 19147 REGEXP * 19148 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx) 19149 { 19150 struct regexp *ret; 19151 struct regexp *const r = ReANY(rx); 19152 const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV; 19153 19154 PERL_ARGS_ASSERT_REG_TEMP_COPY; 19155 19156 if (!ret_x) 19157 ret_x = (REGEXP*) newSV_type(SVt_REGEXP); 19158 else { 19159 SvOK_off((SV *)ret_x); 19160 if (islv) { 19161 /* For PVLVs, SvANY points to the xpvlv body while sv_u points 19162 to the regexp. (For SVt_REGEXPs, sv_upgrade has already 19163 made both spots point to the same regexp body.) */ 19164 REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP); 19165 assert(!SvPVX(ret_x)); 19166 ret_x->sv_u.svu_rx = temp->sv_any; 19167 temp->sv_any = NULL; 19168 SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL; 19169 SvREFCNT_dec_NN(temp); 19170 /* SvCUR still resides in the xpvlv struct, so the regexp copy- 19171 ing below will not set it. */ 19172 SvCUR_set(ret_x, SvCUR(rx)); 19173 } 19174 } 19175 /* This ensures that SvTHINKFIRST(sv) is true, and hence that 19176 sv_force_normal(sv) is called. */ 19177 SvFAKE_on(ret_x); 19178 ret = ReANY(ret_x); 19179 19180 SvFLAGS(ret_x) |= SvUTF8(rx); 19181 /* We share the same string buffer as the original regexp, on which we 19182 hold a reference count, incremented when mother_re is set below. 19183 The string pointer is copied here, being part of the regexp struct. 19184 */ 19185 memcpy(&(ret->xpv_cur), &(r->xpv_cur), 19186 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur)); 19187 if (r->offs) { 19188 const I32 npar = r->nparens+1; 19189 Newx(ret->offs, npar, regexp_paren_pair); 19190 Copy(r->offs, ret->offs, npar, regexp_paren_pair); 19191 } 19192 if (r->substrs) { 19193 Newx(ret->substrs, 1, struct reg_substr_data); 19194 StructCopy(r->substrs, ret->substrs, struct reg_substr_data); 19195 19196 SvREFCNT_inc_void(ret->anchored_substr); 19197 SvREFCNT_inc_void(ret->anchored_utf8); 19198 SvREFCNT_inc_void(ret->float_substr); 19199 SvREFCNT_inc_void(ret->float_utf8); 19200 19201 /* check_substr and check_utf8, if non-NULL, point to either their 19202 anchored or float namesakes, and don't hold a second reference. */ 19203 } 19204 RX_MATCH_COPIED_off(ret_x); 19205 #ifdef PERL_ANY_COW 19206 ret->saved_copy = NULL; 19207 #endif 19208 ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx); 19209 SvREFCNT_inc_void(ret->qr_anoncv); 19210 if (r->recurse_locinput) 19211 Newxz(ret->recurse_locinput,r->nparens + 1,char *); 19212 19213 return ret_x; 19214 } 19215 #endif 19216 19217 /* regfree_internal() 19218 19219 Free the private data in a regexp. This is overloadable by 19220 extensions. Perl takes care of the regexp structure in pregfree(), 19221 this covers the *pprivate pointer which technically perl doesn't 19222 know about, however of course we have to handle the 19223 regexp_internal structure when no extension is in use. 19224 19225 Note this is called before freeing anything in the regexp 19226 structure. 19227 */ 19228 19229 void 19230 Perl_regfree_internal(pTHX_ REGEXP * const rx) 19231 { 19232 struct regexp *const r = ReANY(rx); 19233 RXi_GET_DECL(r,ri); 19234 GET_RE_DEBUG_FLAGS_DECL; 19235 19236 PERL_ARGS_ASSERT_REGFREE_INTERNAL; 19237 19238 DEBUG_COMPILE_r({ 19239 if (!PL_colorset) 19240 reginitcolors(); 19241 { 19242 SV *dsv= sv_newmortal(); 19243 RE_PV_QUOTED_DECL(s, RX_UTF8(rx), 19244 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60); 19245 Perl_re_printf( aTHX_ "%sFreeing REx:%s %s\n", 19246 PL_colors[4],PL_colors[5],s); 19247 } 19248 }); 19249 #ifdef RE_TRACK_PATTERN_OFFSETS 19250 if (ri->u.offsets) 19251 Safefree(ri->u.offsets); /* 20010421 MJD */ 19252 #endif 19253 if (ri->code_blocks) { 19254 int n; 19255 for (n = 0; n < ri->num_code_blocks; n++) 19256 SvREFCNT_dec(ri->code_blocks[n].src_regex); 19257 Safefree(ri->code_blocks); 19258 } 19259 19260 if (ri->data) { 19261 int n = ri->data->count; 19262 19263 while (--n >= 0) { 19264 /* If you add a ->what type here, update the comment in regcomp.h */ 19265 switch (ri->data->what[n]) { 19266 case 'a': 19267 case 'r': 19268 case 's': 19269 case 'S': 19270 case 'u': 19271 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n])); 19272 break; 19273 case 'f': 19274 Safefree(ri->data->data[n]); 19275 break; 19276 case 'l': 19277 case 'L': 19278 break; 19279 case 'T': 19280 { /* Aho Corasick add-on structure for a trie node. 19281 Used in stclass optimization only */ 19282 U32 refcount; 19283 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n]; 19284 #ifdef USE_ITHREADS 19285 dVAR; 19286 #endif 19287 OP_REFCNT_LOCK; 19288 refcount = --aho->refcount; 19289 OP_REFCNT_UNLOCK; 19290 if ( !refcount ) { 19291 PerlMemShared_free(aho->states); 19292 PerlMemShared_free(aho->fail); 19293 /* do this last!!!! */ 19294 PerlMemShared_free(ri->data->data[n]); 19295 /* we should only ever get called once, so 19296 * assert as much, and also guard the free 19297 * which /might/ happen twice. At the least 19298 * it will make code anlyzers happy and it 19299 * doesn't cost much. - Yves */ 19300 assert(ri->regstclass); 19301 if (ri->regstclass) { 19302 PerlMemShared_free(ri->regstclass); 19303 ri->regstclass = 0; 19304 } 19305 } 19306 } 19307 break; 19308 case 't': 19309 { 19310 /* trie structure. */ 19311 U32 refcount; 19312 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n]; 19313 #ifdef USE_ITHREADS 19314 dVAR; 19315 #endif 19316 OP_REFCNT_LOCK; 19317 refcount = --trie->refcount; 19318 OP_REFCNT_UNLOCK; 19319 if ( !refcount ) { 19320 PerlMemShared_free(trie->charmap); 19321 PerlMemShared_free(trie->states); 19322 PerlMemShared_free(trie->trans); 19323 if (trie->bitmap) 19324 PerlMemShared_free(trie->bitmap); 19325 if (trie->jump) 19326 PerlMemShared_free(trie->jump); 19327 PerlMemShared_free(trie->wordinfo); 19328 /* do this last!!!! */ 19329 PerlMemShared_free(ri->data->data[n]); 19330 } 19331 } 19332 break; 19333 default: 19334 Perl_croak(aTHX_ "panic: regfree data code '%c'", 19335 ri->data->what[n]); 19336 } 19337 } 19338 Safefree(ri->data->what); 19339 Safefree(ri->data); 19340 } 19341 19342 Safefree(ri); 19343 } 19344 19345 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t)) 19346 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t)) 19347 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL) 19348 19349 /* 19350 re_dup_guts - duplicate a regexp. 19351 19352 This routine is expected to clone a given regexp structure. It is only 19353 compiled under USE_ITHREADS. 19354 19355 After all of the core data stored in struct regexp is duplicated 19356 the regexp_engine.dupe method is used to copy any private data 19357 stored in the *pprivate pointer. This allows extensions to handle 19358 any duplication it needs to do. 19359 19360 See pregfree() and regfree_internal() if you change anything here. 19361 */ 19362 #if defined(USE_ITHREADS) 19363 #ifndef PERL_IN_XSUB_RE 19364 void 19365 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param) 19366 { 19367 dVAR; 19368 I32 npar; 19369 const struct regexp *r = ReANY(sstr); 19370 struct regexp *ret = ReANY(dstr); 19371 19372 PERL_ARGS_ASSERT_RE_DUP_GUTS; 19373 19374 npar = r->nparens+1; 19375 Newx(ret->offs, npar, regexp_paren_pair); 19376 Copy(r->offs, ret->offs, npar, regexp_paren_pair); 19377 19378 if (ret->substrs) { 19379 /* Do it this way to avoid reading from *r after the StructCopy(). 19380 That way, if any of the sv_dup_inc()s dislodge *r from the L1 19381 cache, it doesn't matter. */ 19382 const bool anchored = r->check_substr 19383 ? r->check_substr == r->anchored_substr 19384 : r->check_utf8 == r->anchored_utf8; 19385 Newx(ret->substrs, 1, struct reg_substr_data); 19386 StructCopy(r->substrs, ret->substrs, struct reg_substr_data); 19387 19388 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param); 19389 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param); 19390 ret->float_substr = sv_dup_inc(ret->float_substr, param); 19391 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param); 19392 19393 /* check_substr and check_utf8, if non-NULL, point to either their 19394 anchored or float namesakes, and don't hold a second reference. */ 19395 19396 if (ret->check_substr) { 19397 if (anchored) { 19398 assert(r->check_utf8 == r->anchored_utf8); 19399 ret->check_substr = ret->anchored_substr; 19400 ret->check_utf8 = ret->anchored_utf8; 19401 } else { 19402 assert(r->check_substr == r->float_substr); 19403 assert(r->check_utf8 == r->float_utf8); 19404 ret->check_substr = ret->float_substr; 19405 ret->check_utf8 = ret->float_utf8; 19406 } 19407 } else if (ret->check_utf8) { 19408 if (anchored) { 19409 ret->check_utf8 = ret->anchored_utf8; 19410 } else { 19411 ret->check_utf8 = ret->float_utf8; 19412 } 19413 } 19414 } 19415 19416 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param); 19417 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param)); 19418 if (r->recurse_locinput) 19419 Newxz(ret->recurse_locinput,r->nparens + 1,char *); 19420 19421 if (ret->pprivate) 19422 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param)); 19423 19424 if (RX_MATCH_COPIED(dstr)) 19425 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen); 19426 else 19427 ret->subbeg = NULL; 19428 #ifdef PERL_ANY_COW 19429 ret->saved_copy = NULL; 19430 #endif 19431 19432 /* Whether mother_re be set or no, we need to copy the string. We 19433 cannot refrain from copying it when the storage points directly to 19434 our mother regexp, because that's 19435 1: a buffer in a different thread 19436 2: something we no longer hold a reference on 19437 so we need to copy it locally. */ 19438 RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1); 19439 ret->mother_re = NULL; 19440 } 19441 #endif /* PERL_IN_XSUB_RE */ 19442 19443 /* 19444 regdupe_internal() 19445 19446 This is the internal complement to regdupe() which is used to copy 19447 the structure pointed to by the *pprivate pointer in the regexp. 19448 This is the core version of the extension overridable cloning hook. 19449 The regexp structure being duplicated will be copied by perl prior 19450 to this and will be provided as the regexp *r argument, however 19451 with the /old/ structures pprivate pointer value. Thus this routine 19452 may override any copying normally done by perl. 19453 19454 It returns a pointer to the new regexp_internal structure. 19455 */ 19456 19457 void * 19458 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) 19459 { 19460 dVAR; 19461 struct regexp *const r = ReANY(rx); 19462 regexp_internal *reti; 19463 int len; 19464 RXi_GET_DECL(r,ri); 19465 19466 PERL_ARGS_ASSERT_REGDUPE_INTERNAL; 19467 19468 len = ProgLen(ri); 19469 19470 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), 19471 char, regexp_internal); 19472 Copy(ri->program, reti->program, len+1, regnode); 19473 19474 19475 reti->num_code_blocks = ri->num_code_blocks; 19476 if (ri->code_blocks) { 19477 int n; 19478 Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block, 19479 struct reg_code_block); 19480 Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks, 19481 struct reg_code_block); 19482 for (n = 0; n < ri->num_code_blocks; n++) 19483 reti->code_blocks[n].src_regex = (REGEXP*) 19484 sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param); 19485 } 19486 else 19487 reti->code_blocks = NULL; 19488 19489 reti->regstclass = NULL; 19490 19491 if (ri->data) { 19492 struct reg_data *d; 19493 const int count = ri->data->count; 19494 int i; 19495 19496 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *), 19497 char, struct reg_data); 19498 Newx(d->what, count, U8); 19499 19500 d->count = count; 19501 for (i = 0; i < count; i++) { 19502 d->what[i] = ri->data->what[i]; 19503 switch (d->what[i]) { 19504 /* see also regcomp.h and regfree_internal() */ 19505 case 'a': /* actually an AV, but the dup function is identical. */ 19506 case 'r': 19507 case 's': 19508 case 'S': 19509 case 'u': /* actually an HV, but the dup function is identical. */ 19510 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param); 19511 break; 19512 case 'f': 19513 /* This is cheating. */ 19514 Newx(d->data[i], 1, regnode_ssc); 19515 StructCopy(ri->data->data[i], d->data[i], regnode_ssc); 19516 reti->regstclass = (regnode*)d->data[i]; 19517 break; 19518 case 'T': 19519 /* Trie stclasses are readonly and can thus be shared 19520 * without duplication. We free the stclass in pregfree 19521 * when the corresponding reg_ac_data struct is freed. 19522 */ 19523 reti->regstclass= ri->regstclass; 19524 /* FALLTHROUGH */ 19525 case 't': 19526 OP_REFCNT_LOCK; 19527 ((reg_trie_data*)ri->data->data[i])->refcount++; 19528 OP_REFCNT_UNLOCK; 19529 /* FALLTHROUGH */ 19530 case 'l': 19531 case 'L': 19532 d->data[i] = ri->data->data[i]; 19533 break; 19534 default: 19535 Perl_croak(aTHX_ "panic: re_dup_guts unknown data code '%c'", 19536 ri->data->what[i]); 19537 } 19538 } 19539 19540 reti->data = d; 19541 } 19542 else 19543 reti->data = NULL; 19544 19545 reti->name_list_idx = ri->name_list_idx; 19546 19547 #ifdef RE_TRACK_PATTERN_OFFSETS 19548 if (ri->u.offsets) { 19549 Newx(reti->u.offsets, 2*len+1, U32); 19550 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32); 19551 } 19552 #else 19553 SetProgLen(reti,len); 19554 #endif 19555 19556 return (void*)reti; 19557 } 19558 19559 #endif /* USE_ITHREADS */ 19560 19561 #ifndef PERL_IN_XSUB_RE 19562 19563 /* 19564 - regnext - dig the "next" pointer out of a node 19565 */ 19566 regnode * 19567 Perl_regnext(pTHX_ regnode *p) 19568 { 19569 I32 offset; 19570 19571 if (!p) 19572 return(NULL); 19573 19574 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */ 19575 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", 19576 (int)OP(p), (int)REGNODE_MAX); 19577 } 19578 19579 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p)); 19580 if (offset == 0) 19581 return(NULL); 19582 19583 return(p+offset); 19584 } 19585 #endif 19586 19587 STATIC void 19588 S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...) 19589 { 19590 va_list args; 19591 STRLEN l1 = strlen(pat1); 19592 STRLEN l2 = strlen(pat2); 19593 char buf[512]; 19594 SV *msv; 19595 const char *message; 19596 19597 PERL_ARGS_ASSERT_RE_CROAK2; 19598 19599 if (l1 > 510) 19600 l1 = 510; 19601 if (l1 + l2 > 510) 19602 l2 = 510 - l1; 19603 Copy(pat1, buf, l1 , char); 19604 Copy(pat2, buf + l1, l2 , char); 19605 buf[l1 + l2] = '\n'; 19606 buf[l1 + l2 + 1] = '\0'; 19607 va_start(args, pat2); 19608 msv = vmess(buf, &args); 19609 va_end(args); 19610 message = SvPV_const(msv,l1); 19611 if (l1 > 512) 19612 l1 = 512; 19613 Copy(message, buf, l1 , char); 19614 /* l1-1 to avoid \n */ 19615 Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf)); 19616 } 19617 19618 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */ 19619 19620 #ifndef PERL_IN_XSUB_RE 19621 void 19622 Perl_save_re_context(pTHX) 19623 { 19624 I32 nparens = -1; 19625 I32 i; 19626 19627 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */ 19628 19629 if (PL_curpm) { 19630 const REGEXP * const rx = PM_GETRE(PL_curpm); 19631 if (rx) 19632 nparens = RX_NPARENS(rx); 19633 } 19634 19635 /* RT #124109. This is a complete hack; in the SWASHNEW case we know 19636 * that PL_curpm will be null, but that utf8.pm and the modules it 19637 * loads will only use $1..$3. 19638 * The t/porting/re_context.t test file checks this assumption. 19639 */ 19640 if (nparens == -1) 19641 nparens = 3; 19642 19643 for (i = 1; i <= nparens; i++) { 19644 char digits[TYPE_CHARS(long)]; 19645 const STRLEN len = my_snprintf(digits, sizeof(digits), 19646 "%lu", (long)i); 19647 GV *const *const gvp 19648 = (GV**)hv_fetch(PL_defstash, digits, len, 0); 19649 19650 if (gvp) { 19651 GV * const gv = *gvp; 19652 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv)) 19653 save_scalar(gv); 19654 } 19655 } 19656 } 19657 #endif 19658 19659 #ifdef DEBUGGING 19660 19661 STATIC void 19662 S_put_code_point(pTHX_ SV *sv, UV c) 19663 { 19664 PERL_ARGS_ASSERT_PUT_CODE_POINT; 19665 19666 if (c > 255) { 19667 Perl_sv_catpvf(aTHX_ sv, "\\x{%04"UVXf"}", c); 19668 } 19669 else if (isPRINT(c)) { 19670 const char string = (char) c; 19671 19672 /* We use {phrase} as metanotation in the class, so also escape literal 19673 * braces */ 19674 if (isBACKSLASHED_PUNCT(c) || c == '{' || c == '}') 19675 sv_catpvs(sv, "\\"); 19676 sv_catpvn(sv, &string, 1); 19677 } 19678 else if (isMNEMONIC_CNTRL(c)) { 19679 Perl_sv_catpvf(aTHX_ sv, "%s", cntrl_to_mnemonic((U8) c)); 19680 } 19681 else { 19682 Perl_sv_catpvf(aTHX_ sv, "\\x%02X", (U8) c); 19683 } 19684 } 19685 19686 #define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C 19687 19688 STATIC void 19689 S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals) 19690 { 19691 /* Appends to 'sv' a displayable version of the range of code points from 19692 * 'start' to 'end'. Mnemonics (like '\r') are used for the few controls 19693 * that have them, when they occur at the beginning or end of the range. 19694 * It uses hex to output the remaining code points, unless 'allow_literals' 19695 * is true, in which case the printable ASCII ones are output as-is (though 19696 * some of these will be escaped by put_code_point()). 19697 * 19698 * NOTE: This is designed only for printing ranges of code points that fit 19699 * inside an ANYOF bitmap. Higher code points are simply suppressed 19700 */ 19701 19702 const unsigned int min_range_count = 3; 19703 19704 assert(start <= end); 19705 19706 PERL_ARGS_ASSERT_PUT_RANGE; 19707 19708 while (start <= end) { 19709 UV this_end; 19710 const char * format; 19711 19712 if (end - start < min_range_count) { 19713 19714 /* Output chars individually when they occur in short ranges */ 19715 for (; start <= end; start++) { 19716 put_code_point(sv, start); 19717 } 19718 break; 19719 } 19720 19721 /* If permitted by the input options, and there is a possibility that 19722 * this range contains a printable literal, look to see if there is 19723 * one. */ 19724 if (allow_literals && start <= MAX_PRINT_A) { 19725 19726 /* If the character at the beginning of the range isn't an ASCII 19727 * printable, effectively split the range into two parts: 19728 * 1) the portion before the first such printable, 19729 * 2) the rest 19730 * and output them separately. */ 19731 if (! isPRINT_A(start)) { 19732 UV temp_end = start + 1; 19733 19734 /* There is no point looking beyond the final possible 19735 * printable, in MAX_PRINT_A */ 19736 UV max = MIN(end, MAX_PRINT_A); 19737 19738 while (temp_end <= max && ! isPRINT_A(temp_end)) { 19739 temp_end++; 19740 } 19741 19742 /* Here, temp_end points to one beyond the first printable if 19743 * found, or to one beyond 'max' if not. If none found, make 19744 * sure that we use the entire range */ 19745 if (temp_end > MAX_PRINT_A) { 19746 temp_end = end + 1; 19747 } 19748 19749 /* Output the first part of the split range: the part that 19750 * doesn't have printables, with the parameter set to not look 19751 * for literals (otherwise we would infinitely recurse) */ 19752 put_range(sv, start, temp_end - 1, FALSE); 19753 19754 /* The 2nd part of the range (if any) starts here. */ 19755 start = temp_end; 19756 19757 /* We do a continue, instead of dropping down, because even if 19758 * the 2nd part is non-empty, it could be so short that we want 19759 * to output it as individual characters, as tested for at the 19760 * top of this loop. */ 19761 continue; 19762 } 19763 19764 /* Here, 'start' is a printable ASCII. If it is an alphanumeric, 19765 * output a sub-range of just the digits or letters, then process 19766 * the remaining portion as usual. */ 19767 if (isALPHANUMERIC_A(start)) { 19768 UV mask = (isDIGIT_A(start)) 19769 ? _CC_DIGIT 19770 : isUPPER_A(start) 19771 ? _CC_UPPER 19772 : _CC_LOWER; 19773 UV temp_end = start + 1; 19774 19775 /* Find the end of the sub-range that includes just the 19776 * characters in the same class as the first character in it */ 19777 while (temp_end <= end && _generic_isCC_A(temp_end, mask)) { 19778 temp_end++; 19779 } 19780 temp_end--; 19781 19782 /* For short ranges, don't duplicate the code above to output 19783 * them; just call recursively */ 19784 if (temp_end - start < min_range_count) { 19785 put_range(sv, start, temp_end, FALSE); 19786 } 19787 else { /* Output as a range */ 19788 put_code_point(sv, start); 19789 sv_catpvs(sv, "-"); 19790 put_code_point(sv, temp_end); 19791 } 19792 start = temp_end + 1; 19793 continue; 19794 } 19795 19796 /* We output any other printables as individual characters */ 19797 if (isPUNCT_A(start) || isSPACE_A(start)) { 19798 while (start <= end && (isPUNCT_A(start) 19799 || isSPACE_A(start))) 19800 { 19801 put_code_point(sv, start); 19802 start++; 19803 } 19804 continue; 19805 } 19806 } /* End of looking for literals */ 19807 19808 /* Here is not to output as a literal. Some control characters have 19809 * mnemonic names. Split off any of those at the beginning and end of 19810 * the range to print mnemonically. It isn't possible for many of 19811 * these to be in a row, so this won't overwhelm with output */ 19812 while (isMNEMONIC_CNTRL(start) && start <= end) { 19813 put_code_point(sv, start); 19814 start++; 19815 } 19816 if (start < end && isMNEMONIC_CNTRL(end)) { 19817 19818 /* Here, the final character in the range has a mnemonic name. 19819 * Work backwards from the end to find the final non-mnemonic */ 19820 UV temp_end = end - 1; 19821 while (isMNEMONIC_CNTRL(temp_end)) { 19822 temp_end--; 19823 } 19824 19825 /* And separately output the interior range that doesn't start or 19826 * end with mnemonics */ 19827 put_range(sv, start, temp_end, FALSE); 19828 19829 /* Then output the mnemonic trailing controls */ 19830 start = temp_end + 1; 19831 while (start <= end) { 19832 put_code_point(sv, start); 19833 start++; 19834 } 19835 break; 19836 } 19837 19838 /* As a final resort, output the range or subrange as hex. */ 19839 19840 this_end = (end < NUM_ANYOF_CODE_POINTS) 19841 ? end 19842 : NUM_ANYOF_CODE_POINTS - 1; 19843 #if NUM_ANYOF_CODE_POINTS > 256 19844 format = (this_end < 256) 19845 ? "\\x%02"UVXf"-\\x%02"UVXf"" 19846 : "\\x{%04"UVXf"}-\\x{%04"UVXf"}"; 19847 #else 19848 format = "\\x%02"UVXf"-\\x%02"UVXf""; 19849 #endif 19850 GCC_DIAG_IGNORE(-Wformat-nonliteral); 19851 Perl_sv_catpvf(aTHX_ sv, format, start, this_end); 19852 GCC_DIAG_RESTORE; 19853 break; 19854 } 19855 } 19856 19857 STATIC void 19858 S_put_charclass_bitmap_innards_invlist(pTHX_ SV *sv, SV* invlist) 19859 { 19860 /* Concatenate onto the PV in 'sv' a displayable form of the inversion list 19861 * 'invlist' */ 19862 19863 UV start, end; 19864 bool allow_literals = TRUE; 19865 19866 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_INVLIST; 19867 19868 /* Generally, it is more readable if printable characters are output as 19869 * literals, but if a range (nearly) spans all of them, it's best to output 19870 * it as a single range. This code will use a single range if all but 2 19871 * ASCII printables are in it */ 19872 invlist_iterinit(invlist); 19873 while (invlist_iternext(invlist, &start, &end)) { 19874 19875 /* If the range starts beyond the final printable, it doesn't have any 19876 * in it */ 19877 if (start > MAX_PRINT_A) { 19878 break; 19879 } 19880 19881 /* In both ASCII and EBCDIC, a SPACE is the lowest printable. To span 19882 * all but two, the range must start and end no later than 2 from 19883 * either end */ 19884 if (start < ' ' + 2 && end > MAX_PRINT_A - 2) { 19885 if (end > MAX_PRINT_A) { 19886 end = MAX_PRINT_A; 19887 } 19888 if (start < ' ') { 19889 start = ' '; 19890 } 19891 if (end - start >= MAX_PRINT_A - ' ' - 2) { 19892 allow_literals = FALSE; 19893 } 19894 break; 19895 } 19896 } 19897 invlist_iterfinish(invlist); 19898 19899 /* Here we have figured things out. Output each range */ 19900 invlist_iterinit(invlist); 19901 while (invlist_iternext(invlist, &start, &end)) { 19902 if (start >= NUM_ANYOF_CODE_POINTS) { 19903 break; 19904 } 19905 put_range(sv, start, end, allow_literals); 19906 } 19907 invlist_iterfinish(invlist); 19908 19909 return; 19910 } 19911 19912 STATIC SV* 19913 S_put_charclass_bitmap_innards_common(pTHX_ 19914 SV* invlist, /* The bitmap */ 19915 SV* posixes, /* Under /l, things like [:word:], \S */ 19916 SV* only_utf8, /* Under /d, matches iff the target is UTF-8 */ 19917 SV* not_utf8, /* /d, matches iff the target isn't UTF-8 */ 19918 SV* only_utf8_locale, /* Under /l, matches if the locale is UTF-8 */ 19919 const bool invert /* Is the result to be inverted? */ 19920 ) 19921 { 19922 /* Create and return an SV containing a displayable version of the bitmap 19923 * and associated information determined by the input parameters. If the 19924 * output would have been only the inversion indicator '^', NULL is instead 19925 * returned. */ 19926 19927 SV * output; 19928 19929 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_COMMON; 19930 19931 if (invert) { 19932 output = newSVpvs("^"); 19933 } 19934 else { 19935 output = newSVpvs(""); 19936 } 19937 19938 /* First, the code points in the bitmap that are unconditionally there */ 19939 put_charclass_bitmap_innards_invlist(output, invlist); 19940 19941 /* Traditionally, these have been placed after the main code points */ 19942 if (posixes) { 19943 sv_catsv(output, posixes); 19944 } 19945 19946 if (only_utf8 && _invlist_len(only_utf8)) { 19947 Perl_sv_catpvf(aTHX_ output, "%s{utf8}%s", PL_colors[1], PL_colors[0]); 19948 put_charclass_bitmap_innards_invlist(output, only_utf8); 19949 } 19950 19951 if (not_utf8 && _invlist_len(not_utf8)) { 19952 Perl_sv_catpvf(aTHX_ output, "%s{not utf8}%s", PL_colors[1], PL_colors[0]); 19953 put_charclass_bitmap_innards_invlist(output, not_utf8); 19954 } 19955 19956 if (only_utf8_locale && _invlist_len(only_utf8_locale)) { 19957 Perl_sv_catpvf(aTHX_ output, "%s{utf8 locale}%s", PL_colors[1], PL_colors[0]); 19958 put_charclass_bitmap_innards_invlist(output, only_utf8_locale); 19959 19960 /* This is the only list in this routine that can legally contain code 19961 * points outside the bitmap range. The call just above to 19962 * 'put_charclass_bitmap_innards_invlist' will simply suppress them, so 19963 * output them here. There's about a half-dozen possible, and none in 19964 * contiguous ranges longer than 2 */ 19965 if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) { 19966 UV start, end; 19967 SV* above_bitmap = NULL; 19968 19969 _invlist_subtract(only_utf8_locale, PL_InBitmap, &above_bitmap); 19970 19971 invlist_iterinit(above_bitmap); 19972 while (invlist_iternext(above_bitmap, &start, &end)) { 19973 UV i; 19974 19975 for (i = start; i <= end; i++) { 19976 put_code_point(output, i); 19977 } 19978 } 19979 invlist_iterfinish(above_bitmap); 19980 SvREFCNT_dec_NN(above_bitmap); 19981 } 19982 } 19983 19984 if (invert && SvCUR(output) == 1) { 19985 return NULL; 19986 } 19987 19988 return output; 19989 } 19990 19991 STATIC bool 19992 S_put_charclass_bitmap_innards(pTHX_ SV *sv, 19993 char *bitmap, 19994 SV *nonbitmap_invlist, 19995 SV *only_utf8_locale_invlist, 19996 const regnode * const node, 19997 const bool force_as_is_display) 19998 { 19999 /* Appends to 'sv' a displayable version of the innards of the bracketed 20000 * character class defined by the other arguments: 20001 * 'bitmap' points to the bitmap. 20002 * 'nonbitmap_invlist' is an inversion list of the code points that are in 20003 * the bitmap range, but for some reason aren't in the bitmap; NULL if 20004 * none. The reasons for this could be that they require some 20005 * condition such as the target string being or not being in UTF-8 20006 * (under /d), or because they came from a user-defined property that 20007 * was not resolved at the time of the regex compilation (under /u) 20008 * 'only_utf8_locale_invlist' is an inversion list of the code points that 20009 * are valid only if the runtime locale is a UTF-8 one; NULL if none 20010 * 'node' is the regex pattern node. It is needed only when the above two 20011 * parameters are not null, and is passed so that this routine can 20012 * tease apart the various reasons for them. 20013 * 'force_as_is_display' is TRUE if this routine should definitely NOT try 20014 * to invert things to see if that leads to a cleaner display. If 20015 * FALSE, this routine is free to use its judgment about doing this. 20016 * 20017 * It returns TRUE if there was actually something output. (It may be that 20018 * the bitmap, etc is empty.) 20019 * 20020 * When called for outputting the bitmap of a non-ANYOF node, just pass the 20021 * bitmap, with the succeeding parameters set to NULL, and the final one to 20022 * FALSE. 20023 */ 20024 20025 /* In general, it tries to display the 'cleanest' representation of the 20026 * innards, choosing whether to display them inverted or not, regardless of 20027 * whether the class itself is to be inverted. However, there are some 20028 * cases where it can't try inverting, as what actually matches isn't known 20029 * until runtime, and hence the inversion isn't either. */ 20030 bool inverting_allowed = ! force_as_is_display; 20031 20032 int i; 20033 STRLEN orig_sv_cur = SvCUR(sv); 20034 20035 SV* invlist; /* Inversion list we accumulate of code points that 20036 are unconditionally matched */ 20037 SV* only_utf8 = NULL; /* Under /d, list of matches iff the target is 20038 UTF-8 */ 20039 SV* not_utf8 = NULL; /* /d, list of matches iff the target isn't UTF-8 20040 */ 20041 SV* posixes = NULL; /* Under /l, string of things like [:word:], \D */ 20042 SV* only_utf8_locale = NULL; /* Under /l, list of matches if the locale 20043 is UTF-8 */ 20044 20045 SV* as_is_display; /* The output string when we take the inputs 20046 literally */ 20047 SV* inverted_display; /* The output string when we invert the inputs */ 20048 20049 U8 flags = (node) ? ANYOF_FLAGS(node) : 0; 20050 20051 bool invert = cBOOL(flags & ANYOF_INVERT); /* Is the input to be inverted 20052 to match? */ 20053 /* We are biased in favor of displaying things without them being inverted, 20054 * as that is generally easier to understand */ 20055 const int bias = 5; 20056 20057 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS; 20058 20059 /* Start off with whatever code points are passed in. (We clone, so we 20060 * don't change the caller's list) */ 20061 if (nonbitmap_invlist) { 20062 assert(invlist_highest(nonbitmap_invlist) < NUM_ANYOF_CODE_POINTS); 20063 invlist = invlist_clone(nonbitmap_invlist); 20064 } 20065 else { /* Worst case size is every other code point is matched */ 20066 invlist = _new_invlist(NUM_ANYOF_CODE_POINTS / 2); 20067 } 20068 20069 if (flags) { 20070 if (OP(node) == ANYOFD) { 20071 20072 /* This flag indicates that the code points below 0x100 in the 20073 * nonbitmap list are precisely the ones that match only when the 20074 * target is UTF-8 (they should all be non-ASCII). */ 20075 if (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP) 20076 { 20077 _invlist_intersection(invlist, PL_UpperLatin1, &only_utf8); 20078 _invlist_subtract(invlist, only_utf8, &invlist); 20079 } 20080 20081 /* And this flag for matching all non-ASCII 0xFF and below */ 20082 if (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER) 20083 { 20084 not_utf8 = invlist_clone(PL_UpperLatin1); 20085 } 20086 } 20087 else if (OP(node) == ANYOFL) { 20088 20089 /* If either of these flags are set, what matches isn't 20090 * determinable except during execution, so don't know enough here 20091 * to invert */ 20092 if (flags & (ANYOFL_FOLD|ANYOF_MATCHES_POSIXL)) { 20093 inverting_allowed = FALSE; 20094 } 20095 20096 /* What the posix classes match also varies at runtime, so these 20097 * will be output symbolically. */ 20098 if (ANYOF_POSIXL_TEST_ANY_SET(node)) { 20099 int i; 20100 20101 posixes = newSVpvs(""); 20102 for (i = 0; i < ANYOF_POSIXL_MAX; i++) { 20103 if (ANYOF_POSIXL_TEST(node,i)) { 20104 sv_catpv(posixes, anyofs[i]); 20105 } 20106 } 20107 } 20108 } 20109 } 20110 20111 /* Accumulate the bit map into the unconditional match list */ 20112 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) { 20113 if (BITMAP_TEST(bitmap, i)) { 20114 int start = i++; 20115 for (; i < NUM_ANYOF_CODE_POINTS && BITMAP_TEST(bitmap, i); i++) { 20116 /* empty */ 20117 } 20118 invlist = _add_range_to_invlist(invlist, start, i-1); 20119 } 20120 } 20121 20122 /* Make sure that the conditional match lists don't have anything in them 20123 * that match unconditionally; otherwise the output is quite confusing. 20124 * This could happen if the code that populates these misses some 20125 * duplication. */ 20126 if (only_utf8) { 20127 _invlist_subtract(only_utf8, invlist, &only_utf8); 20128 } 20129 if (not_utf8) { 20130 _invlist_subtract(not_utf8, invlist, ¬_utf8); 20131 } 20132 20133 if (only_utf8_locale_invlist) { 20134 20135 /* Since this list is passed in, we have to make a copy before 20136 * modifying it */ 20137 only_utf8_locale = invlist_clone(only_utf8_locale_invlist); 20138 20139 _invlist_subtract(only_utf8_locale, invlist, &only_utf8_locale); 20140 20141 /* And, it can get really weird for us to try outputting an inverted 20142 * form of this list when it has things above the bitmap, so don't even 20143 * try */ 20144 if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) { 20145 inverting_allowed = FALSE; 20146 } 20147 } 20148 20149 /* Calculate what the output would be if we take the input as-is */ 20150 as_is_display = put_charclass_bitmap_innards_common(invlist, 20151 posixes, 20152 only_utf8, 20153 not_utf8, 20154 only_utf8_locale, 20155 invert); 20156 20157 /* If have to take the output as-is, just do that */ 20158 if (! inverting_allowed) { 20159 if (as_is_display) { 20160 sv_catsv(sv, as_is_display); 20161 SvREFCNT_dec_NN(as_is_display); 20162 } 20163 } 20164 else { /* But otherwise, create the output again on the inverted input, and 20165 use whichever version is shorter */ 20166 20167 int inverted_bias, as_is_bias; 20168 20169 /* We will apply our bias to whichever of the the results doesn't have 20170 * the '^' */ 20171 if (invert) { 20172 invert = FALSE; 20173 as_is_bias = bias; 20174 inverted_bias = 0; 20175 } 20176 else { 20177 invert = TRUE; 20178 as_is_bias = 0; 20179 inverted_bias = bias; 20180 } 20181 20182 /* Now invert each of the lists that contribute to the output, 20183 * excluding from the result things outside the possible range */ 20184 20185 /* For the unconditional inversion list, we have to add in all the 20186 * conditional code points, so that when inverted, they will be gone 20187 * from it */ 20188 _invlist_union(only_utf8, invlist, &invlist); 20189 _invlist_union(not_utf8, invlist, &invlist); 20190 _invlist_union(only_utf8_locale, invlist, &invlist); 20191 _invlist_invert(invlist); 20192 _invlist_intersection(invlist, PL_InBitmap, &invlist); 20193 20194 if (only_utf8) { 20195 _invlist_invert(only_utf8); 20196 _invlist_intersection(only_utf8, PL_UpperLatin1, &only_utf8); 20197 } 20198 20199 if (not_utf8) { 20200 _invlist_invert(not_utf8); 20201 _invlist_intersection(not_utf8, PL_UpperLatin1, ¬_utf8); 20202 } 20203 20204 if (only_utf8_locale) { 20205 _invlist_invert(only_utf8_locale); 20206 _invlist_intersection(only_utf8_locale, 20207 PL_InBitmap, 20208 &only_utf8_locale); 20209 } 20210 20211 inverted_display = put_charclass_bitmap_innards_common( 20212 invlist, 20213 posixes, 20214 only_utf8, 20215 not_utf8, 20216 only_utf8_locale, invert); 20217 20218 /* Use the shortest representation, taking into account our bias 20219 * against showing it inverted */ 20220 if ( inverted_display 20221 && ( ! as_is_display 20222 || ( SvCUR(inverted_display) + inverted_bias 20223 < SvCUR(as_is_display) + as_is_bias))) 20224 { 20225 sv_catsv(sv, inverted_display); 20226 } 20227 else if (as_is_display) { 20228 sv_catsv(sv, as_is_display); 20229 } 20230 20231 SvREFCNT_dec(as_is_display); 20232 SvREFCNT_dec(inverted_display); 20233 } 20234 20235 SvREFCNT_dec_NN(invlist); 20236 SvREFCNT_dec(only_utf8); 20237 SvREFCNT_dec(not_utf8); 20238 SvREFCNT_dec(posixes); 20239 SvREFCNT_dec(only_utf8_locale); 20240 20241 return SvCUR(sv) > orig_sv_cur; 20242 } 20243 20244 #define CLEAR_OPTSTART \ 20245 if (optstart) STMT_START { \ 20246 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ \ 20247 " (%"IVdf" nodes)\n", (IV)(node - optstart))); \ 20248 optstart=NULL; \ 20249 } STMT_END 20250 20251 #define DUMPUNTIL(b,e) \ 20252 CLEAR_OPTSTART; \ 20253 node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1); 20254 20255 STATIC const regnode * 20256 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, 20257 const regnode *last, const regnode *plast, 20258 SV* sv, I32 indent, U32 depth) 20259 { 20260 U8 op = PSEUDO; /* Arbitrary non-END op. */ 20261 const regnode *next; 20262 const regnode *optstart= NULL; 20263 20264 RXi_GET_DECL(r,ri); 20265 GET_RE_DEBUG_FLAGS_DECL; 20266 20267 PERL_ARGS_ASSERT_DUMPUNTIL; 20268 20269 #ifdef DEBUG_DUMPUNTIL 20270 Perl_re_printf( aTHX_ "--- %d : %d - %d - %d\n",indent,node-start, 20271 last ? last-start : 0,plast ? plast-start : 0); 20272 #endif 20273 20274 if (plast && plast < last) 20275 last= plast; 20276 20277 while (PL_regkind[op] != END && (!last || node < last)) { 20278 assert(node); 20279 /* While that wasn't END last time... */ 20280 NODE_ALIGN(node); 20281 op = OP(node); 20282 if (op == CLOSE || op == WHILEM) 20283 indent--; 20284 next = regnext((regnode *)node); 20285 20286 /* Where, what. */ 20287 if (OP(node) == OPTIMIZED) { 20288 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE)) 20289 optstart = node; 20290 else 20291 goto after_print; 20292 } else 20293 CLEAR_OPTSTART; 20294 20295 regprop(r, sv, node, NULL, NULL); 20296 Perl_re_printf( aTHX_ "%4"IVdf":%*s%s", (IV)(node - start), 20297 (int)(2*indent + 1), "", SvPVX_const(sv)); 20298 20299 if (OP(node) != OPTIMIZED) { 20300 if (next == NULL) /* Next ptr. */ 20301 Perl_re_printf( aTHX_ " (0)"); 20302 else if (PL_regkind[(U8)op] == BRANCH 20303 && PL_regkind[OP(next)] != BRANCH ) 20304 Perl_re_printf( aTHX_ " (FAIL)"); 20305 else 20306 Perl_re_printf( aTHX_ " (%"IVdf")", (IV)(next - start)); 20307 Perl_re_printf( aTHX_ "\n"); 20308 } 20309 20310 after_print: 20311 if (PL_regkind[(U8)op] == BRANCHJ) { 20312 assert(next); 20313 { 20314 const regnode *nnode = (OP(next) == LONGJMP 20315 ? regnext((regnode *)next) 20316 : next); 20317 if (last && nnode > last) 20318 nnode = last; 20319 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode); 20320 } 20321 } 20322 else if (PL_regkind[(U8)op] == BRANCH) { 20323 assert(next); 20324 DUMPUNTIL(NEXTOPER(node), next); 20325 } 20326 else if ( PL_regkind[(U8)op] == TRIE ) { 20327 const regnode *this_trie = node; 20328 const char op = OP(node); 20329 const U32 n = ARG(node); 20330 const reg_ac_data * const ac = op>=AHOCORASICK ? 20331 (reg_ac_data *)ri->data->data[n] : 20332 NULL; 20333 const reg_trie_data * const trie = 20334 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie]; 20335 #ifdef DEBUGGING 20336 AV *const trie_words 20337 = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]); 20338 #endif 20339 const regnode *nextbranch= NULL; 20340 I32 word_idx; 20341 sv_setpvs(sv, ""); 20342 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) { 20343 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0); 20344 20345 Perl_re_indentf( aTHX_ "%s ", 20346 indent+3, 20347 elem_ptr 20348 ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), 20349 SvCUR(*elem_ptr), 60, 20350 PL_colors[0], PL_colors[1], 20351 (SvUTF8(*elem_ptr) 20352 ? PERL_PV_ESCAPE_UNI 20353 : 0) 20354 | PERL_PV_PRETTY_ELLIPSES 20355 | PERL_PV_PRETTY_LTGT 20356 ) 20357 : "???" 20358 ); 20359 if (trie->jump) { 20360 U16 dist= trie->jump[word_idx+1]; 20361 Perl_re_printf( aTHX_ "(%"UVuf")\n", 20362 (UV)((dist ? this_trie + dist : next) - start)); 20363 if (dist) { 20364 if (!nextbranch) 20365 nextbranch= this_trie + trie->jump[0]; 20366 DUMPUNTIL(this_trie + dist, nextbranch); 20367 } 20368 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH) 20369 nextbranch= regnext((regnode *)nextbranch); 20370 } else { 20371 Perl_re_printf( aTHX_ "\n"); 20372 } 20373 } 20374 if (last && next > last) 20375 node= last; 20376 else 20377 node= next; 20378 } 20379 else if ( op == CURLY ) { /* "next" might be very big: optimizer */ 20380 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, 20381 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1); 20382 } 20383 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) { 20384 assert(next); 20385 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next); 20386 } 20387 else if ( op == PLUS || op == STAR) { 20388 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1); 20389 } 20390 else if (PL_regkind[(U8)op] == ANYOF) { 20391 /* arglen 1 + class block */ 20392 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_MATCHES_POSIXL) 20393 ? ANYOF_POSIXL_SKIP 20394 : ANYOF_SKIP); 20395 node = NEXTOPER(node); 20396 } 20397 else if (PL_regkind[(U8)op] == EXACT) { 20398 /* Literal string, where present. */ 20399 node += NODE_SZ_STR(node) - 1; 20400 node = NEXTOPER(node); 20401 } 20402 else { 20403 node = NEXTOPER(node); 20404 node += regarglen[(U8)op]; 20405 } 20406 if (op == CURLYX || op == OPEN) 20407 indent++; 20408 } 20409 CLEAR_OPTSTART; 20410 #ifdef DEBUG_DUMPUNTIL 20411 Perl_re_printf( aTHX_ "--- %d\n", (int)indent); 20412 #endif 20413 return node; 20414 } 20415 20416 #endif /* DEBUGGING */ 20417 20418 /* 20419 * ex: set ts=8 sts=4 sw=4 et: 20420 */ 20421