1 /* regcomp.c 2 */ 3 4 /* 5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee 6 * 7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"] 8 */ 9 10 /* This file contains functions for compiling a regular expression. See 11 * also regexec.c which funnily enough, contains functions for executing 12 * a regular expression. 13 * 14 * This file is also copied at build time to ext/re/re_comp.c, where 15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT. 16 * This causes the main functions to be compiled under new names and with 17 * debugging support added, which makes "use re 'debug'" work. 18 */ 19 20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not 21 * confused with the original package (see point 3 below). Thanks, Henry! 22 */ 23 24 /* Additional note: this code is very heavily munged from Henry's version 25 * in places. In some spots I've traded clarity for efficiency, so don't 26 * blame Henry for some of the lack of readability. 27 */ 28 29 /* The names of the functions have been changed from regcomp and 30 * regexec to pregcomp and pregexec in order to avoid conflicts 31 * with the POSIX routines of the same names. 32 */ 33 34 #ifdef PERL_EXT_RE_BUILD 35 #include "re_top.h" 36 #endif 37 38 /* 39 * pregcomp and pregexec -- regsub and regerror are not used in perl 40 * 41 * Copyright (c) 1986 by University of Toronto. 42 * Written by Henry Spencer. Not derived from licensed software. 43 * 44 * Permission is granted to anyone to use this software for any 45 * purpose on any computer system, and to redistribute it freely, 46 * subject to the following restrictions: 47 * 48 * 1. The author is not responsible for the consequences of use of 49 * this software, no matter how awful, even if they arise 50 * from defects in it. 51 * 52 * 2. The origin of this software must not be misrepresented, either 53 * by explicit claim or by omission. 54 * 55 * 3. Altered versions must be plainly marked as such, and must not 56 * be misrepresented as being the original software. 57 * 58 * 59 **** Alterations to Henry's code are... 60 **** 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 63 **** by Larry Wall and others 64 **** 65 **** You may distribute under the terms of either the GNU General Public 66 **** License or the Artistic License, as specified in the README file. 67 68 * 69 * Beware that some of this code is subtly aware of the way operator 70 * precedence is structured in regular expressions. Serious changes in 71 * regular-expression syntax might require a total rethink. 72 */ 73 #include "EXTERN.h" 74 #define PERL_IN_REGCOMP_C 75 #include "perl.h" 76 77 #ifndef PERL_IN_XSUB_RE 78 # include "INTERN.h" 79 #endif 80 81 #define REG_COMP_C 82 #ifdef PERL_IN_XSUB_RE 83 # include "re_comp.h" 84 #else 85 # include "regcomp.h" 86 #endif 87 88 #ifdef op 89 #undef op 90 #endif /* op */ 91 92 #ifdef MSDOS 93 # if defined(BUGGY_MSC6) 94 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */ 95 # pragma optimize("a",off) 96 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/ 97 # pragma optimize("w",on ) 98 # endif /* BUGGY_MSC6 */ 99 #endif /* MSDOS */ 100 101 #ifndef STATIC 102 #define STATIC static 103 #endif 104 105 typedef struct RExC_state_t { 106 U32 flags; /* are we folding, multilining? */ 107 char *precomp; /* uncompiled string. */ 108 REGEXP *rx_sv; /* The SV that is the regexp. */ 109 regexp *rx; /* perl core regexp structure */ 110 regexp_internal *rxi; /* internal data for regexp object pprivate field */ 111 char *start; /* Start of input for compile */ 112 char *end; /* End of input for compile */ 113 char *parse; /* Input-scan pointer. */ 114 I32 whilem_seen; /* number of WHILEM in this expr */ 115 regnode *emit_start; /* Start of emitted-code area */ 116 regnode *emit_bound; /* First regnode outside of the allocated space */ 117 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */ 118 I32 naughty; /* How bad is this pattern? */ 119 I32 sawback; /* Did we see \1, ...? */ 120 U32 seen; 121 I32 size; /* Code size. */ 122 I32 npar; /* Capture buffer count, (OPEN). */ 123 I32 cpar; /* Capture buffer count, (CLOSE). */ 124 I32 nestroot; /* root parens we are in - used by accept */ 125 I32 extralen; 126 I32 seen_zerolen; 127 I32 seen_evals; 128 regnode **open_parens; /* pointers to open parens */ 129 regnode **close_parens; /* pointers to close parens */ 130 regnode *opend; /* END node in program */ 131 I32 utf8; /* whether the pattern is utf8 or not */ 132 I32 orig_utf8; /* whether the pattern was originally in utf8 */ 133 /* XXX use this for future optimisation of case 134 * where pattern must be upgraded to utf8. */ 135 HV *paren_names; /* Paren names */ 136 137 regnode **recurse; /* Recurse regops */ 138 I32 recurse_count; /* Number of recurse regops */ 139 #if ADD_TO_REGEXEC 140 char *starttry; /* -Dr: where regtry was called. */ 141 #define RExC_starttry (pRExC_state->starttry) 142 #endif 143 #ifdef DEBUGGING 144 const char *lastparse; 145 I32 lastnum; 146 AV *paren_name_list; /* idx -> name */ 147 #define RExC_lastparse (pRExC_state->lastparse) 148 #define RExC_lastnum (pRExC_state->lastnum) 149 #define RExC_paren_name_list (pRExC_state->paren_name_list) 150 #endif 151 } RExC_state_t; 152 153 #define RExC_flags (pRExC_state->flags) 154 #define RExC_precomp (pRExC_state->precomp) 155 #define RExC_rx_sv (pRExC_state->rx_sv) 156 #define RExC_rx (pRExC_state->rx) 157 #define RExC_rxi (pRExC_state->rxi) 158 #define RExC_start (pRExC_state->start) 159 #define RExC_end (pRExC_state->end) 160 #define RExC_parse (pRExC_state->parse) 161 #define RExC_whilem_seen (pRExC_state->whilem_seen) 162 #ifdef RE_TRACK_PATTERN_OFFSETS 163 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */ 164 #endif 165 #define RExC_emit (pRExC_state->emit) 166 #define RExC_emit_start (pRExC_state->emit_start) 167 #define RExC_emit_bound (pRExC_state->emit_bound) 168 #define RExC_naughty (pRExC_state->naughty) 169 #define RExC_sawback (pRExC_state->sawback) 170 #define RExC_seen (pRExC_state->seen) 171 #define RExC_size (pRExC_state->size) 172 #define RExC_npar (pRExC_state->npar) 173 #define RExC_nestroot (pRExC_state->nestroot) 174 #define RExC_extralen (pRExC_state->extralen) 175 #define RExC_seen_zerolen (pRExC_state->seen_zerolen) 176 #define RExC_seen_evals (pRExC_state->seen_evals) 177 #define RExC_utf8 (pRExC_state->utf8) 178 #define RExC_orig_utf8 (pRExC_state->orig_utf8) 179 #define RExC_open_parens (pRExC_state->open_parens) 180 #define RExC_close_parens (pRExC_state->close_parens) 181 #define RExC_opend (pRExC_state->opend) 182 #define RExC_paren_names (pRExC_state->paren_names) 183 #define RExC_recurse (pRExC_state->recurse) 184 #define RExC_recurse_count (pRExC_state->recurse_count) 185 186 187 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?') 188 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \ 189 ((*s) == '{' && regcurly(s))) 190 191 #ifdef SPSTART 192 #undef SPSTART /* dratted cpp namespace... */ 193 #endif 194 /* 195 * Flags to be passed up and down. 196 */ 197 #define WORST 0 /* Worst case. */ 198 #define HASWIDTH 0x01 /* Known to match non-null strings. */ 199 #define SIMPLE 0x02 /* Simple enough to be STAR/PLUS operand. */ 200 #define SPSTART 0x04 /* Starts with * or +. */ 201 #define TRYAGAIN 0x08 /* Weeded out a declaration. */ 202 #define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */ 203 204 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1) 205 206 /* whether trie related optimizations are enabled */ 207 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION 208 #define TRIE_STUDY_OPT 209 #define FULL_TRIE_STUDY 210 #define TRIE_STCLASS 211 #endif 212 213 214 215 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3] 216 #define PBITVAL(paren) (1 << ((paren) & 7)) 217 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren)) 218 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren) 219 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren)) 220 221 222 /* About scan_data_t. 223 224 During optimisation we recurse through the regexp program performing 225 various inplace (keyhole style) optimisations. In addition study_chunk 226 and scan_commit populate this data structure with information about 227 what strings MUST appear in the pattern. We look for the longest 228 string that must appear for at a fixed location, and we look for the 229 longest string that may appear at a floating location. So for instance 230 in the pattern: 231 232 /FOO[xX]A.*B[xX]BAR/ 233 234 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating 235 strings (because they follow a .* construct). study_chunk will identify 236 both FOO and BAR as being the longest fixed and floating strings respectively. 237 238 The strings can be composites, for instance 239 240 /(f)(o)(o)/ 241 242 will result in a composite fixed substring 'foo'. 243 244 For each string some basic information is maintained: 245 246 - offset or min_offset 247 This is the position the string must appear at, or not before. 248 It also implicitly (when combined with minlenp) tells us how many 249 character must match before the string we are searching. 250 Likewise when combined with minlenp and the length of the string 251 tells us how many characters must appear after the string we have 252 found. 253 254 - max_offset 255 Only used for floating strings. This is the rightmost point that 256 the string can appear at. Ifset to I32 max it indicates that the 257 string can occur infinitely far to the right. 258 259 - minlenp 260 A pointer to the minimum length of the pattern that the string 261 was found inside. This is important as in the case of positive 262 lookahead or positive lookbehind we can have multiple patterns 263 involved. Consider 264 265 /(?=FOO).*F/ 266 267 The minimum length of the pattern overall is 3, the minimum length 268 of the lookahead part is 3, but the minimum length of the part that 269 will actually match is 1. So 'FOO's minimum length is 3, but the 270 minimum length for the F is 1. This is important as the minimum length 271 is used to determine offsets in front of and behind the string being 272 looked for. Since strings can be composites this is the length of the 273 pattern at the time it was commited with a scan_commit. Note that 274 the length is calculated by study_chunk, so that the minimum lengths 275 are not known until the full pattern has been compiled, thus the 276 pointer to the value. 277 278 - lookbehind 279 280 In the case of lookbehind the string being searched for can be 281 offset past the start point of the final matching string. 282 If this value was just blithely removed from the min_offset it would 283 invalidate some of the calculations for how many chars must match 284 before or after (as they are derived from min_offset and minlen and 285 the length of the string being searched for). 286 When the final pattern is compiled and the data is moved from the 287 scan_data_t structure into the regexp structure the information 288 about lookbehind is factored in, with the information that would 289 have been lost precalculated in the end_shift field for the 290 associated string. 291 292 The fields pos_min and pos_delta are used to store the minimum offset 293 and the delta to the maximum offset at the current point in the pattern. 294 295 */ 296 297 typedef struct scan_data_t { 298 /*I32 len_min; unused */ 299 /*I32 len_delta; unused */ 300 I32 pos_min; 301 I32 pos_delta; 302 SV *last_found; 303 I32 last_end; /* min value, <0 unless valid. */ 304 I32 last_start_min; 305 I32 last_start_max; 306 SV **longest; /* Either &l_fixed, or &l_float. */ 307 SV *longest_fixed; /* longest fixed string found in pattern */ 308 I32 offset_fixed; /* offset where it starts */ 309 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */ 310 I32 lookbehind_fixed; /* is the position of the string modfied by LB */ 311 SV *longest_float; /* longest floating string found in pattern */ 312 I32 offset_float_min; /* earliest point in string it can appear */ 313 I32 offset_float_max; /* latest point in string it can appear */ 314 I32 *minlen_float; /* pointer to the minlen relevent to the string */ 315 I32 lookbehind_float; /* is the position of the string modified by LB */ 316 I32 flags; 317 I32 whilem_c; 318 I32 *last_closep; 319 struct regnode_charclass_class *start_class; 320 } scan_data_t; 321 322 /* 323 * Forward declarations for pregcomp()'s friends. 324 */ 325 326 static const scan_data_t zero_scan_data = 327 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0}; 328 329 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL) 330 #define SF_BEFORE_SEOL 0x0001 331 #define SF_BEFORE_MEOL 0x0002 332 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL) 333 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL) 334 335 #ifdef NO_UNARY_PLUS 336 # define SF_FIX_SHIFT_EOL (0+2) 337 # define SF_FL_SHIFT_EOL (0+4) 338 #else 339 # define SF_FIX_SHIFT_EOL (+2) 340 # define SF_FL_SHIFT_EOL (+4) 341 #endif 342 343 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL) 344 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL) 345 346 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL) 347 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */ 348 #define SF_IS_INF 0x0040 349 #define SF_HAS_PAR 0x0080 350 #define SF_IN_PAR 0x0100 351 #define SF_HAS_EVAL 0x0200 352 #define SCF_DO_SUBSTR 0x0400 353 #define SCF_DO_STCLASS_AND 0x0800 354 #define SCF_DO_STCLASS_OR 0x1000 355 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR) 356 #define SCF_WHILEM_VISITED_POS 0x2000 357 358 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */ 359 #define SCF_SEEN_ACCEPT 0x8000 360 361 #define UTF (RExC_utf8 != 0) 362 #define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0) 363 #define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0) 364 365 #define OOB_UNICODE 12345678 366 #define OOB_NAMEDCLASS -1 367 368 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv)) 369 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b) 370 371 372 /* length of regex to show in messages that don't mark a position within */ 373 #define RegexLengthToShowInErrorMessages 127 374 375 /* 376 * If MARKER[12] are adjusted, be sure to adjust the constants at the top 377 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in 378 * op/pragma/warn/regcomp. 379 */ 380 #define MARKER1 "<-- HERE" /* marker as it appears in the description */ 381 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */ 382 383 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/" 384 385 /* 386 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given 387 * arg. Show regex, up to a maximum length. If it's too long, chop and add 388 * "...". 389 */ 390 #define _FAIL(code) STMT_START { \ 391 const char *ellipses = ""; \ 392 IV len = RExC_end - RExC_precomp; \ 393 \ 394 if (!SIZE_ONLY) \ 395 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \ 396 if (len > RegexLengthToShowInErrorMessages) { \ 397 /* chop 10 shorter than the max, to ensure meaning of "..." */ \ 398 len = RegexLengthToShowInErrorMessages - 10; \ 399 ellipses = "..."; \ 400 } \ 401 code; \ 402 } STMT_END 403 404 #define FAIL(msg) _FAIL( \ 405 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \ 406 msg, (int)len, RExC_precomp, ellipses)) 407 408 #define FAIL2(msg,arg) _FAIL( \ 409 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \ 410 arg, (int)len, RExC_precomp, ellipses)) 411 412 /* 413 * Simple_vFAIL -- like FAIL, but marks the current location in the scan 414 */ 415 #define Simple_vFAIL(m) STMT_START { \ 416 const IV offset = RExC_parse - RExC_precomp; \ 417 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \ 418 m, (int)offset, RExC_precomp, RExC_precomp + offset); \ 419 } STMT_END 420 421 /* 422 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL() 423 */ 424 #define vFAIL(m) STMT_START { \ 425 if (!SIZE_ONLY) \ 426 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \ 427 Simple_vFAIL(m); \ 428 } STMT_END 429 430 /* 431 * Like Simple_vFAIL(), but accepts two arguments. 432 */ 433 #define Simple_vFAIL2(m,a1) STMT_START { \ 434 const IV offset = RExC_parse - RExC_precomp; \ 435 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \ 436 (int)offset, RExC_precomp, RExC_precomp + offset); \ 437 } STMT_END 438 439 /* 440 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2(). 441 */ 442 #define vFAIL2(m,a1) STMT_START { \ 443 if (!SIZE_ONLY) \ 444 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \ 445 Simple_vFAIL2(m, a1); \ 446 } STMT_END 447 448 449 /* 450 * Like Simple_vFAIL(), but accepts three arguments. 451 */ 452 #define Simple_vFAIL3(m, a1, a2) STMT_START { \ 453 const IV offset = RExC_parse - RExC_precomp; \ 454 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \ 455 (int)offset, RExC_precomp, RExC_precomp + offset); \ 456 } STMT_END 457 458 /* 459 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3(). 460 */ 461 #define vFAIL3(m,a1,a2) STMT_START { \ 462 if (!SIZE_ONLY) \ 463 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \ 464 Simple_vFAIL3(m, a1, a2); \ 465 } STMT_END 466 467 /* 468 * Like Simple_vFAIL(), but accepts four arguments. 469 */ 470 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \ 471 const IV offset = RExC_parse - RExC_precomp; \ 472 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \ 473 (int)offset, RExC_precomp, RExC_precomp + offset); \ 474 } STMT_END 475 476 #define ckWARNreg(loc,m) STMT_START { \ 477 const IV offset = loc - RExC_precomp; \ 478 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ 479 (int)offset, RExC_precomp, RExC_precomp + offset); \ 480 } STMT_END 481 482 #define ckWARNregdep(loc,m) STMT_START { \ 483 const IV offset = loc - RExC_precomp; \ 484 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \ 485 m REPORT_LOCATION, \ 486 (int)offset, RExC_precomp, RExC_precomp + offset); \ 487 } STMT_END 488 489 #define ckWARN2reg(loc, m, a1) STMT_START { \ 490 const IV offset = loc - RExC_precomp; \ 491 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ 492 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \ 493 } STMT_END 494 495 #define vWARN3(loc, m, a1, a2) STMT_START { \ 496 const IV offset = loc - RExC_precomp; \ 497 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ 498 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \ 499 } STMT_END 500 501 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \ 502 const IV offset = loc - RExC_precomp; \ 503 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ 504 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \ 505 } STMT_END 506 507 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \ 508 const IV offset = loc - RExC_precomp; \ 509 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ 510 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \ 511 } STMT_END 512 513 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \ 514 const IV offset = loc - RExC_precomp; \ 515 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ 516 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \ 517 } STMT_END 518 519 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \ 520 const IV offset = loc - RExC_precomp; \ 521 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ 522 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \ 523 } STMT_END 524 525 526 /* Allow for side effects in s */ 527 #define REGC(c,s) STMT_START { \ 528 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \ 529 } STMT_END 530 531 /* Macros for recording node offsets. 20001227 mjd@plover.com 532 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in 533 * element 2*n-1 of the array. Element #2n holds the byte length node #n. 534 * Element 0 holds the number n. 535 * Position is 1 indexed. 536 */ 537 #ifndef RE_TRACK_PATTERN_OFFSETS 538 #define Set_Node_Offset_To_R(node,byte) 539 #define Set_Node_Offset(node,byte) 540 #define Set_Cur_Node_Offset 541 #define Set_Node_Length_To_R(node,len) 542 #define Set_Node_Length(node,len) 543 #define Set_Node_Cur_Length(node) 544 #define Node_Offset(n) 545 #define Node_Length(n) 546 #define Set_Node_Offset_Length(node,offset,len) 547 #define ProgLen(ri) ri->u.proglen 548 #define SetProgLen(ri,x) ri->u.proglen = x 549 #else 550 #define ProgLen(ri) ri->u.offsets[0] 551 #define SetProgLen(ri,x) ri->u.offsets[0] = x 552 #define Set_Node_Offset_To_R(node,byte) STMT_START { \ 553 if (! SIZE_ONLY) { \ 554 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \ 555 __LINE__, (int)(node), (int)(byte))); \ 556 if((node) < 0) { \ 557 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \ 558 } else { \ 559 RExC_offsets[2*(node)-1] = (byte); \ 560 } \ 561 } \ 562 } STMT_END 563 564 #define Set_Node_Offset(node,byte) \ 565 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start) 566 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse) 567 568 #define Set_Node_Length_To_R(node,len) STMT_START { \ 569 if (! SIZE_ONLY) { \ 570 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \ 571 __LINE__, (int)(node), (int)(len))); \ 572 if((node) < 0) { \ 573 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \ 574 } else { \ 575 RExC_offsets[2*(node)] = (len); \ 576 } \ 577 } \ 578 } STMT_END 579 580 #define Set_Node_Length(node,len) \ 581 Set_Node_Length_To_R((node)-RExC_emit_start, len) 582 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len) 583 #define Set_Node_Cur_Length(node) \ 584 Set_Node_Length(node, RExC_parse - parse_start) 585 586 /* Get offsets and lengths */ 587 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1]) 588 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)]) 589 590 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \ 591 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \ 592 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \ 593 } STMT_END 594 #endif 595 596 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS 597 #define EXPERIMENTAL_INPLACESCAN 598 #endif /*RE_TRACK_PATTERN_OFFSETS*/ 599 600 #define DEBUG_STUDYDATA(str,data,depth) \ 601 DEBUG_OPTIMISE_MORE_r(if(data){ \ 602 PerlIO_printf(Perl_debug_log, \ 603 "%*s" str "Pos:%"IVdf"/%"IVdf \ 604 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \ 605 (int)(depth)*2, "", \ 606 (IV)((data)->pos_min), \ 607 (IV)((data)->pos_delta), \ 608 (UV)((data)->flags), \ 609 (IV)((data)->whilem_c), \ 610 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \ 611 is_inf ? "INF " : "" \ 612 ); \ 613 if ((data)->last_found) \ 614 PerlIO_printf(Perl_debug_log, \ 615 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \ 616 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \ 617 SvPVX_const((data)->last_found), \ 618 (IV)((data)->last_end), \ 619 (IV)((data)->last_start_min), \ 620 (IV)((data)->last_start_max), \ 621 ((data)->longest && \ 622 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \ 623 SvPVX_const((data)->longest_fixed), \ 624 (IV)((data)->offset_fixed), \ 625 ((data)->longest && \ 626 (data)->longest==&((data)->longest_float)) ? "*" : "", \ 627 SvPVX_const((data)->longest_float), \ 628 (IV)((data)->offset_float_min), \ 629 (IV)((data)->offset_float_max) \ 630 ); \ 631 PerlIO_printf(Perl_debug_log,"\n"); \ 632 }); 633 634 static void clear_re(pTHX_ void *r); 635 636 /* Mark that we cannot extend a found fixed substring at this point. 637 Update the longest found anchored substring and the longest found 638 floating substrings if needed. */ 639 640 STATIC void 641 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf) 642 { 643 const STRLEN l = CHR_SVLEN(data->last_found); 644 const STRLEN old_l = CHR_SVLEN(*data->longest); 645 GET_RE_DEBUG_FLAGS_DECL; 646 647 PERL_ARGS_ASSERT_SCAN_COMMIT; 648 649 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) { 650 SvSetMagicSV(*data->longest, data->last_found); 651 if (*data->longest == data->longest_fixed) { 652 data->offset_fixed = l ? data->last_start_min : data->pos_min; 653 if (data->flags & SF_BEFORE_EOL) 654 data->flags 655 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL); 656 else 657 data->flags &= ~SF_FIX_BEFORE_EOL; 658 data->minlen_fixed=minlenp; 659 data->lookbehind_fixed=0; 660 } 661 else { /* *data->longest == data->longest_float */ 662 data->offset_float_min = l ? data->last_start_min : data->pos_min; 663 data->offset_float_max = (l 664 ? data->last_start_max 665 : data->pos_min + data->pos_delta); 666 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX) 667 data->offset_float_max = I32_MAX; 668 if (data->flags & SF_BEFORE_EOL) 669 data->flags 670 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL); 671 else 672 data->flags &= ~SF_FL_BEFORE_EOL; 673 data->minlen_float=minlenp; 674 data->lookbehind_float=0; 675 } 676 } 677 SvCUR_set(data->last_found, 0); 678 { 679 SV * const sv = data->last_found; 680 if (SvUTF8(sv) && SvMAGICAL(sv)) { 681 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8); 682 if (mg) 683 mg->mg_len = 0; 684 } 685 } 686 data->last_end = -1; 687 data->flags &= ~SF_BEFORE_EOL; 688 DEBUG_STUDYDATA("commit: ",data,0); 689 } 690 691 /* Can match anything (initialization) */ 692 STATIC void 693 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) 694 { 695 PERL_ARGS_ASSERT_CL_ANYTHING; 696 697 ANYOF_CLASS_ZERO(cl); 698 ANYOF_BITMAP_SETALL(cl); 699 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL; 700 if (LOC) 701 cl->flags |= ANYOF_LOCALE; 702 } 703 704 /* Can match anything (initialization) */ 705 STATIC int 706 S_cl_is_anything(const struct regnode_charclass_class *cl) 707 { 708 int value; 709 710 PERL_ARGS_ASSERT_CL_IS_ANYTHING; 711 712 for (value = 0; value <= ANYOF_MAX; value += 2) 713 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1)) 714 return 1; 715 if (!(cl->flags & ANYOF_UNICODE_ALL)) 716 return 0; 717 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl)) 718 return 0; 719 return 1; 720 } 721 722 /* Can match anything (initialization) */ 723 STATIC void 724 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) 725 { 726 PERL_ARGS_ASSERT_CL_INIT; 727 728 Zero(cl, 1, struct regnode_charclass_class); 729 cl->type = ANYOF; 730 cl_anything(pRExC_state, cl); 731 } 732 733 STATIC void 734 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) 735 { 736 PERL_ARGS_ASSERT_CL_INIT_ZERO; 737 738 Zero(cl, 1, struct regnode_charclass_class); 739 cl->type = ANYOF; 740 cl_anything(pRExC_state, cl); 741 if (LOC) 742 cl->flags |= ANYOF_LOCALE; 743 } 744 745 /* 'And' a given class with another one. Can create false positives */ 746 /* We assume that cl is not inverted */ 747 STATIC void 748 S_cl_and(struct regnode_charclass_class *cl, 749 const struct regnode_charclass_class *and_with) 750 { 751 PERL_ARGS_ASSERT_CL_AND; 752 753 assert(and_with->type == ANYOF); 754 if (!(and_with->flags & ANYOF_CLASS) 755 && !(cl->flags & ANYOF_CLASS) 756 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) 757 && !(and_with->flags & ANYOF_FOLD) 758 && !(cl->flags & ANYOF_FOLD)) { 759 int i; 760 761 if (and_with->flags & ANYOF_INVERT) 762 for (i = 0; i < ANYOF_BITMAP_SIZE; i++) 763 cl->bitmap[i] &= ~and_with->bitmap[i]; 764 else 765 for (i = 0; i < ANYOF_BITMAP_SIZE; i++) 766 cl->bitmap[i] &= and_with->bitmap[i]; 767 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */ 768 if (!(and_with->flags & ANYOF_EOS)) 769 cl->flags &= ~ANYOF_EOS; 770 771 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE && 772 !(and_with->flags & ANYOF_INVERT)) { 773 cl->flags &= ~ANYOF_UNICODE_ALL; 774 cl->flags |= ANYOF_UNICODE; 775 ARG_SET(cl, ARG(and_with)); 776 } 777 if (!(and_with->flags & ANYOF_UNICODE_ALL) && 778 !(and_with->flags & ANYOF_INVERT)) 779 cl->flags &= ~ANYOF_UNICODE_ALL; 780 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) && 781 !(and_with->flags & ANYOF_INVERT)) 782 cl->flags &= ~ANYOF_UNICODE; 783 } 784 785 /* 'OR' a given class with another one. Can create false positives */ 786 /* We assume that cl is not inverted */ 787 STATIC void 788 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with) 789 { 790 PERL_ARGS_ASSERT_CL_OR; 791 792 if (or_with->flags & ANYOF_INVERT) { 793 /* We do not use 794 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2)) 795 * <= (B1 | !B2) | (CL1 | !CL2) 796 * which is wasteful if CL2 is small, but we ignore CL2: 797 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1 798 * XXXX Can we handle case-fold? Unclear: 799 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) = 800 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i')) 801 */ 802 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) 803 && !(or_with->flags & ANYOF_FOLD) 804 && !(cl->flags & ANYOF_FOLD) ) { 805 int i; 806 807 for (i = 0; i < ANYOF_BITMAP_SIZE; i++) 808 cl->bitmap[i] |= ~or_with->bitmap[i]; 809 } /* XXXX: logic is complicated otherwise */ 810 else { 811 cl_anything(pRExC_state, cl); 812 } 813 } else { 814 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */ 815 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) 816 && (!(or_with->flags & ANYOF_FOLD) 817 || (cl->flags & ANYOF_FOLD)) ) { 818 int i; 819 820 /* OR char bitmap and class bitmap separately */ 821 for (i = 0; i < ANYOF_BITMAP_SIZE; i++) 822 cl->bitmap[i] |= or_with->bitmap[i]; 823 if (or_with->flags & ANYOF_CLASS) { 824 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++) 825 cl->classflags[i] |= or_with->classflags[i]; 826 cl->flags |= ANYOF_CLASS; 827 } 828 } 829 else { /* XXXX: logic is complicated, leave it along for a moment. */ 830 cl_anything(pRExC_state, cl); 831 } 832 } 833 if (or_with->flags & ANYOF_EOS) 834 cl->flags |= ANYOF_EOS; 835 836 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE && 837 ARG(cl) != ARG(or_with)) { 838 cl->flags |= ANYOF_UNICODE_ALL; 839 cl->flags &= ~ANYOF_UNICODE; 840 } 841 if (or_with->flags & ANYOF_UNICODE_ALL) { 842 cl->flags |= ANYOF_UNICODE_ALL; 843 cl->flags &= ~ANYOF_UNICODE; 844 } 845 } 846 847 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ] 848 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid ) 849 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate ) 850 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 ) 851 852 853 #ifdef DEBUGGING 854 /* 855 dump_trie(trie,widecharmap,revcharmap) 856 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc) 857 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc) 858 859 These routines dump out a trie in a somewhat readable format. 860 The _interim_ variants are used for debugging the interim 861 tables that are used to generate the final compressed 862 representation which is what dump_trie expects. 863 864 Part of the reason for their existance is to provide a form 865 of documentation as to how the different representations function. 866 867 */ 868 869 /* 870 Dumps the final compressed table form of the trie to Perl_debug_log. 871 Used for debugging make_trie(). 872 */ 873 874 STATIC void 875 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, 876 AV *revcharmap, U32 depth) 877 { 878 U32 state; 879 SV *sv=sv_newmortal(); 880 int colwidth= widecharmap ? 6 : 4; 881 GET_RE_DEBUG_FLAGS_DECL; 882 883 PERL_ARGS_ASSERT_DUMP_TRIE; 884 885 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ", 886 (int)depth * 2 + 2,"", 887 "Match","Base","Ofs" ); 888 889 for( state = 0 ; state < trie->uniquecharcount ; state++ ) { 890 SV ** const tmp = av_fetch( revcharmap, state, 0); 891 if ( tmp ) { 892 PerlIO_printf( Perl_debug_log, "%*s", 893 colwidth, 894 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, 895 PL_colors[0], PL_colors[1], 896 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | 897 PERL_PV_ESCAPE_FIRSTCHAR 898 ) 899 ); 900 } 901 } 902 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------", 903 (int)depth * 2 + 2,""); 904 905 for( state = 0 ; state < trie->uniquecharcount ; state++ ) 906 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------"); 907 PerlIO_printf( Perl_debug_log, "\n"); 908 909 for( state = 1 ; state < trie->statecount ; state++ ) { 910 const U32 base = trie->states[ state ].trans.base; 911 912 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state); 913 914 if ( trie->states[ state ].wordnum ) { 915 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum ); 916 } else { 917 PerlIO_printf( Perl_debug_log, "%6s", "" ); 918 } 919 920 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base ); 921 922 if ( base ) { 923 U32 ofs = 0; 924 925 while( ( base + ofs < trie->uniquecharcount ) || 926 ( base + ofs - trie->uniquecharcount < trie->lasttrans 927 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state)) 928 ofs++; 929 930 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs); 931 932 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) { 933 if ( ( base + ofs >= trie->uniquecharcount ) && 934 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) && 935 trie->trans[ base + ofs - trie->uniquecharcount ].check == state ) 936 { 937 PerlIO_printf( Perl_debug_log, "%*"UVXf, 938 colwidth, 939 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next ); 940 } else { 941 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." ); 942 } 943 } 944 945 PerlIO_printf( Perl_debug_log, "]"); 946 947 } 948 PerlIO_printf( Perl_debug_log, "\n" ); 949 } 950 } 951 /* 952 Dumps a fully constructed but uncompressed trie in list form. 953 List tries normally only are used for construction when the number of 954 possible chars (trie->uniquecharcount) is very high. 955 Used for debugging make_trie(). 956 */ 957 STATIC void 958 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, 959 HV *widecharmap, AV *revcharmap, U32 next_alloc, 960 U32 depth) 961 { 962 U32 state; 963 SV *sv=sv_newmortal(); 964 int colwidth= widecharmap ? 6 : 4; 965 GET_RE_DEBUG_FLAGS_DECL; 966 967 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST; 968 969 /* print out the table precompression. */ 970 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s", 971 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"", 972 "------:-----+-----------------\n" ); 973 974 for( state=1 ; state < next_alloc ; state ++ ) { 975 U16 charid; 976 977 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :", 978 (int)depth * 2 + 2,"", (UV)state ); 979 if ( ! trie->states[ state ].wordnum ) { 980 PerlIO_printf( Perl_debug_log, "%5s| ",""); 981 } else { 982 PerlIO_printf( Perl_debug_log, "W%4x| ", 983 trie->states[ state ].wordnum 984 ); 985 } 986 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) { 987 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0); 988 if ( tmp ) { 989 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ", 990 colwidth, 991 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, 992 PL_colors[0], PL_colors[1], 993 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | 994 PERL_PV_ESCAPE_FIRSTCHAR 995 ) , 996 TRIE_LIST_ITEM(state,charid).forid, 997 (UV)TRIE_LIST_ITEM(state,charid).newstate 998 ); 999 if (!(charid % 10)) 1000 PerlIO_printf(Perl_debug_log, "\n%*s| ", 1001 (int)((depth * 2) + 14), ""); 1002 } 1003 } 1004 PerlIO_printf( Perl_debug_log, "\n"); 1005 } 1006 } 1007 1008 /* 1009 Dumps a fully constructed but uncompressed trie in table form. 1010 This is the normal DFA style state transition table, with a few 1011 twists to facilitate compression later. 1012 Used for debugging make_trie(). 1013 */ 1014 STATIC void 1015 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, 1016 HV *widecharmap, AV *revcharmap, U32 next_alloc, 1017 U32 depth) 1018 { 1019 U32 state; 1020 U16 charid; 1021 SV *sv=sv_newmortal(); 1022 int colwidth= widecharmap ? 6 : 4; 1023 GET_RE_DEBUG_FLAGS_DECL; 1024 1025 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE; 1026 1027 /* 1028 print out the table precompression so that we can do a visual check 1029 that they are identical. 1030 */ 1031 1032 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" ); 1033 1034 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) { 1035 SV ** const tmp = av_fetch( revcharmap, charid, 0); 1036 if ( tmp ) { 1037 PerlIO_printf( Perl_debug_log, "%*s", 1038 colwidth, 1039 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, 1040 PL_colors[0], PL_colors[1], 1041 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | 1042 PERL_PV_ESCAPE_FIRSTCHAR 1043 ) 1044 ); 1045 } 1046 } 1047 1048 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" ); 1049 1050 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) { 1051 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------"); 1052 } 1053 1054 PerlIO_printf( Perl_debug_log, "\n" ); 1055 1056 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) { 1057 1058 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ", 1059 (int)depth * 2 + 2,"", 1060 (UV)TRIE_NODENUM( state ) ); 1061 1062 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) { 1063 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ); 1064 if (v) 1065 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v ); 1066 else 1067 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." ); 1068 } 1069 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) { 1070 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check ); 1071 } else { 1072 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check, 1073 trie->states[ TRIE_NODENUM( state ) ].wordnum ); 1074 } 1075 } 1076 } 1077 1078 #endif 1079 1080 /* make_trie(startbranch,first,last,tail,word_count,flags,depth) 1081 startbranch: the first branch in the whole branch sequence 1082 first : start branch of sequence of branch-exact nodes. 1083 May be the same as startbranch 1084 last : Thing following the last branch. 1085 May be the same as tail. 1086 tail : item following the branch sequence 1087 count : words in the sequence 1088 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/ 1089 depth : indent depth 1090 1091 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node. 1092 1093 A trie is an N'ary tree where the branches are determined by digital 1094 decomposition of the key. IE, at the root node you look up the 1st character and 1095 follow that branch repeat until you find the end of the branches. Nodes can be 1096 marked as "accepting" meaning they represent a complete word. Eg: 1097 1098 /he|she|his|hers/ 1099 1100 would convert into the following structure. Numbers represent states, letters 1101 following numbers represent valid transitions on the letter from that state, if 1102 the number is in square brackets it represents an accepting state, otherwise it 1103 will be in parenthesis. 1104 1105 +-h->+-e->[3]-+-r->(8)-+-s->[9] 1106 | | 1107 | (2) 1108 | | 1109 (1) +-i->(6)-+-s->[7] 1110 | 1111 +-s->(3)-+-h->(4)-+-e->[5] 1112 1113 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers) 1114 1115 This shows that when matching against the string 'hers' we will begin at state 1 1116 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting, 1117 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which 1118 is also accepting. Thus we know that we can match both 'he' and 'hers' with a 1119 single traverse. We store a mapping from accepting to state to which word was 1120 matched, and then when we have multiple possibilities we try to complete the 1121 rest of the regex in the order in which they occured in the alternation. 1122 1123 The only prior NFA like behaviour that would be changed by the TRIE support is 1124 the silent ignoring of duplicate alternations which are of the form: 1125 1126 / (DUPE|DUPE) X? (?{ ... }) Y /x 1127 1128 Thus EVAL blocks follwing a trie may be called a different number of times with 1129 and without the optimisation. With the optimisations dupes will be silently 1130 ignored. This inconsistant behaviour of EVAL type nodes is well established as 1131 the following demonstrates: 1132 1133 'words'=~/(word|word|word)(?{ print $1 })[xyz]/ 1134 1135 which prints out 'word' three times, but 1136 1137 'words'=~/(word|word|word)(?{ print $1 })S/ 1138 1139 which doesnt print it out at all. This is due to other optimisations kicking in. 1140 1141 Example of what happens on a structural level: 1142 1143 The regexp /(ac|ad|ab)+/ will produce the folowing debug output: 1144 1145 1: CURLYM[1] {1,32767}(18) 1146 5: BRANCH(8) 1147 6: EXACT <ac>(16) 1148 8: BRANCH(11) 1149 9: EXACT <ad>(16) 1150 11: BRANCH(14) 1151 12: EXACT <ab>(16) 1152 16: SUCCEED(0) 1153 17: NOTHING(18) 1154 18: END(0) 1155 1156 This would be optimizable with startbranch=5, first=5, last=16, tail=16 1157 and should turn into: 1158 1159 1: CURLYM[1] {1,32767}(18) 1160 5: TRIE(16) 1161 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1] 1162 <ac> 1163 <ad> 1164 <ab> 1165 16: SUCCEED(0) 1166 17: NOTHING(18) 1167 18: END(0) 1168 1169 Cases where tail != last would be like /(?foo|bar)baz/: 1170 1171 1: BRANCH(4) 1172 2: EXACT <foo>(8) 1173 4: BRANCH(7) 1174 5: EXACT <bar>(8) 1175 7: TAIL(8) 1176 8: EXACT <baz>(10) 1177 10: END(0) 1178 1179 which would be optimizable with startbranch=1, first=1, last=7, tail=8 1180 and would end up looking like: 1181 1182 1: TRIE(8) 1183 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1] 1184 <foo> 1185 <bar> 1186 7: TAIL(8) 1187 8: EXACT <baz>(10) 1188 10: END(0) 1189 1190 d = uvuni_to_utf8_flags(d, uv, 0); 1191 1192 is the recommended Unicode-aware way of saying 1193 1194 *(d++) = uv; 1195 */ 1196 1197 #define TRIE_STORE_REVCHAR \ 1198 STMT_START { \ 1199 if (UTF) { \ 1200 SV *zlopp = newSV(2); \ 1201 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \ 1202 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \ 1203 SvCUR_set(zlopp, kapow - flrbbbbb); \ 1204 SvPOK_on(zlopp); \ 1205 SvUTF8_on(zlopp); \ 1206 av_push(revcharmap, zlopp); \ 1207 } else { \ 1208 char ooooff = (char)uvc; \ 1209 av_push(revcharmap, newSVpvn(&ooooff, 1)); \ 1210 } \ 1211 } STMT_END 1212 1213 #define TRIE_READ_CHAR STMT_START { \ 1214 wordlen++; \ 1215 if ( UTF ) { \ 1216 if ( folder ) { \ 1217 if ( foldlen > 0 ) { \ 1218 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \ 1219 foldlen -= len; \ 1220 scan += len; \ 1221 len = 0; \ 1222 } else { \ 1223 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\ 1224 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \ 1225 foldlen -= UNISKIP( uvc ); \ 1226 scan = foldbuf + UNISKIP( uvc ); \ 1227 } \ 1228 } else { \ 1229 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\ 1230 } \ 1231 } else { \ 1232 uvc = (U32)*uc; \ 1233 len = 1; \ 1234 } \ 1235 } STMT_END 1236 1237 1238 1239 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \ 1240 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \ 1241 U32 ging = TRIE_LIST_LEN( state ) *= 2; \ 1242 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \ 1243 } \ 1244 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \ 1245 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \ 1246 TRIE_LIST_CUR( state )++; \ 1247 } STMT_END 1248 1249 #define TRIE_LIST_NEW(state) STMT_START { \ 1250 Newxz( trie->states[ state ].trans.list, \ 1251 4, reg_trie_trans_le ); \ 1252 TRIE_LIST_CUR( state ) = 1; \ 1253 TRIE_LIST_LEN( state ) = 4; \ 1254 } STMT_END 1255 1256 #define TRIE_HANDLE_WORD(state) STMT_START { \ 1257 U16 dupe= trie->states[ state ].wordnum; \ 1258 regnode * const noper_next = regnext( noper ); \ 1259 \ 1260 if (trie->wordlen) \ 1261 trie->wordlen[ curword ] = wordlen; \ 1262 DEBUG_r({ \ 1263 /* store the word for dumping */ \ 1264 SV* tmp; \ 1265 if (OP(noper) != NOTHING) \ 1266 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \ 1267 else \ 1268 tmp = newSVpvn_utf8( "", 0, UTF ); \ 1269 av_push( trie_words, tmp ); \ 1270 }); \ 1271 \ 1272 curword++; \ 1273 \ 1274 if ( noper_next < tail ) { \ 1275 if (!trie->jump) \ 1276 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \ 1277 trie->jump[curword] = (U16)(noper_next - convert); \ 1278 if (!jumper) \ 1279 jumper = noper_next; \ 1280 if (!nextbranch) \ 1281 nextbranch= regnext(cur); \ 1282 } \ 1283 \ 1284 if ( dupe ) { \ 1285 /* So it's a dupe. This means we need to maintain a */\ 1286 /* linked-list from the first to the next. */\ 1287 /* we only allocate the nextword buffer when there */\ 1288 /* a dupe, so first time we have to do the allocation */\ 1289 if (!trie->nextword) \ 1290 trie->nextword = (U16 *) \ 1291 PerlMemShared_calloc( word_count + 1, sizeof(U16)); \ 1292 while ( trie->nextword[dupe] ) \ 1293 dupe= trie->nextword[dupe]; \ 1294 trie->nextword[dupe]= curword; \ 1295 } else { \ 1296 /* we haven't inserted this word yet. */ \ 1297 trie->states[ state ].wordnum = curword; \ 1298 } \ 1299 } STMT_END 1300 1301 1302 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \ 1303 ( ( base + charid >= ucharcount \ 1304 && base + charid < ubound \ 1305 && state == trie->trans[ base - ucharcount + charid ].check \ 1306 && trie->trans[ base - ucharcount + charid ].next ) \ 1307 ? trie->trans[ base - ucharcount + charid ].next \ 1308 : ( state==1 ? special : 0 ) \ 1309 ) 1310 1311 #define MADE_TRIE 1 1312 #define MADE_JUMP_TRIE 2 1313 #define MADE_EXACT_TRIE 4 1314 1315 STATIC I32 1316 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth) 1317 { 1318 dVAR; 1319 /* first pass, loop through and scan words */ 1320 reg_trie_data *trie; 1321 HV *widecharmap = NULL; 1322 AV *revcharmap = newAV(); 1323 regnode *cur; 1324 const U32 uniflags = UTF8_ALLOW_DEFAULT; 1325 STRLEN len = 0; 1326 UV uvc = 0; 1327 U16 curword = 0; 1328 U32 next_alloc = 0; 1329 regnode *jumper = NULL; 1330 regnode *nextbranch = NULL; 1331 regnode *convert = NULL; 1332 /* we just use folder as a flag in utf8 */ 1333 const U8 * const folder = ( flags == EXACTF 1334 ? PL_fold 1335 : ( flags == EXACTFL 1336 ? PL_fold_locale 1337 : NULL 1338 ) 1339 ); 1340 1341 #ifdef DEBUGGING 1342 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" ); 1343 AV *trie_words = NULL; 1344 /* along with revcharmap, this only used during construction but both are 1345 * useful during debugging so we store them in the struct when debugging. 1346 */ 1347 #else 1348 const U32 data_slot = add_data( pRExC_state, 2, "tu" ); 1349 STRLEN trie_charcount=0; 1350 #endif 1351 SV *re_trie_maxbuff; 1352 GET_RE_DEBUG_FLAGS_DECL; 1353 1354 PERL_ARGS_ASSERT_MAKE_TRIE; 1355 #ifndef DEBUGGING 1356 PERL_UNUSED_ARG(depth); 1357 #endif 1358 1359 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) ); 1360 trie->refcount = 1; 1361 trie->startstate = 1; 1362 trie->wordcount = word_count; 1363 RExC_rxi->data->data[ data_slot ] = (void*)trie; 1364 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) ); 1365 if (!(UTF && folder)) 1366 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 ); 1367 DEBUG_r({ 1368 trie_words = newAV(); 1369 }); 1370 1371 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1); 1372 if (!SvIOK(re_trie_maxbuff)) { 1373 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT); 1374 } 1375 DEBUG_OPTIMISE_r({ 1376 PerlIO_printf( Perl_debug_log, 1377 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n", 1378 (int)depth * 2 + 2, "", 1379 REG_NODE_NUM(startbranch),REG_NODE_NUM(first), 1380 REG_NODE_NUM(last), REG_NODE_NUM(tail), 1381 (int)depth); 1382 }); 1383 1384 /* Find the node we are going to overwrite */ 1385 if ( first == startbranch && OP( last ) != BRANCH ) { 1386 /* whole branch chain */ 1387 convert = first; 1388 } else { 1389 /* branch sub-chain */ 1390 convert = NEXTOPER( first ); 1391 } 1392 1393 /* -- First loop and Setup -- 1394 1395 We first traverse the branches and scan each word to determine if it 1396 contains widechars, and how many unique chars there are, this is 1397 important as we have to build a table with at least as many columns as we 1398 have unique chars. 1399 1400 We use an array of integers to represent the character codes 0..255 1401 (trie->charmap) and we use a an HV* to store Unicode characters. We use the 1402 native representation of the character value as the key and IV's for the 1403 coded index. 1404 1405 *TODO* If we keep track of how many times each character is used we can 1406 remap the columns so that the table compression later on is more 1407 efficient in terms of memory by ensuring most common value is in the 1408 middle and the least common are on the outside. IMO this would be better 1409 than a most to least common mapping as theres a decent chance the most 1410 common letter will share a node with the least common, meaning the node 1411 will not be compressable. With a middle is most common approach the worst 1412 case is when we have the least common nodes twice. 1413 1414 */ 1415 1416 for ( cur = first ; cur < last ; cur = regnext( cur ) ) { 1417 regnode * const noper = NEXTOPER( cur ); 1418 const U8 *uc = (U8*)STRING( noper ); 1419 const U8 * const e = uc + STR_LEN( noper ); 1420 STRLEN foldlen = 0; 1421 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; 1422 const U8 *scan = (U8*)NULL; 1423 U32 wordlen = 0; /* required init */ 1424 STRLEN chars = 0; 1425 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/ 1426 1427 if (OP(noper) == NOTHING) { 1428 trie->minlen= 0; 1429 continue; 1430 } 1431 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */ 1432 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte 1433 regardless of encoding */ 1434 1435 for ( ; uc < e ; uc += len ) { 1436 TRIE_CHARCOUNT(trie)++; 1437 TRIE_READ_CHAR; 1438 chars++; 1439 if ( uvc < 256 ) { 1440 if ( !trie->charmap[ uvc ] ) { 1441 trie->charmap[ uvc ]=( ++trie->uniquecharcount ); 1442 if ( folder ) 1443 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ]; 1444 TRIE_STORE_REVCHAR; 1445 } 1446 if ( set_bit ) { 1447 /* store the codepoint in the bitmap, and if its ascii 1448 also store its folded equivelent. */ 1449 TRIE_BITMAP_SET(trie,uvc); 1450 1451 /* store the folded codepoint */ 1452 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]); 1453 1454 if ( !UTF ) { 1455 /* store first byte of utf8 representation of 1456 codepoints in the 127 < uvc < 256 range */ 1457 if (127 < uvc && uvc < 192) { 1458 TRIE_BITMAP_SET(trie,194); 1459 } else if (191 < uvc ) { 1460 TRIE_BITMAP_SET(trie,195); 1461 /* && uvc < 256 -- we know uvc is < 256 already */ 1462 } 1463 } 1464 set_bit = 0; /* We've done our bit :-) */ 1465 } 1466 } else { 1467 SV** svpp; 1468 if ( !widecharmap ) 1469 widecharmap = newHV(); 1470 1471 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 ); 1472 1473 if ( !svpp ) 1474 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc ); 1475 1476 if ( !SvTRUE( *svpp ) ) { 1477 sv_setiv( *svpp, ++trie->uniquecharcount ); 1478 TRIE_STORE_REVCHAR; 1479 } 1480 } 1481 } 1482 if( cur == first ) { 1483 trie->minlen=chars; 1484 trie->maxlen=chars; 1485 } else if (chars < trie->minlen) { 1486 trie->minlen=chars; 1487 } else if (chars > trie->maxlen) { 1488 trie->maxlen=chars; 1489 } 1490 1491 } /* end first pass */ 1492 DEBUG_TRIE_COMPILE_r( 1493 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n", 1494 (int)depth * 2 + 2,"", 1495 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count, 1496 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount, 1497 (int)trie->minlen, (int)trie->maxlen ) 1498 ); 1499 trie->wordlen = (U32 *) PerlMemShared_calloc( word_count, sizeof(U32) ); 1500 1501 /* 1502 We now know what we are dealing with in terms of unique chars and 1503 string sizes so we can calculate how much memory a naive 1504 representation using a flat table will take. If it's over a reasonable 1505 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory 1506 conservative but potentially much slower representation using an array 1507 of lists. 1508 1509 At the end we convert both representations into the same compressed 1510 form that will be used in regexec.c for matching with. The latter 1511 is a form that cannot be used to construct with but has memory 1512 properties similar to the list form and access properties similar 1513 to the table form making it both suitable for fast searches and 1514 small enough that its feasable to store for the duration of a program. 1515 1516 See the comment in the code where the compressed table is produced 1517 inplace from the flat tabe representation for an explanation of how 1518 the compression works. 1519 1520 */ 1521 1522 1523 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) { 1524 /* 1525 Second Pass -- Array Of Lists Representation 1526 1527 Each state will be represented by a list of charid:state records 1528 (reg_trie_trans_le) the first such element holds the CUR and LEN 1529 points of the allocated array. (See defines above). 1530 1531 We build the initial structure using the lists, and then convert 1532 it into the compressed table form which allows faster lookups 1533 (but cant be modified once converted). 1534 */ 1535 1536 STRLEN transcount = 1; 1537 1538 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log, 1539 "%*sCompiling trie using list compiler\n", 1540 (int)depth * 2 + 2, "")); 1541 1542 trie->states = (reg_trie_state *) 1543 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2, 1544 sizeof(reg_trie_state) ); 1545 TRIE_LIST_NEW(1); 1546 next_alloc = 2; 1547 1548 for ( cur = first ; cur < last ; cur = regnext( cur ) ) { 1549 1550 regnode * const noper = NEXTOPER( cur ); 1551 U8 *uc = (U8*)STRING( noper ); 1552 const U8 * const e = uc + STR_LEN( noper ); 1553 U32 state = 1; /* required init */ 1554 U16 charid = 0; /* sanity init */ 1555 U8 *scan = (U8*)NULL; /* sanity init */ 1556 STRLEN foldlen = 0; /* required init */ 1557 U32 wordlen = 0; /* required init */ 1558 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; 1559 1560 if (OP(noper) != NOTHING) { 1561 for ( ; uc < e ; uc += len ) { 1562 1563 TRIE_READ_CHAR; 1564 1565 if ( uvc < 256 ) { 1566 charid = trie->charmap[ uvc ]; 1567 } else { 1568 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0); 1569 if ( !svpp ) { 1570 charid = 0; 1571 } else { 1572 charid=(U16)SvIV( *svpp ); 1573 } 1574 } 1575 /* charid is now 0 if we dont know the char read, or nonzero if we do */ 1576 if ( charid ) { 1577 1578 U16 check; 1579 U32 newstate = 0; 1580 1581 charid--; 1582 if ( !trie->states[ state ].trans.list ) { 1583 TRIE_LIST_NEW( state ); 1584 } 1585 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) { 1586 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) { 1587 newstate = TRIE_LIST_ITEM( state, check ).newstate; 1588 break; 1589 } 1590 } 1591 if ( ! newstate ) { 1592 newstate = next_alloc++; 1593 TRIE_LIST_PUSH( state, charid, newstate ); 1594 transcount++; 1595 } 1596 state = newstate; 1597 } else { 1598 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc ); 1599 } 1600 } 1601 } 1602 TRIE_HANDLE_WORD(state); 1603 1604 } /* end second pass */ 1605 1606 /* next alloc is the NEXT state to be allocated */ 1607 trie->statecount = next_alloc; 1608 trie->states = (reg_trie_state *) 1609 PerlMemShared_realloc( trie->states, 1610 next_alloc 1611 * sizeof(reg_trie_state) ); 1612 1613 /* and now dump it out before we compress it */ 1614 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap, 1615 revcharmap, next_alloc, 1616 depth+1) 1617 ); 1618 1619 trie->trans = (reg_trie_trans *) 1620 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) ); 1621 { 1622 U32 state; 1623 U32 tp = 0; 1624 U32 zp = 0; 1625 1626 1627 for( state=1 ; state < next_alloc ; state ++ ) { 1628 U32 base=0; 1629 1630 /* 1631 DEBUG_TRIE_COMPILE_MORE_r( 1632 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp) 1633 ); 1634 */ 1635 1636 if (trie->states[state].trans.list) { 1637 U16 minid=TRIE_LIST_ITEM( state, 1).forid; 1638 U16 maxid=minid; 1639 U16 idx; 1640 1641 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) { 1642 const U16 forid = TRIE_LIST_ITEM( state, idx).forid; 1643 if ( forid < minid ) { 1644 minid=forid; 1645 } else if ( forid > maxid ) { 1646 maxid=forid; 1647 } 1648 } 1649 if ( transcount < tp + maxid - minid + 1) { 1650 transcount *= 2; 1651 trie->trans = (reg_trie_trans *) 1652 PerlMemShared_realloc( trie->trans, 1653 transcount 1654 * sizeof(reg_trie_trans) ); 1655 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans ); 1656 } 1657 base = trie->uniquecharcount + tp - minid; 1658 if ( maxid == minid ) { 1659 U32 set = 0; 1660 for ( ; zp < tp ; zp++ ) { 1661 if ( ! trie->trans[ zp ].next ) { 1662 base = trie->uniquecharcount + zp - minid; 1663 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate; 1664 trie->trans[ zp ].check = state; 1665 set = 1; 1666 break; 1667 } 1668 } 1669 if ( !set ) { 1670 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate; 1671 trie->trans[ tp ].check = state; 1672 tp++; 1673 zp = tp; 1674 } 1675 } else { 1676 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) { 1677 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid; 1678 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate; 1679 trie->trans[ tid ].check = state; 1680 } 1681 tp += ( maxid - minid + 1 ); 1682 } 1683 Safefree(trie->states[ state ].trans.list); 1684 } 1685 /* 1686 DEBUG_TRIE_COMPILE_MORE_r( 1687 PerlIO_printf( Perl_debug_log, " base: %d\n",base); 1688 ); 1689 */ 1690 trie->states[ state ].trans.base=base; 1691 } 1692 trie->lasttrans = tp + 1; 1693 } 1694 } else { 1695 /* 1696 Second Pass -- Flat Table Representation. 1697 1698 we dont use the 0 slot of either trans[] or states[] so we add 1 to each. 1699 We know that we will need Charcount+1 trans at most to store the data 1700 (one row per char at worst case) So we preallocate both structures 1701 assuming worst case. 1702 1703 We then construct the trie using only the .next slots of the entry 1704 structs. 1705 1706 We use the .check field of the first entry of the node temporarily to 1707 make compression both faster and easier by keeping track of how many non 1708 zero fields are in the node. 1709 1710 Since trans are numbered from 1 any 0 pointer in the table is a FAIL 1711 transition. 1712 1713 There are two terms at use here: state as a TRIE_NODEIDX() which is a 1714 number representing the first entry of the node, and state as a 1715 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and 1716 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there 1717 are 2 entrys per node. eg: 1718 1719 A B A B 1720 1. 2 4 1. 3 7 1721 2. 0 3 3. 0 5 1722 3. 0 0 5. 0 0 1723 4. 0 0 7. 0 0 1724 1725 The table is internally in the right hand, idx form. However as we also 1726 have to deal with the states array which is indexed by nodenum we have to 1727 use TRIE_NODENUM() to convert. 1728 1729 */ 1730 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log, 1731 "%*sCompiling trie using table compiler\n", 1732 (int)depth * 2 + 2, "")); 1733 1734 trie->trans = (reg_trie_trans *) 1735 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 ) 1736 * trie->uniquecharcount + 1, 1737 sizeof(reg_trie_trans) ); 1738 trie->states = (reg_trie_state *) 1739 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2, 1740 sizeof(reg_trie_state) ); 1741 next_alloc = trie->uniquecharcount + 1; 1742 1743 1744 for ( cur = first ; cur < last ; cur = regnext( cur ) ) { 1745 1746 regnode * const noper = NEXTOPER( cur ); 1747 const U8 *uc = (U8*)STRING( noper ); 1748 const U8 * const e = uc + STR_LEN( noper ); 1749 1750 U32 state = 1; /* required init */ 1751 1752 U16 charid = 0; /* sanity init */ 1753 U32 accept_state = 0; /* sanity init */ 1754 U8 *scan = (U8*)NULL; /* sanity init */ 1755 1756 STRLEN foldlen = 0; /* required init */ 1757 U32 wordlen = 0; /* required init */ 1758 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; 1759 1760 if ( OP(noper) != NOTHING ) { 1761 for ( ; uc < e ; uc += len ) { 1762 1763 TRIE_READ_CHAR; 1764 1765 if ( uvc < 256 ) { 1766 charid = trie->charmap[ uvc ]; 1767 } else { 1768 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0); 1769 charid = svpp ? (U16)SvIV(*svpp) : 0; 1770 } 1771 if ( charid ) { 1772 charid--; 1773 if ( !trie->trans[ state + charid ].next ) { 1774 trie->trans[ state + charid ].next = next_alloc; 1775 trie->trans[ state ].check++; 1776 next_alloc += trie->uniquecharcount; 1777 } 1778 state = trie->trans[ state + charid ].next; 1779 } else { 1780 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc ); 1781 } 1782 /* charid is now 0 if we dont know the char read, or nonzero if we do */ 1783 } 1784 } 1785 accept_state = TRIE_NODENUM( state ); 1786 TRIE_HANDLE_WORD(accept_state); 1787 1788 } /* end second pass */ 1789 1790 /* and now dump it out before we compress it */ 1791 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap, 1792 revcharmap, 1793 next_alloc, depth+1)); 1794 1795 { 1796 /* 1797 * Inplace compress the table.* 1798 1799 For sparse data sets the table constructed by the trie algorithm will 1800 be mostly 0/FAIL transitions or to put it another way mostly empty. 1801 (Note that leaf nodes will not contain any transitions.) 1802 1803 This algorithm compresses the tables by eliminating most such 1804 transitions, at the cost of a modest bit of extra work during lookup: 1805 1806 - Each states[] entry contains a .base field which indicates the 1807 index in the state[] array wheres its transition data is stored. 1808 1809 - If .base is 0 there are no valid transitions from that node. 1810 1811 - If .base is nonzero then charid is added to it to find an entry in 1812 the trans array. 1813 1814 -If trans[states[state].base+charid].check!=state then the 1815 transition is taken to be a 0/Fail transition. Thus if there are fail 1816 transitions at the front of the node then the .base offset will point 1817 somewhere inside the previous nodes data (or maybe even into a node 1818 even earlier), but the .check field determines if the transition is 1819 valid. 1820 1821 XXX - wrong maybe? 1822 The following process inplace converts the table to the compressed 1823 table: We first do not compress the root node 1,and mark its all its 1824 .check pointers as 1 and set its .base pointer as 1 as well. This 1825 allows to do a DFA construction from the compressed table later, and 1826 ensures that any .base pointers we calculate later are greater than 1827 0. 1828 1829 - We set 'pos' to indicate the first entry of the second node. 1830 1831 - We then iterate over the columns of the node, finding the first and 1832 last used entry at l and m. We then copy l..m into pos..(pos+m-l), 1833 and set the .check pointers accordingly, and advance pos 1834 appropriately and repreat for the next node. Note that when we copy 1835 the next pointers we have to convert them from the original 1836 NODEIDX form to NODENUM form as the former is not valid post 1837 compression. 1838 1839 - If a node has no transitions used we mark its base as 0 and do not 1840 advance the pos pointer. 1841 1842 - If a node only has one transition we use a second pointer into the 1843 structure to fill in allocated fail transitions from other states. 1844 This pointer is independent of the main pointer and scans forward 1845 looking for null transitions that are allocated to a state. When it 1846 finds one it writes the single transition into the "hole". If the 1847 pointer doesnt find one the single transition is appended as normal. 1848 1849 - Once compressed we can Renew/realloc the structures to release the 1850 excess space. 1851 1852 See "Table-Compression Methods" in sec 3.9 of the Red Dragon, 1853 specifically Fig 3.47 and the associated pseudocode. 1854 1855 demq 1856 */ 1857 const U32 laststate = TRIE_NODENUM( next_alloc ); 1858 U32 state, charid; 1859 U32 pos = 0, zp=0; 1860 trie->statecount = laststate; 1861 1862 for ( state = 1 ; state < laststate ; state++ ) { 1863 U8 flag = 0; 1864 const U32 stateidx = TRIE_NODEIDX( state ); 1865 const U32 o_used = trie->trans[ stateidx ].check; 1866 U32 used = trie->trans[ stateidx ].check; 1867 trie->trans[ stateidx ].check = 0; 1868 1869 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) { 1870 if ( flag || trie->trans[ stateidx + charid ].next ) { 1871 if ( trie->trans[ stateidx + charid ].next ) { 1872 if (o_used == 1) { 1873 for ( ; zp < pos ; zp++ ) { 1874 if ( ! trie->trans[ zp ].next ) { 1875 break; 1876 } 1877 } 1878 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ; 1879 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next ); 1880 trie->trans[ zp ].check = state; 1881 if ( ++zp > pos ) pos = zp; 1882 break; 1883 } 1884 used--; 1885 } 1886 if ( !flag ) { 1887 flag = 1; 1888 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ; 1889 } 1890 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next ); 1891 trie->trans[ pos ].check = state; 1892 pos++; 1893 } 1894 } 1895 } 1896 trie->lasttrans = pos + 1; 1897 trie->states = (reg_trie_state *) 1898 PerlMemShared_realloc( trie->states, laststate 1899 * sizeof(reg_trie_state) ); 1900 DEBUG_TRIE_COMPILE_MORE_r( 1901 PerlIO_printf( Perl_debug_log, 1902 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n", 1903 (int)depth * 2 + 2,"", 1904 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ), 1905 (IV)next_alloc, 1906 (IV)pos, 1907 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc ); 1908 ); 1909 1910 } /* end table compress */ 1911 } 1912 DEBUG_TRIE_COMPILE_MORE_r( 1913 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n", 1914 (int)depth * 2 + 2, "", 1915 (UV)trie->statecount, 1916 (UV)trie->lasttrans) 1917 ); 1918 /* resize the trans array to remove unused space */ 1919 trie->trans = (reg_trie_trans *) 1920 PerlMemShared_realloc( trie->trans, trie->lasttrans 1921 * sizeof(reg_trie_trans) ); 1922 1923 /* and now dump out the compressed format */ 1924 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1)); 1925 1926 { /* Modify the program and insert the new TRIE node*/ 1927 U8 nodetype =(U8)(flags & 0xFF); 1928 char *str=NULL; 1929 1930 #ifdef DEBUGGING 1931 regnode *optimize = NULL; 1932 #ifdef RE_TRACK_PATTERN_OFFSETS 1933 1934 U32 mjd_offset = 0; 1935 U32 mjd_nodelen = 0; 1936 #endif /* RE_TRACK_PATTERN_OFFSETS */ 1937 #endif /* DEBUGGING */ 1938 /* 1939 This means we convert either the first branch or the first Exact, 1940 depending on whether the thing following (in 'last') is a branch 1941 or not and whther first is the startbranch (ie is it a sub part of 1942 the alternation or is it the whole thing.) 1943 Assuming its a sub part we conver the EXACT otherwise we convert 1944 the whole branch sequence, including the first. 1945 */ 1946 /* Find the node we are going to overwrite */ 1947 if ( first != startbranch || OP( last ) == BRANCH ) { 1948 /* branch sub-chain */ 1949 NEXT_OFF( first ) = (U16)(last - first); 1950 #ifdef RE_TRACK_PATTERN_OFFSETS 1951 DEBUG_r({ 1952 mjd_offset= Node_Offset((convert)); 1953 mjd_nodelen= Node_Length((convert)); 1954 }); 1955 #endif 1956 /* whole branch chain */ 1957 } 1958 #ifdef RE_TRACK_PATTERN_OFFSETS 1959 else { 1960 DEBUG_r({ 1961 const regnode *nop = NEXTOPER( convert ); 1962 mjd_offset= Node_Offset((nop)); 1963 mjd_nodelen= Node_Length((nop)); 1964 }); 1965 } 1966 DEBUG_OPTIMISE_r( 1967 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n", 1968 (int)depth * 2 + 2, "", 1969 (UV)mjd_offset, (UV)mjd_nodelen) 1970 ); 1971 #endif 1972 /* But first we check to see if there is a common prefix we can 1973 split out as an EXACT and put in front of the TRIE node. */ 1974 trie->startstate= 1; 1975 if ( trie->bitmap && !widecharmap && !trie->jump ) { 1976 U32 state; 1977 for ( state = 1 ; state < trie->statecount-1 ; state++ ) { 1978 U32 ofs = 0; 1979 I32 idx = -1; 1980 U32 count = 0; 1981 const U32 base = trie->states[ state ].trans.base; 1982 1983 if ( trie->states[state].wordnum ) 1984 count = 1; 1985 1986 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) { 1987 if ( ( base + ofs >= trie->uniquecharcount ) && 1988 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) && 1989 trie->trans[ base + ofs - trie->uniquecharcount ].check == state ) 1990 { 1991 if ( ++count > 1 ) { 1992 SV **tmp = av_fetch( revcharmap, ofs, 0); 1993 const U8 *ch = (U8*)SvPV_nolen_const( *tmp ); 1994 if ( state == 1 ) break; 1995 if ( count == 2 ) { 1996 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char); 1997 DEBUG_OPTIMISE_r( 1998 PerlIO_printf(Perl_debug_log, 1999 "%*sNew Start State=%"UVuf" Class: [", 2000 (int)depth * 2 + 2, "", 2001 (UV)state)); 2002 if (idx >= 0) { 2003 SV ** const tmp = av_fetch( revcharmap, idx, 0); 2004 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp ); 2005 2006 TRIE_BITMAP_SET(trie,*ch); 2007 if ( folder ) 2008 TRIE_BITMAP_SET(trie, folder[ *ch ]); 2009 DEBUG_OPTIMISE_r( 2010 PerlIO_printf(Perl_debug_log, "%s", (char*)ch) 2011 ); 2012 } 2013 } 2014 TRIE_BITMAP_SET(trie,*ch); 2015 if ( folder ) 2016 TRIE_BITMAP_SET(trie,folder[ *ch ]); 2017 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch)); 2018 } 2019 idx = ofs; 2020 } 2021 } 2022 if ( count == 1 ) { 2023 SV **tmp = av_fetch( revcharmap, idx, 0); 2024 STRLEN len; 2025 char *ch = SvPV( *tmp, len ); 2026 DEBUG_OPTIMISE_r({ 2027 SV *sv=sv_newmortal(); 2028 PerlIO_printf( Perl_debug_log, 2029 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n", 2030 (int)depth * 2 + 2, "", 2031 (UV)state, (UV)idx, 2032 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6, 2033 PL_colors[0], PL_colors[1], 2034 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | 2035 PERL_PV_ESCAPE_FIRSTCHAR 2036 ) 2037 ); 2038 }); 2039 if ( state==1 ) { 2040 OP( convert ) = nodetype; 2041 str=STRING(convert); 2042 STR_LEN(convert)=0; 2043 } 2044 STR_LEN(convert) += len; 2045 while (len--) 2046 *str++ = *ch++; 2047 } else { 2048 #ifdef DEBUGGING 2049 if (state>1) 2050 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n")); 2051 #endif 2052 break; 2053 } 2054 } 2055 if (str) { 2056 regnode *n = convert+NODE_SZ_STR(convert); 2057 NEXT_OFF(convert) = NODE_SZ_STR(convert); 2058 trie->startstate = state; 2059 trie->minlen -= (state - 1); 2060 trie->maxlen -= (state - 1); 2061 #ifdef DEBUGGING 2062 /* At least the UNICOS C compiler choked on this 2063 * being argument to DEBUG_r(), so let's just have 2064 * it right here. */ 2065 if ( 2066 #ifdef PERL_EXT_RE_BUILD 2067 1 2068 #else 2069 DEBUG_r_TEST 2070 #endif 2071 ) { 2072 regnode *fix = convert; 2073 U32 word = trie->wordcount; 2074 mjd_nodelen++; 2075 Set_Node_Offset_Length(convert, mjd_offset, state - 1); 2076 while( ++fix < n ) { 2077 Set_Node_Offset_Length(fix, 0, 0); 2078 } 2079 while (word--) { 2080 SV ** const tmp = av_fetch( trie_words, word, 0 ); 2081 if (tmp) { 2082 if ( STR_LEN(convert) <= SvCUR(*tmp) ) 2083 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert)); 2084 else 2085 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp)); 2086 } 2087 } 2088 } 2089 #endif 2090 if (trie->maxlen) { 2091 convert = n; 2092 } else { 2093 NEXT_OFF(convert) = (U16)(tail - convert); 2094 DEBUG_r(optimize= n); 2095 } 2096 } 2097 } 2098 if (!jumper) 2099 jumper = last; 2100 if ( trie->maxlen ) { 2101 NEXT_OFF( convert ) = (U16)(tail - convert); 2102 ARG_SET( convert, data_slot ); 2103 /* Store the offset to the first unabsorbed branch in 2104 jump[0], which is otherwise unused by the jump logic. 2105 We use this when dumping a trie and during optimisation. */ 2106 if (trie->jump) 2107 trie->jump[0] = (U16)(nextbranch - convert); 2108 2109 /* XXXX */ 2110 if ( !trie->states[trie->startstate].wordnum && trie->bitmap && 2111 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) ) 2112 { 2113 OP( convert ) = TRIEC; 2114 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char); 2115 PerlMemShared_free(trie->bitmap); 2116 trie->bitmap= NULL; 2117 } else 2118 OP( convert ) = TRIE; 2119 2120 /* store the type in the flags */ 2121 convert->flags = nodetype; 2122 DEBUG_r({ 2123 optimize = convert 2124 + NODE_STEP_REGNODE 2125 + regarglen[ OP( convert ) ]; 2126 }); 2127 /* XXX We really should free up the resource in trie now, 2128 as we won't use them - (which resources?) dmq */ 2129 } 2130 /* needed for dumping*/ 2131 DEBUG_r(if (optimize) { 2132 regnode *opt = convert; 2133 2134 while ( ++opt < optimize) { 2135 Set_Node_Offset_Length(opt,0,0); 2136 } 2137 /* 2138 Try to clean up some of the debris left after the 2139 optimisation. 2140 */ 2141 while( optimize < jumper ) { 2142 mjd_nodelen += Node_Length((optimize)); 2143 OP( optimize ) = OPTIMIZED; 2144 Set_Node_Offset_Length(optimize,0,0); 2145 optimize++; 2146 } 2147 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen); 2148 }); 2149 } /* end node insert */ 2150 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap; 2151 #ifdef DEBUGGING 2152 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words; 2153 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap; 2154 #else 2155 SvREFCNT_dec(revcharmap); 2156 #endif 2157 return trie->jump 2158 ? MADE_JUMP_TRIE 2159 : trie->startstate>1 2160 ? MADE_EXACT_TRIE 2161 : MADE_TRIE; 2162 } 2163 2164 STATIC void 2165 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth) 2166 { 2167 /* The Trie is constructed and compressed now so we can build a fail array now if its needed 2168 2169 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the 2170 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88 2171 ISBN 0-201-10088-6 2172 2173 We find the fail state for each state in the trie, this state is the longest proper 2174 suffix of the current states 'word' that is also a proper prefix of another word in our 2175 trie. State 1 represents the word '' and is the thus the default fail state. This allows 2176 the DFA not to have to restart after its tried and failed a word at a given point, it 2177 simply continues as though it had been matching the other word in the first place. 2178 Consider 2179 'abcdgu'=~/abcdefg|cdgu/ 2180 When we get to 'd' we are still matching the first word, we would encounter 'g' which would 2181 fail, which would bring use to the state representing 'd' in the second word where we would 2182 try 'g' and succeed, prodceding to match 'cdgu'. 2183 */ 2184 /* add a fail transition */ 2185 const U32 trie_offset = ARG(source); 2186 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset]; 2187 U32 *q; 2188 const U32 ucharcount = trie->uniquecharcount; 2189 const U32 numstates = trie->statecount; 2190 const U32 ubound = trie->lasttrans + ucharcount; 2191 U32 q_read = 0; 2192 U32 q_write = 0; 2193 U32 charid; 2194 U32 base = trie->states[ 1 ].trans.base; 2195 U32 *fail; 2196 reg_ac_data *aho; 2197 const U32 data_slot = add_data( pRExC_state, 1, "T" ); 2198 GET_RE_DEBUG_FLAGS_DECL; 2199 2200 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE; 2201 #ifndef DEBUGGING 2202 PERL_UNUSED_ARG(depth); 2203 #endif 2204 2205 2206 ARG_SET( stclass, data_slot ); 2207 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) ); 2208 RExC_rxi->data->data[ data_slot ] = (void*)aho; 2209 aho->trie=trie_offset; 2210 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) ); 2211 Copy( trie->states, aho->states, numstates, reg_trie_state ); 2212 Newxz( q, numstates, U32); 2213 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) ); 2214 aho->refcount = 1; 2215 fail = aho->fail; 2216 /* initialize fail[0..1] to be 1 so that we always have 2217 a valid final fail state */ 2218 fail[ 0 ] = fail[ 1 ] = 1; 2219 2220 for ( charid = 0; charid < ucharcount ; charid++ ) { 2221 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 ); 2222 if ( newstate ) { 2223 q[ q_write ] = newstate; 2224 /* set to point at the root */ 2225 fail[ q[ q_write++ ] ]=1; 2226 } 2227 } 2228 while ( q_read < q_write) { 2229 const U32 cur = q[ q_read++ % numstates ]; 2230 base = trie->states[ cur ].trans.base; 2231 2232 for ( charid = 0 ; charid < ucharcount ; charid++ ) { 2233 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 ); 2234 if (ch_state) { 2235 U32 fail_state = cur; 2236 U32 fail_base; 2237 do { 2238 fail_state = fail[ fail_state ]; 2239 fail_base = aho->states[ fail_state ].trans.base; 2240 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) ); 2241 2242 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ); 2243 fail[ ch_state ] = fail_state; 2244 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum ) 2245 { 2246 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum; 2247 } 2248 q[ q_write++ % numstates] = ch_state; 2249 } 2250 } 2251 } 2252 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop 2253 when we fail in state 1, this allows us to use the 2254 charclass scan to find a valid start char. This is based on the principle 2255 that theres a good chance the string being searched contains lots of stuff 2256 that cant be a start char. 2257 */ 2258 fail[ 0 ] = fail[ 1 ] = 0; 2259 DEBUG_TRIE_COMPILE_r({ 2260 PerlIO_printf(Perl_debug_log, 2261 "%*sStclass Failtable (%"UVuf" states): 0", 2262 (int)(depth * 2), "", (UV)numstates 2263 ); 2264 for( q_read=1; q_read<numstates; q_read++ ) { 2265 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]); 2266 } 2267 PerlIO_printf(Perl_debug_log, "\n"); 2268 }); 2269 Safefree(q); 2270 /*RExC_seen |= REG_SEEN_TRIEDFA;*/ 2271 } 2272 2273 2274 /* 2275 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2. 2276 * These need to be revisited when a newer toolchain becomes available. 2277 */ 2278 #if defined(__sparc64__) && defined(__GNUC__) 2279 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96) 2280 # undef SPARC64_GCC_WORKAROUND 2281 # define SPARC64_GCC_WORKAROUND 1 2282 # endif 2283 #endif 2284 2285 #define DEBUG_PEEP(str,scan,depth) \ 2286 DEBUG_OPTIMISE_r({if (scan){ \ 2287 SV * const mysv=sv_newmortal(); \ 2288 regnode *Next = regnext(scan); \ 2289 regprop(RExC_rx, mysv, scan); \ 2290 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \ 2291 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\ 2292 Next ? (REG_NODE_NUM(Next)) : 0 ); \ 2293 }}); 2294 2295 2296 2297 2298 2299 #define JOIN_EXACT(scan,min,flags) \ 2300 if (PL_regkind[OP(scan)] == EXACT) \ 2301 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1) 2302 2303 STATIC U32 2304 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) { 2305 /* Merge several consecutive EXACTish nodes into one. */ 2306 regnode *n = regnext(scan); 2307 U32 stringok = 1; 2308 regnode *next = scan + NODE_SZ_STR(scan); 2309 U32 merged = 0; 2310 U32 stopnow = 0; 2311 #ifdef DEBUGGING 2312 regnode *stop = scan; 2313 GET_RE_DEBUG_FLAGS_DECL; 2314 #else 2315 PERL_UNUSED_ARG(depth); 2316 #endif 2317 2318 PERL_ARGS_ASSERT_JOIN_EXACT; 2319 #ifndef EXPERIMENTAL_INPLACESCAN 2320 PERL_UNUSED_ARG(flags); 2321 PERL_UNUSED_ARG(val); 2322 #endif 2323 DEBUG_PEEP("join",scan,depth); 2324 2325 /* Skip NOTHING, merge EXACT*. */ 2326 while (n && 2327 ( PL_regkind[OP(n)] == NOTHING || 2328 (stringok && (OP(n) == OP(scan)))) 2329 && NEXT_OFF(n) 2330 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) { 2331 2332 if (OP(n) == TAIL || n > next) 2333 stringok = 0; 2334 if (PL_regkind[OP(n)] == NOTHING) { 2335 DEBUG_PEEP("skip:",n,depth); 2336 NEXT_OFF(scan) += NEXT_OFF(n); 2337 next = n + NODE_STEP_REGNODE; 2338 #ifdef DEBUGGING 2339 if (stringok) 2340 stop = n; 2341 #endif 2342 n = regnext(n); 2343 } 2344 else if (stringok) { 2345 const unsigned int oldl = STR_LEN(scan); 2346 regnode * const nnext = regnext(n); 2347 2348 DEBUG_PEEP("merg",n,depth); 2349 2350 merged++; 2351 if (oldl + STR_LEN(n) > U8_MAX) 2352 break; 2353 NEXT_OFF(scan) += NEXT_OFF(n); 2354 STR_LEN(scan) += STR_LEN(n); 2355 next = n + NODE_SZ_STR(n); 2356 /* Now we can overwrite *n : */ 2357 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char); 2358 #ifdef DEBUGGING 2359 stop = next - 1; 2360 #endif 2361 n = nnext; 2362 if (stopnow) break; 2363 } 2364 2365 #ifdef EXPERIMENTAL_INPLACESCAN 2366 if (flags && !NEXT_OFF(n)) { 2367 DEBUG_PEEP("atch", val, depth); 2368 if (reg_off_by_arg[OP(n)]) { 2369 ARG_SET(n, val - n); 2370 } 2371 else { 2372 NEXT_OFF(n) = val - n; 2373 } 2374 stopnow = 1; 2375 } 2376 #endif 2377 } 2378 2379 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) { 2380 /* 2381 Two problematic code points in Unicode casefolding of EXACT nodes: 2382 2383 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS 2384 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS 2385 2386 which casefold to 2387 2388 Unicode UTF-8 2389 2390 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81 2391 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81 2392 2393 This means that in case-insensitive matching (or "loose matching", 2394 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte 2395 length of the above casefolded versions) can match a target string 2396 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0). 2397 This would rather mess up the minimum length computation. 2398 2399 What we'll do is to look for the tail four bytes, and then peek 2400 at the preceding two bytes to see whether we need to decrease 2401 the minimum length by four (six minus two). 2402 2403 Thanks to the design of UTF-8, there cannot be false matches: 2404 A sequence of valid UTF-8 bytes cannot be a subsequence of 2405 another valid sequence of UTF-8 bytes. 2406 2407 */ 2408 char * const s0 = STRING(scan), *s, *t; 2409 char * const s1 = s0 + STR_LEN(scan) - 1; 2410 char * const s2 = s1 - 4; 2411 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */ 2412 const char t0[] = "\xaf\x49\xaf\x42"; 2413 #else 2414 const char t0[] = "\xcc\x88\xcc\x81"; 2415 #endif 2416 const char * const t1 = t0 + 3; 2417 2418 for (s = s0 + 2; 2419 s < s2 && (t = ninstr(s, s1, t0, t1)); 2420 s = t + 4) { 2421 #ifdef EBCDIC 2422 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) || 2423 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5)) 2424 #else 2425 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) || 2426 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF)) 2427 #endif 2428 *min -= 4; 2429 } 2430 } 2431 2432 #ifdef DEBUGGING 2433 /* Allow dumping */ 2434 n = scan + NODE_SZ_STR(scan); 2435 while (n <= stop) { 2436 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) { 2437 OP(n) = OPTIMIZED; 2438 NEXT_OFF(n) = 0; 2439 } 2440 n++; 2441 } 2442 #endif 2443 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)}); 2444 return stopnow; 2445 } 2446 2447 /* REx optimizer. Converts nodes into quickier variants "in place". 2448 Finds fixed substrings. */ 2449 2450 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set 2451 to the position after last scanned or to NULL. */ 2452 2453 #define INIT_AND_WITHP \ 2454 assert(!and_withp); \ 2455 Newx(and_withp,1,struct regnode_charclass_class); \ 2456 SAVEFREEPV(and_withp) 2457 2458 /* this is a chain of data about sub patterns we are processing that 2459 need to be handled seperately/specially in study_chunk. Its so 2460 we can simulate recursion without losing state. */ 2461 struct scan_frame; 2462 typedef struct scan_frame { 2463 regnode *last; /* last node to process in this frame */ 2464 regnode *next; /* next node to process when last is reached */ 2465 struct scan_frame *prev; /*previous frame*/ 2466 I32 stop; /* what stopparen do we use */ 2467 } scan_frame; 2468 2469 2470 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf) 2471 2472 #define CASE_SYNST_FNC(nAmE) \ 2473 case nAmE: \ 2474 if (flags & SCF_DO_STCLASS_AND) { \ 2475 for (value = 0; value < 256; value++) \ 2476 if (!is_ ## nAmE ## _cp(value)) \ 2477 ANYOF_BITMAP_CLEAR(data->start_class, value); \ 2478 } \ 2479 else { \ 2480 for (value = 0; value < 256; value++) \ 2481 if (is_ ## nAmE ## _cp(value)) \ 2482 ANYOF_BITMAP_SET(data->start_class, value); \ 2483 } \ 2484 break; \ 2485 case N ## nAmE: \ 2486 if (flags & SCF_DO_STCLASS_AND) { \ 2487 for (value = 0; value < 256; value++) \ 2488 if (is_ ## nAmE ## _cp(value)) \ 2489 ANYOF_BITMAP_CLEAR(data->start_class, value); \ 2490 } \ 2491 else { \ 2492 for (value = 0; value < 256; value++) \ 2493 if (!is_ ## nAmE ## _cp(value)) \ 2494 ANYOF_BITMAP_SET(data->start_class, value); \ 2495 } \ 2496 break 2497 2498 2499 2500 STATIC I32 2501 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, 2502 I32 *minlenp, I32 *deltap, 2503 regnode *last, 2504 scan_data_t *data, 2505 I32 stopparen, 2506 U8* recursed, 2507 struct regnode_charclass_class *and_withp, 2508 U32 flags, U32 depth) 2509 /* scanp: Start here (read-write). */ 2510 /* deltap: Write maxlen-minlen here. */ 2511 /* last: Stop before this one. */ 2512 /* data: string data about the pattern */ 2513 /* stopparen: treat close N as END */ 2514 /* recursed: which subroutines have we recursed into */ 2515 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */ 2516 { 2517 dVAR; 2518 I32 min = 0, pars = 0, code; 2519 regnode *scan = *scanp, *next; 2520 I32 delta = 0; 2521 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF); 2522 int is_inf_internal = 0; /* The studied chunk is infinite */ 2523 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0; 2524 scan_data_t data_fake; 2525 SV *re_trie_maxbuff = NULL; 2526 regnode *first_non_open = scan; 2527 I32 stopmin = I32_MAX; 2528 scan_frame *frame = NULL; 2529 GET_RE_DEBUG_FLAGS_DECL; 2530 2531 PERL_ARGS_ASSERT_STUDY_CHUNK; 2532 2533 #ifdef DEBUGGING 2534 StructCopy(&zero_scan_data, &data_fake, scan_data_t); 2535 #endif 2536 2537 if ( depth == 0 ) { 2538 while (first_non_open && OP(first_non_open) == OPEN) 2539 first_non_open=regnext(first_non_open); 2540 } 2541 2542 2543 fake_study_recurse: 2544 while ( scan && OP(scan) != END && scan < last ){ 2545 /* Peephole optimizer: */ 2546 DEBUG_STUDYDATA("Peep:", data,depth); 2547 DEBUG_PEEP("Peep",scan,depth); 2548 JOIN_EXACT(scan,&min,0); 2549 2550 /* Follow the next-chain of the current node and optimize 2551 away all the NOTHINGs from it. */ 2552 if (OP(scan) != CURLYX) { 2553 const int max = (reg_off_by_arg[OP(scan)] 2554 ? I32_MAX 2555 /* I32 may be smaller than U16 on CRAYs! */ 2556 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX)); 2557 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan)); 2558 int noff; 2559 regnode *n = scan; 2560 2561 /* Skip NOTHING and LONGJMP. */ 2562 while ((n = regnext(n)) 2563 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n))) 2564 || ((OP(n) == LONGJMP) && (noff = ARG(n)))) 2565 && off + noff < max) 2566 off += noff; 2567 if (reg_off_by_arg[OP(scan)]) 2568 ARG(scan) = off; 2569 else 2570 NEXT_OFF(scan) = off; 2571 } 2572 2573 2574 2575 /* The principal pseudo-switch. Cannot be a switch, since we 2576 look into several different things. */ 2577 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ 2578 || OP(scan) == IFTHEN) { 2579 next = regnext(scan); 2580 code = OP(scan); 2581 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */ 2582 2583 if (OP(next) == code || code == IFTHEN) { 2584 /* NOTE - There is similar code to this block below for handling 2585 TRIE nodes on a re-study. If you change stuff here check there 2586 too. */ 2587 I32 max1 = 0, min1 = I32_MAX, num = 0; 2588 struct regnode_charclass_class accum; 2589 regnode * const startbranch=scan; 2590 2591 if (flags & SCF_DO_SUBSTR) 2592 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */ 2593 if (flags & SCF_DO_STCLASS) 2594 cl_init_zero(pRExC_state, &accum); 2595 2596 while (OP(scan) == code) { 2597 I32 deltanext, minnext, f = 0, fake; 2598 struct regnode_charclass_class this_class; 2599 2600 num++; 2601 data_fake.flags = 0; 2602 if (data) { 2603 data_fake.whilem_c = data->whilem_c; 2604 data_fake.last_closep = data->last_closep; 2605 } 2606 else 2607 data_fake.last_closep = &fake; 2608 2609 data_fake.pos_delta = delta; 2610 next = regnext(scan); 2611 scan = NEXTOPER(scan); 2612 if (code != BRANCH) 2613 scan = NEXTOPER(scan); 2614 if (flags & SCF_DO_STCLASS) { 2615 cl_init(pRExC_state, &this_class); 2616 data_fake.start_class = &this_class; 2617 f = SCF_DO_STCLASS_AND; 2618 } 2619 if (flags & SCF_WHILEM_VISITED_POS) 2620 f |= SCF_WHILEM_VISITED_POS; 2621 2622 /* we suppose the run is continuous, last=next...*/ 2623 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, 2624 next, &data_fake, 2625 stopparen, recursed, NULL, f,depth+1); 2626 if (min1 > minnext) 2627 min1 = minnext; 2628 if (max1 < minnext + deltanext) 2629 max1 = minnext + deltanext; 2630 if (deltanext == I32_MAX) 2631 is_inf = is_inf_internal = 1; 2632 scan = next; 2633 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) 2634 pars++; 2635 if (data_fake.flags & SCF_SEEN_ACCEPT) { 2636 if ( stopmin > minnext) 2637 stopmin = min + min1; 2638 flags &= ~SCF_DO_SUBSTR; 2639 if (data) 2640 data->flags |= SCF_SEEN_ACCEPT; 2641 } 2642 if (data) { 2643 if (data_fake.flags & SF_HAS_EVAL) 2644 data->flags |= SF_HAS_EVAL; 2645 data->whilem_c = data_fake.whilem_c; 2646 } 2647 if (flags & SCF_DO_STCLASS) 2648 cl_or(pRExC_state, &accum, &this_class); 2649 } 2650 if (code == IFTHEN && num < 2) /* Empty ELSE branch */ 2651 min1 = 0; 2652 if (flags & SCF_DO_SUBSTR) { 2653 data->pos_min += min1; 2654 data->pos_delta += max1 - min1; 2655 if (max1 != min1 || is_inf) 2656 data->longest = &(data->longest_float); 2657 } 2658 min += min1; 2659 delta += max1 - min1; 2660 if (flags & SCF_DO_STCLASS_OR) { 2661 cl_or(pRExC_state, data->start_class, &accum); 2662 if (min1) { 2663 cl_and(data->start_class, and_withp); 2664 flags &= ~SCF_DO_STCLASS; 2665 } 2666 } 2667 else if (flags & SCF_DO_STCLASS_AND) { 2668 if (min1) { 2669 cl_and(data->start_class, &accum); 2670 flags &= ~SCF_DO_STCLASS; 2671 } 2672 else { 2673 /* Switch to OR mode: cache the old value of 2674 * data->start_class */ 2675 INIT_AND_WITHP; 2676 StructCopy(data->start_class, and_withp, 2677 struct regnode_charclass_class); 2678 flags &= ~SCF_DO_STCLASS_AND; 2679 StructCopy(&accum, data->start_class, 2680 struct regnode_charclass_class); 2681 flags |= SCF_DO_STCLASS_OR; 2682 data->start_class->flags |= ANYOF_EOS; 2683 } 2684 } 2685 2686 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) { 2687 /* demq. 2688 2689 Assuming this was/is a branch we are dealing with: 'scan' now 2690 points at the item that follows the branch sequence, whatever 2691 it is. We now start at the beginning of the sequence and look 2692 for subsequences of 2693 2694 BRANCH->EXACT=>x1 2695 BRANCH->EXACT=>x2 2696 tail 2697 2698 which would be constructed from a pattern like /A|LIST|OF|WORDS/ 2699 2700 If we can find such a subseqence we need to turn the first 2701 element into a trie and then add the subsequent branch exact 2702 strings to the trie. 2703 2704 We have two cases 2705 2706 1. patterns where the whole set of branch can be converted. 2707 2708 2. patterns where only a subset can be converted. 2709 2710 In case 1 we can replace the whole set with a single regop 2711 for the trie. In case 2 we need to keep the start and end 2712 branchs so 2713 2714 'BRANCH EXACT; BRANCH EXACT; BRANCH X' 2715 becomes BRANCH TRIE; BRANCH X; 2716 2717 There is an additional case, that being where there is a 2718 common prefix, which gets split out into an EXACT like node 2719 preceding the TRIE node. 2720 2721 If x(1..n)==tail then we can do a simple trie, if not we make 2722 a "jump" trie, such that when we match the appropriate word 2723 we "jump" to the appopriate tail node. Essentailly we turn 2724 a nested if into a case structure of sorts. 2725 2726 */ 2727 2728 int made=0; 2729 if (!re_trie_maxbuff) { 2730 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1); 2731 if (!SvIOK(re_trie_maxbuff)) 2732 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT); 2733 } 2734 if ( SvIV(re_trie_maxbuff)>=0 ) { 2735 regnode *cur; 2736 regnode *first = (regnode *)NULL; 2737 regnode *last = (regnode *)NULL; 2738 regnode *tail = scan; 2739 U8 optype = 0; 2740 U32 count=0; 2741 2742 #ifdef DEBUGGING 2743 SV * const mysv = sv_newmortal(); /* for dumping */ 2744 #endif 2745 /* var tail is used because there may be a TAIL 2746 regop in the way. Ie, the exacts will point to the 2747 thing following the TAIL, but the last branch will 2748 point at the TAIL. So we advance tail. If we 2749 have nested (?:) we may have to move through several 2750 tails. 2751 */ 2752 2753 while ( OP( tail ) == TAIL ) { 2754 /* this is the TAIL generated by (?:) */ 2755 tail = regnext( tail ); 2756 } 2757 2758 2759 DEBUG_OPTIMISE_r({ 2760 regprop(RExC_rx, mysv, tail ); 2761 PerlIO_printf( Perl_debug_log, "%*s%s%s\n", 2762 (int)depth * 2 + 2, "", 2763 "Looking for TRIE'able sequences. Tail node is: ", 2764 SvPV_nolen_const( mysv ) 2765 ); 2766 }); 2767 2768 /* 2769 2770 step through the branches, cur represents each 2771 branch, noper is the first thing to be matched 2772 as part of that branch and noper_next is the 2773 regnext() of that node. if noper is an EXACT 2774 and noper_next is the same as scan (our current 2775 position in the regex) then the EXACT branch is 2776 a possible optimization target. Once we have 2777 two or more consequetive such branches we can 2778 create a trie of the EXACT's contents and stich 2779 it in place. If the sequence represents all of 2780 the branches we eliminate the whole thing and 2781 replace it with a single TRIE. If it is a 2782 subsequence then we need to stitch it in. This 2783 means the first branch has to remain, and needs 2784 to be repointed at the item on the branch chain 2785 following the last branch optimized. This could 2786 be either a BRANCH, in which case the 2787 subsequence is internal, or it could be the 2788 item following the branch sequence in which 2789 case the subsequence is at the end. 2790 2791 */ 2792 2793 /* dont use tail as the end marker for this traverse */ 2794 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) { 2795 regnode * const noper = NEXTOPER( cur ); 2796 #if defined(DEBUGGING) || defined(NOJUMPTRIE) 2797 regnode * const noper_next = regnext( noper ); 2798 #endif 2799 2800 DEBUG_OPTIMISE_r({ 2801 regprop(RExC_rx, mysv, cur); 2802 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)", 2803 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) ); 2804 2805 regprop(RExC_rx, mysv, noper); 2806 PerlIO_printf( Perl_debug_log, " -> %s", 2807 SvPV_nolen_const(mysv)); 2808 2809 if ( noper_next ) { 2810 regprop(RExC_rx, mysv, noper_next ); 2811 PerlIO_printf( Perl_debug_log,"\t=> %s\t", 2812 SvPV_nolen_const(mysv)); 2813 } 2814 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n", 2815 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) ); 2816 }); 2817 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype 2818 : PL_regkind[ OP( noper ) ] == EXACT ) 2819 || OP(noper) == NOTHING ) 2820 #ifdef NOJUMPTRIE 2821 && noper_next == tail 2822 #endif 2823 && count < U16_MAX) 2824 { 2825 count++; 2826 if ( !first || optype == NOTHING ) { 2827 if (!first) first = cur; 2828 optype = OP( noper ); 2829 } else { 2830 last = cur; 2831 } 2832 } else { 2833 /* 2834 Currently we do not believe that the trie logic can 2835 handle case insensitive matching properly when the 2836 pattern is not unicode (thus forcing unicode semantics). 2837 2838 If/when this is fixed the following define can be swapped 2839 in below to fully enable trie logic. 2840 2841 #define TRIE_TYPE_IS_SAFE 1 2842 2843 */ 2844 #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT) 2845 2846 if ( last && TRIE_TYPE_IS_SAFE ) { 2847 make_trie( pRExC_state, 2848 startbranch, first, cur, tail, count, 2849 optype, depth+1 ); 2850 } 2851 if ( PL_regkind[ OP( noper ) ] == EXACT 2852 #ifdef NOJUMPTRIE 2853 && noper_next == tail 2854 #endif 2855 ){ 2856 count = 1; 2857 first = cur; 2858 optype = OP( noper ); 2859 } else { 2860 count = 0; 2861 first = NULL; 2862 optype = 0; 2863 } 2864 last = NULL; 2865 } 2866 } 2867 DEBUG_OPTIMISE_r({ 2868 regprop(RExC_rx, mysv, cur); 2869 PerlIO_printf( Perl_debug_log, 2870 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2, 2871 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur)); 2872 2873 }); 2874 2875 if ( last && TRIE_TYPE_IS_SAFE ) { 2876 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 ); 2877 #ifdef TRIE_STUDY_OPT 2878 if ( ((made == MADE_EXACT_TRIE && 2879 startbranch == first) 2880 || ( first_non_open == first )) && 2881 depth==0 ) { 2882 flags |= SCF_TRIE_RESTUDY; 2883 if ( startbranch == first 2884 && scan == tail ) 2885 { 2886 RExC_seen &=~REG_TOP_LEVEL_BRANCHES; 2887 } 2888 } 2889 #endif 2890 } 2891 } 2892 2893 } /* do trie */ 2894 2895 } 2896 else if ( code == BRANCHJ ) { /* single branch is optimized. */ 2897 scan = NEXTOPER(NEXTOPER(scan)); 2898 } else /* single branch is optimized. */ 2899 scan = NEXTOPER(scan); 2900 continue; 2901 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) { 2902 scan_frame *newframe = NULL; 2903 I32 paren; 2904 regnode *start; 2905 regnode *end; 2906 2907 if (OP(scan) != SUSPEND) { 2908 /* set the pointer */ 2909 if (OP(scan) == GOSUB) { 2910 paren = ARG(scan); 2911 RExC_recurse[ARG2L(scan)] = scan; 2912 start = RExC_open_parens[paren-1]; 2913 end = RExC_close_parens[paren-1]; 2914 } else { 2915 paren = 0; 2916 start = RExC_rxi->program + 1; 2917 end = RExC_opend; 2918 } 2919 if (!recursed) { 2920 Newxz(recursed, (((RExC_npar)>>3) +1), U8); 2921 SAVEFREEPV(recursed); 2922 } 2923 if (!PAREN_TEST(recursed,paren+1)) { 2924 PAREN_SET(recursed,paren+1); 2925 Newx(newframe,1,scan_frame); 2926 } else { 2927 if (flags & SCF_DO_SUBSTR) { 2928 SCAN_COMMIT(pRExC_state,data,minlenp); 2929 data->longest = &(data->longest_float); 2930 } 2931 is_inf = is_inf_internal = 1; 2932 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ 2933 cl_anything(pRExC_state, data->start_class); 2934 flags &= ~SCF_DO_STCLASS; 2935 } 2936 } else { 2937 Newx(newframe,1,scan_frame); 2938 paren = stopparen; 2939 start = scan+2; 2940 end = regnext(scan); 2941 } 2942 if (newframe) { 2943 assert(start); 2944 assert(end); 2945 SAVEFREEPV(newframe); 2946 newframe->next = regnext(scan); 2947 newframe->last = last; 2948 newframe->stop = stopparen; 2949 newframe->prev = frame; 2950 2951 frame = newframe; 2952 scan = start; 2953 stopparen = paren; 2954 last = end; 2955 2956 continue; 2957 } 2958 } 2959 else if (OP(scan) == EXACT) { 2960 I32 l = STR_LEN(scan); 2961 UV uc; 2962 if (UTF) { 2963 const U8 * const s = (U8*)STRING(scan); 2964 l = utf8_length(s, s + l); 2965 uc = utf8_to_uvchr(s, NULL); 2966 } else { 2967 uc = *((U8*)STRING(scan)); 2968 } 2969 min += l; 2970 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */ 2971 /* The code below prefers earlier match for fixed 2972 offset, later match for variable offset. */ 2973 if (data->last_end == -1) { /* Update the start info. */ 2974 data->last_start_min = data->pos_min; 2975 data->last_start_max = is_inf 2976 ? I32_MAX : data->pos_min + data->pos_delta; 2977 } 2978 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan)); 2979 if (UTF) 2980 SvUTF8_on(data->last_found); 2981 { 2982 SV * const sv = data->last_found; 2983 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ? 2984 mg_find(sv, PERL_MAGIC_utf8) : NULL; 2985 if (mg && mg->mg_len >= 0) 2986 mg->mg_len += utf8_length((U8*)STRING(scan), 2987 (U8*)STRING(scan)+STR_LEN(scan)); 2988 } 2989 data->last_end = data->pos_min + l; 2990 data->pos_min += l; /* As in the first entry. */ 2991 data->flags &= ~SF_BEFORE_EOL; 2992 } 2993 if (flags & SCF_DO_STCLASS_AND) { 2994 /* Check whether it is compatible with what we know already! */ 2995 int compat = 1; 2996 2997 if (uc >= 0x100 || 2998 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE)) 2999 && !ANYOF_BITMAP_TEST(data->start_class, uc) 3000 && (!(data->start_class->flags & ANYOF_FOLD) 3001 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc]))) 3002 ) 3003 compat = 0; 3004 ANYOF_CLASS_ZERO(data->start_class); 3005 ANYOF_BITMAP_ZERO(data->start_class); 3006 if (compat) 3007 ANYOF_BITMAP_SET(data->start_class, uc); 3008 data->start_class->flags &= ~ANYOF_EOS; 3009 if (uc < 0x100) 3010 data->start_class->flags &= ~ANYOF_UNICODE_ALL; 3011 } 3012 else if (flags & SCF_DO_STCLASS_OR) { 3013 /* false positive possible if the class is case-folded */ 3014 if (uc < 0x100) 3015 ANYOF_BITMAP_SET(data->start_class, uc); 3016 else 3017 data->start_class->flags |= ANYOF_UNICODE_ALL; 3018 data->start_class->flags &= ~ANYOF_EOS; 3019 cl_and(data->start_class, and_withp); 3020 } 3021 flags &= ~SCF_DO_STCLASS; 3022 } 3023 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */ 3024 I32 l = STR_LEN(scan); 3025 UV uc = *((U8*)STRING(scan)); 3026 3027 /* Search for fixed substrings supports EXACT only. */ 3028 if (flags & SCF_DO_SUBSTR) { 3029 assert(data); 3030 SCAN_COMMIT(pRExC_state, data, minlenp); 3031 } 3032 if (UTF) { 3033 const U8 * const s = (U8 *)STRING(scan); 3034 l = utf8_length(s, s + l); 3035 uc = utf8_to_uvchr(s, NULL); 3036 } 3037 min += l; 3038 if (flags & SCF_DO_SUBSTR) 3039 data->pos_min += l; 3040 if (flags & SCF_DO_STCLASS_AND) { 3041 /* Check whether it is compatible with what we know already! */ 3042 int compat = 1; 3043 3044 if (uc >= 0x100 || 3045 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE)) 3046 && !ANYOF_BITMAP_TEST(data->start_class, uc) 3047 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc]))) 3048 compat = 0; 3049 ANYOF_CLASS_ZERO(data->start_class); 3050 ANYOF_BITMAP_ZERO(data->start_class); 3051 if (compat) { 3052 ANYOF_BITMAP_SET(data->start_class, uc); 3053 data->start_class->flags &= ~ANYOF_EOS; 3054 data->start_class->flags |= ANYOF_FOLD; 3055 if (OP(scan) == EXACTFL) 3056 data->start_class->flags |= ANYOF_LOCALE; 3057 } 3058 } 3059 else if (flags & SCF_DO_STCLASS_OR) { 3060 if (data->start_class->flags & ANYOF_FOLD) { 3061 /* false positive possible if the class is case-folded. 3062 Assume that the locale settings are the same... */ 3063 if (uc < 0x100) 3064 ANYOF_BITMAP_SET(data->start_class, uc); 3065 data->start_class->flags &= ~ANYOF_EOS; 3066 } 3067 cl_and(data->start_class, and_withp); 3068 } 3069 flags &= ~SCF_DO_STCLASS; 3070 } 3071 else if (strchr((const char*)PL_varies,OP(scan))) { 3072 I32 mincount, maxcount, minnext, deltanext, fl = 0; 3073 I32 f = flags, pos_before = 0; 3074 regnode * const oscan = scan; 3075 struct regnode_charclass_class this_class; 3076 struct regnode_charclass_class *oclass = NULL; 3077 I32 next_is_eval = 0; 3078 3079 switch (PL_regkind[OP(scan)]) { 3080 case WHILEM: /* End of (?:...)* . */ 3081 scan = NEXTOPER(scan); 3082 goto finish; 3083 case PLUS: 3084 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) { 3085 next = NEXTOPER(scan); 3086 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) { 3087 mincount = 1; 3088 maxcount = REG_INFTY; 3089 next = regnext(scan); 3090 scan = NEXTOPER(scan); 3091 goto do_curly; 3092 } 3093 } 3094 if (flags & SCF_DO_SUBSTR) 3095 data->pos_min++; 3096 min++; 3097 /* Fall through. */ 3098 case STAR: 3099 if (flags & SCF_DO_STCLASS) { 3100 mincount = 0; 3101 maxcount = REG_INFTY; 3102 next = regnext(scan); 3103 scan = NEXTOPER(scan); 3104 goto do_curly; 3105 } 3106 is_inf = is_inf_internal = 1; 3107 scan = regnext(scan); 3108 if (flags & SCF_DO_SUBSTR) { 3109 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */ 3110 data->longest = &(data->longest_float); 3111 } 3112 goto optimize_curly_tail; 3113 case CURLY: 3114 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM) 3115 && (scan->flags == stopparen)) 3116 { 3117 mincount = 1; 3118 maxcount = 1; 3119 } else { 3120 mincount = ARG1(scan); 3121 maxcount = ARG2(scan); 3122 } 3123 next = regnext(scan); 3124 if (OP(scan) == CURLYX) { 3125 I32 lp = (data ? *(data->last_closep) : 0); 3126 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX); 3127 } 3128 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS; 3129 next_is_eval = (OP(scan) == EVAL); 3130 do_curly: 3131 if (flags & SCF_DO_SUBSTR) { 3132 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */ 3133 pos_before = data->pos_min; 3134 } 3135 if (data) { 3136 fl = data->flags; 3137 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL); 3138 if (is_inf) 3139 data->flags |= SF_IS_INF; 3140 } 3141 if (flags & SCF_DO_STCLASS) { 3142 cl_init(pRExC_state, &this_class); 3143 oclass = data->start_class; 3144 data->start_class = &this_class; 3145 f |= SCF_DO_STCLASS_AND; 3146 f &= ~SCF_DO_STCLASS_OR; 3147 } 3148 /* These are the cases when once a subexpression 3149 fails at a particular position, it cannot succeed 3150 even after backtracking at the enclosing scope. 3151 3152 XXXX what if minimal match and we are at the 3153 initial run of {n,m}? */ 3154 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY)) 3155 f &= ~SCF_WHILEM_VISITED_POS; 3156 3157 /* This will finish on WHILEM, setting scan, or on NULL: */ 3158 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, 3159 last, data, stopparen, recursed, NULL, 3160 (mincount == 0 3161 ? (f & ~SCF_DO_SUBSTR) : f),depth+1); 3162 3163 if (flags & SCF_DO_STCLASS) 3164 data->start_class = oclass; 3165 if (mincount == 0 || minnext == 0) { 3166 if (flags & SCF_DO_STCLASS_OR) { 3167 cl_or(pRExC_state, data->start_class, &this_class); 3168 } 3169 else if (flags & SCF_DO_STCLASS_AND) { 3170 /* Switch to OR mode: cache the old value of 3171 * data->start_class */ 3172 INIT_AND_WITHP; 3173 StructCopy(data->start_class, and_withp, 3174 struct regnode_charclass_class); 3175 flags &= ~SCF_DO_STCLASS_AND; 3176 StructCopy(&this_class, data->start_class, 3177 struct regnode_charclass_class); 3178 flags |= SCF_DO_STCLASS_OR; 3179 data->start_class->flags |= ANYOF_EOS; 3180 } 3181 } else { /* Non-zero len */ 3182 if (flags & SCF_DO_STCLASS_OR) { 3183 cl_or(pRExC_state, data->start_class, &this_class); 3184 cl_and(data->start_class, and_withp); 3185 } 3186 else if (flags & SCF_DO_STCLASS_AND) 3187 cl_and(data->start_class, &this_class); 3188 flags &= ~SCF_DO_STCLASS; 3189 } 3190 if (!scan) /* It was not CURLYX, but CURLY. */ 3191 scan = next; 3192 if ( /* ? quantifier ok, except for (?{ ... }) */ 3193 (next_is_eval || !(mincount == 0 && maxcount == 1)) 3194 && (minnext == 0) && (deltanext == 0) 3195 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR)) 3196 && maxcount <= REG_INFTY/3) /* Complement check for big count */ 3197 { 3198 ckWARNreg(RExC_parse, 3199 "Quantifier unexpected on zero-length expression"); 3200 } 3201 3202 min += minnext * mincount; 3203 is_inf_internal |= ((maxcount == REG_INFTY 3204 && (minnext + deltanext) > 0) 3205 || deltanext == I32_MAX); 3206 is_inf |= is_inf_internal; 3207 delta += (minnext + deltanext) * maxcount - minnext * mincount; 3208 3209 /* Try powerful optimization CURLYX => CURLYN. */ 3210 if ( OP(oscan) == CURLYX && data 3211 && data->flags & SF_IN_PAR 3212 && !(data->flags & SF_HAS_EVAL) 3213 && !deltanext && minnext == 1 ) { 3214 /* Try to optimize to CURLYN. */ 3215 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; 3216 regnode * const nxt1 = nxt; 3217 #ifdef DEBUGGING 3218 regnode *nxt2; 3219 #endif 3220 3221 /* Skip open. */ 3222 nxt = regnext(nxt); 3223 if (!strchr((const char*)PL_simple,OP(nxt)) 3224 && !(PL_regkind[OP(nxt)] == EXACT 3225 && STR_LEN(nxt) == 1)) 3226 goto nogo; 3227 #ifdef DEBUGGING 3228 nxt2 = nxt; 3229 #endif 3230 nxt = regnext(nxt); 3231 if (OP(nxt) != CLOSE) 3232 goto nogo; 3233 if (RExC_open_parens) { 3234 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/ 3235 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/ 3236 } 3237 /* Now we know that nxt2 is the only contents: */ 3238 oscan->flags = (U8)ARG(nxt); 3239 OP(oscan) = CURLYN; 3240 OP(nxt1) = NOTHING; /* was OPEN. */ 3241 3242 #ifdef DEBUGGING 3243 OP(nxt1 + 1) = OPTIMIZED; /* was count. */ 3244 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */ 3245 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */ 3246 OP(nxt) = OPTIMIZED; /* was CLOSE. */ 3247 OP(nxt + 1) = OPTIMIZED; /* was count. */ 3248 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */ 3249 #endif 3250 } 3251 nogo: 3252 3253 /* Try optimization CURLYX => CURLYM. */ 3254 if ( OP(oscan) == CURLYX && data 3255 && !(data->flags & SF_HAS_PAR) 3256 && !(data->flags & SF_HAS_EVAL) 3257 && !deltanext /* atom is fixed width */ 3258 && minnext != 0 /* CURLYM can't handle zero width */ 3259 ) { 3260 /* XXXX How to optimize if data == 0? */ 3261 /* Optimize to a simpler form. */ 3262 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */ 3263 regnode *nxt2; 3264 3265 OP(oscan) = CURLYM; 3266 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/ 3267 && (OP(nxt2) != WHILEM)) 3268 nxt = nxt2; 3269 OP(nxt2) = SUCCEED; /* Whas WHILEM */ 3270 /* Need to optimize away parenths. */ 3271 if (data->flags & SF_IN_PAR) { 3272 /* Set the parenth number. */ 3273 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/ 3274 3275 if (OP(nxt) != CLOSE) 3276 FAIL("Panic opt close"); 3277 oscan->flags = (U8)ARG(nxt); 3278 if (RExC_open_parens) { 3279 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/ 3280 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/ 3281 } 3282 OP(nxt1) = OPTIMIZED; /* was OPEN. */ 3283 OP(nxt) = OPTIMIZED; /* was CLOSE. */ 3284 3285 #ifdef DEBUGGING 3286 OP(nxt1 + 1) = OPTIMIZED; /* was count. */ 3287 OP(nxt + 1) = OPTIMIZED; /* was count. */ 3288 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */ 3289 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */ 3290 #endif 3291 #if 0 3292 while ( nxt1 && (OP(nxt1) != WHILEM)) { 3293 regnode *nnxt = regnext(nxt1); 3294 3295 if (nnxt == nxt) { 3296 if (reg_off_by_arg[OP(nxt1)]) 3297 ARG_SET(nxt1, nxt2 - nxt1); 3298 else if (nxt2 - nxt1 < U16_MAX) 3299 NEXT_OFF(nxt1) = nxt2 - nxt1; 3300 else 3301 OP(nxt) = NOTHING; /* Cannot beautify */ 3302 } 3303 nxt1 = nnxt; 3304 } 3305 #endif 3306 /* Optimize again: */ 3307 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt, 3308 NULL, stopparen, recursed, NULL, 0,depth+1); 3309 } 3310 else 3311 oscan->flags = 0; 3312 } 3313 else if ((OP(oscan) == CURLYX) 3314 && (flags & SCF_WHILEM_VISITED_POS) 3315 /* See the comment on a similar expression above. 3316 However, this time it not a subexpression 3317 we care about, but the expression itself. */ 3318 && (maxcount == REG_INFTY) 3319 && data && ++data->whilem_c < 16) { 3320 /* This stays as CURLYX, we can put the count/of pair. */ 3321 /* Find WHILEM (as in regexec.c) */ 3322 regnode *nxt = oscan + NEXT_OFF(oscan); 3323 3324 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */ 3325 nxt += ARG(nxt); 3326 PREVOPER(nxt)->flags = (U8)(data->whilem_c 3327 | (RExC_whilem_seen << 4)); /* On WHILEM */ 3328 } 3329 if (data && fl & (SF_HAS_PAR|SF_IN_PAR)) 3330 pars++; 3331 if (flags & SCF_DO_SUBSTR) { 3332 SV *last_str = NULL; 3333 int counted = mincount != 0; 3334 3335 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */ 3336 #if defined(SPARC64_GCC_WORKAROUND) 3337 I32 b = 0; 3338 STRLEN l = 0; 3339 const char *s = NULL; 3340 I32 old = 0; 3341 3342 if (pos_before >= data->last_start_min) 3343 b = pos_before; 3344 else 3345 b = data->last_start_min; 3346 3347 l = 0; 3348 s = SvPV_const(data->last_found, l); 3349 old = b - data->last_start_min; 3350 3351 #else 3352 I32 b = pos_before >= data->last_start_min 3353 ? pos_before : data->last_start_min; 3354 STRLEN l; 3355 const char * const s = SvPV_const(data->last_found, l); 3356 I32 old = b - data->last_start_min; 3357 #endif 3358 3359 if (UTF) 3360 old = utf8_hop((U8*)s, old) - (U8*)s; 3361 3362 l -= old; 3363 /* Get the added string: */ 3364 last_str = newSVpvn_utf8(s + old, l, UTF); 3365 if (deltanext == 0 && pos_before == b) { 3366 /* What was added is a constant string */ 3367 if (mincount > 1) { 3368 SvGROW(last_str, (mincount * l) + 1); 3369 repeatcpy(SvPVX(last_str) + l, 3370 SvPVX_const(last_str), l, mincount - 1); 3371 SvCUR_set(last_str, SvCUR(last_str) * mincount); 3372 /* Add additional parts. */ 3373 SvCUR_set(data->last_found, 3374 SvCUR(data->last_found) - l); 3375 sv_catsv(data->last_found, last_str); 3376 { 3377 SV * sv = data->last_found; 3378 MAGIC *mg = 3379 SvUTF8(sv) && SvMAGICAL(sv) ? 3380 mg_find(sv, PERL_MAGIC_utf8) : NULL; 3381 if (mg && mg->mg_len >= 0) 3382 mg->mg_len += CHR_SVLEN(last_str) - l; 3383 } 3384 data->last_end += l * (mincount - 1); 3385 } 3386 } else { 3387 /* start offset must point into the last copy */ 3388 data->last_start_min += minnext * (mincount - 1); 3389 data->last_start_max += is_inf ? I32_MAX 3390 : (maxcount - 1) * (minnext + data->pos_delta); 3391 } 3392 } 3393 /* It is counted once already... */ 3394 data->pos_min += minnext * (mincount - counted); 3395 data->pos_delta += - counted * deltanext + 3396 (minnext + deltanext) * maxcount - minnext * mincount; 3397 if (mincount != maxcount) { 3398 /* Cannot extend fixed substrings found inside 3399 the group. */ 3400 SCAN_COMMIT(pRExC_state,data,minlenp); 3401 if (mincount && last_str) { 3402 SV * const sv = data->last_found; 3403 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ? 3404 mg_find(sv, PERL_MAGIC_utf8) : NULL; 3405 3406 if (mg) 3407 mg->mg_len = -1; 3408 sv_setsv(sv, last_str); 3409 data->last_end = data->pos_min; 3410 data->last_start_min = 3411 data->pos_min - CHR_SVLEN(last_str); 3412 data->last_start_max = is_inf 3413 ? I32_MAX 3414 : data->pos_min + data->pos_delta 3415 - CHR_SVLEN(last_str); 3416 } 3417 data->longest = &(data->longest_float); 3418 } 3419 SvREFCNT_dec(last_str); 3420 } 3421 if (data && (fl & SF_HAS_EVAL)) 3422 data->flags |= SF_HAS_EVAL; 3423 optimize_curly_tail: 3424 if (OP(oscan) != CURLYX) { 3425 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING 3426 && NEXT_OFF(next)) 3427 NEXT_OFF(oscan) += NEXT_OFF(next); 3428 } 3429 continue; 3430 default: /* REF and CLUMP only? */ 3431 if (flags & SCF_DO_SUBSTR) { 3432 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */ 3433 data->longest = &(data->longest_float); 3434 } 3435 is_inf = is_inf_internal = 1; 3436 if (flags & SCF_DO_STCLASS_OR) 3437 cl_anything(pRExC_state, data->start_class); 3438 flags &= ~SCF_DO_STCLASS; 3439 break; 3440 } 3441 } 3442 else if (OP(scan) == LNBREAK) { 3443 if (flags & SCF_DO_STCLASS) { 3444 int value = 0; 3445 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */ 3446 if (flags & SCF_DO_STCLASS_AND) { 3447 for (value = 0; value < 256; value++) 3448 if (!is_VERTWS_cp(value)) 3449 ANYOF_BITMAP_CLEAR(data->start_class, value); 3450 } 3451 else { 3452 for (value = 0; value < 256; value++) 3453 if (is_VERTWS_cp(value)) 3454 ANYOF_BITMAP_SET(data->start_class, value); 3455 } 3456 if (flags & SCF_DO_STCLASS_OR) 3457 cl_and(data->start_class, and_withp); 3458 flags &= ~SCF_DO_STCLASS; 3459 } 3460 min += 1; 3461 delta += 1; 3462 if (flags & SCF_DO_SUBSTR) { 3463 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */ 3464 data->pos_min += 1; 3465 data->pos_delta += 1; 3466 data->longest = &(data->longest_float); 3467 } 3468 3469 } 3470 else if (OP(scan) == FOLDCHAR) { 3471 int d = ARG(scan)==0xDF ? 1 : 2; 3472 flags &= ~SCF_DO_STCLASS; 3473 min += 1; 3474 delta += d; 3475 if (flags & SCF_DO_SUBSTR) { 3476 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */ 3477 data->pos_min += 1; 3478 data->pos_delta += d; 3479 data->longest = &(data->longest_float); 3480 } 3481 } 3482 else if (strchr((const char*)PL_simple,OP(scan))) { 3483 int value = 0; 3484 3485 if (flags & SCF_DO_SUBSTR) { 3486 SCAN_COMMIT(pRExC_state,data,minlenp); 3487 data->pos_min++; 3488 } 3489 min++; 3490 if (flags & SCF_DO_STCLASS) { 3491 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */ 3492 3493 /* Some of the logic below assumes that switching 3494 locale on will only add false positives. */ 3495 switch (PL_regkind[OP(scan)]) { 3496 case SANY: 3497 default: 3498 do_default: 3499 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */ 3500 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ 3501 cl_anything(pRExC_state, data->start_class); 3502 break; 3503 case REG_ANY: 3504 if (OP(scan) == SANY) 3505 goto do_default; 3506 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */ 3507 value = (ANYOF_BITMAP_TEST(data->start_class,'\n') 3508 || (data->start_class->flags & ANYOF_CLASS)); 3509 cl_anything(pRExC_state, data->start_class); 3510 } 3511 if (flags & SCF_DO_STCLASS_AND || !value) 3512 ANYOF_BITMAP_CLEAR(data->start_class,'\n'); 3513 break; 3514 case ANYOF: 3515 if (flags & SCF_DO_STCLASS_AND) 3516 cl_and(data->start_class, 3517 (struct regnode_charclass_class*)scan); 3518 else 3519 cl_or(pRExC_state, data->start_class, 3520 (struct regnode_charclass_class*)scan); 3521 break; 3522 case ALNUM: 3523 if (flags & SCF_DO_STCLASS_AND) { 3524 if (!(data->start_class->flags & ANYOF_LOCALE)) { 3525 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM); 3526 for (value = 0; value < 256; value++) 3527 if (!isALNUM(value)) 3528 ANYOF_BITMAP_CLEAR(data->start_class, value); 3529 } 3530 } 3531 else { 3532 if (data->start_class->flags & ANYOF_LOCALE) 3533 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM); 3534 else { 3535 for (value = 0; value < 256; value++) 3536 if (isALNUM(value)) 3537 ANYOF_BITMAP_SET(data->start_class, value); 3538 } 3539 } 3540 break; 3541 case ALNUML: 3542 if (flags & SCF_DO_STCLASS_AND) { 3543 if (data->start_class->flags & ANYOF_LOCALE) 3544 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM); 3545 } 3546 else { 3547 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM); 3548 data->start_class->flags |= ANYOF_LOCALE; 3549 } 3550 break; 3551 case NALNUM: 3552 if (flags & SCF_DO_STCLASS_AND) { 3553 if (!(data->start_class->flags & ANYOF_LOCALE)) { 3554 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM); 3555 for (value = 0; value < 256; value++) 3556 if (isALNUM(value)) 3557 ANYOF_BITMAP_CLEAR(data->start_class, value); 3558 } 3559 } 3560 else { 3561 if (data->start_class->flags & ANYOF_LOCALE) 3562 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM); 3563 else { 3564 for (value = 0; value < 256; value++) 3565 if (!isALNUM(value)) 3566 ANYOF_BITMAP_SET(data->start_class, value); 3567 } 3568 } 3569 break; 3570 case NALNUML: 3571 if (flags & SCF_DO_STCLASS_AND) { 3572 if (data->start_class->flags & ANYOF_LOCALE) 3573 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM); 3574 } 3575 else { 3576 data->start_class->flags |= ANYOF_LOCALE; 3577 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM); 3578 } 3579 break; 3580 case SPACE: 3581 if (flags & SCF_DO_STCLASS_AND) { 3582 if (!(data->start_class->flags & ANYOF_LOCALE)) { 3583 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE); 3584 for (value = 0; value < 256; value++) 3585 if (!isSPACE(value)) 3586 ANYOF_BITMAP_CLEAR(data->start_class, value); 3587 } 3588 } 3589 else { 3590 if (data->start_class->flags & ANYOF_LOCALE) 3591 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE); 3592 else { 3593 for (value = 0; value < 256; value++) 3594 if (isSPACE(value)) 3595 ANYOF_BITMAP_SET(data->start_class, value); 3596 } 3597 } 3598 break; 3599 case SPACEL: 3600 if (flags & SCF_DO_STCLASS_AND) { 3601 if (data->start_class->flags & ANYOF_LOCALE) 3602 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE); 3603 } 3604 else { 3605 data->start_class->flags |= ANYOF_LOCALE; 3606 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE); 3607 } 3608 break; 3609 case NSPACE: 3610 if (flags & SCF_DO_STCLASS_AND) { 3611 if (!(data->start_class->flags & ANYOF_LOCALE)) { 3612 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE); 3613 for (value = 0; value < 256; value++) 3614 if (isSPACE(value)) 3615 ANYOF_BITMAP_CLEAR(data->start_class, value); 3616 } 3617 } 3618 else { 3619 if (data->start_class->flags & ANYOF_LOCALE) 3620 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE); 3621 else { 3622 for (value = 0; value < 256; value++) 3623 if (!isSPACE(value)) 3624 ANYOF_BITMAP_SET(data->start_class, value); 3625 } 3626 } 3627 break; 3628 case NSPACEL: 3629 if (flags & SCF_DO_STCLASS_AND) { 3630 if (data->start_class->flags & ANYOF_LOCALE) { 3631 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE); 3632 for (value = 0; value < 256; value++) 3633 if (!isSPACE(value)) 3634 ANYOF_BITMAP_CLEAR(data->start_class, value); 3635 } 3636 } 3637 else { 3638 data->start_class->flags |= ANYOF_LOCALE; 3639 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE); 3640 } 3641 break; 3642 case DIGIT: 3643 if (flags & SCF_DO_STCLASS_AND) { 3644 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT); 3645 for (value = 0; value < 256; value++) 3646 if (!isDIGIT(value)) 3647 ANYOF_BITMAP_CLEAR(data->start_class, value); 3648 } 3649 else { 3650 if (data->start_class->flags & ANYOF_LOCALE) 3651 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT); 3652 else { 3653 for (value = 0; value < 256; value++) 3654 if (isDIGIT(value)) 3655 ANYOF_BITMAP_SET(data->start_class, value); 3656 } 3657 } 3658 break; 3659 case NDIGIT: 3660 if (flags & SCF_DO_STCLASS_AND) { 3661 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT); 3662 for (value = 0; value < 256; value++) 3663 if (isDIGIT(value)) 3664 ANYOF_BITMAP_CLEAR(data->start_class, value); 3665 } 3666 else { 3667 if (data->start_class->flags & ANYOF_LOCALE) 3668 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT); 3669 else { 3670 for (value = 0; value < 256; value++) 3671 if (!isDIGIT(value)) 3672 ANYOF_BITMAP_SET(data->start_class, value); 3673 } 3674 } 3675 break; 3676 CASE_SYNST_FNC(VERTWS); 3677 CASE_SYNST_FNC(HORIZWS); 3678 3679 } 3680 if (flags & SCF_DO_STCLASS_OR) 3681 cl_and(data->start_class, and_withp); 3682 flags &= ~SCF_DO_STCLASS; 3683 } 3684 } 3685 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) { 3686 data->flags |= (OP(scan) == MEOL 3687 ? SF_BEFORE_MEOL 3688 : SF_BEFORE_SEOL); 3689 } 3690 else if ( PL_regkind[OP(scan)] == BRANCHJ 3691 /* Lookbehind, or need to calculate parens/evals/stclass: */ 3692 && (scan->flags || data || (flags & SCF_DO_STCLASS)) 3693 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) { 3694 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY 3695 || OP(scan) == UNLESSM ) 3696 { 3697 /* Negative Lookahead/lookbehind 3698 In this case we can't do fixed string optimisation. 3699 */ 3700 3701 I32 deltanext, minnext, fake = 0; 3702 regnode *nscan; 3703 struct regnode_charclass_class intrnl; 3704 int f = 0; 3705 3706 data_fake.flags = 0; 3707 if (data) { 3708 data_fake.whilem_c = data->whilem_c; 3709 data_fake.last_closep = data->last_closep; 3710 } 3711 else 3712 data_fake.last_closep = &fake; 3713 data_fake.pos_delta = delta; 3714 if ( flags & SCF_DO_STCLASS && !scan->flags 3715 && OP(scan) == IFMATCH ) { /* Lookahead */ 3716 cl_init(pRExC_state, &intrnl); 3717 data_fake.start_class = &intrnl; 3718 f |= SCF_DO_STCLASS_AND; 3719 } 3720 if (flags & SCF_WHILEM_VISITED_POS) 3721 f |= SCF_WHILEM_VISITED_POS; 3722 next = regnext(scan); 3723 nscan = NEXTOPER(NEXTOPER(scan)); 3724 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext, 3725 last, &data_fake, stopparen, recursed, NULL, f, depth+1); 3726 if (scan->flags) { 3727 if (deltanext) { 3728 FAIL("Variable length lookbehind not implemented"); 3729 } 3730 else if (minnext > (I32)U8_MAX) { 3731 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX); 3732 } 3733 scan->flags = (U8)minnext; 3734 } 3735 if (data) { 3736 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) 3737 pars++; 3738 if (data_fake.flags & SF_HAS_EVAL) 3739 data->flags |= SF_HAS_EVAL; 3740 data->whilem_c = data_fake.whilem_c; 3741 } 3742 if (f & SCF_DO_STCLASS_AND) { 3743 if (flags & SCF_DO_STCLASS_OR) { 3744 /* OR before, AND after: ideally we would recurse with 3745 * data_fake to get the AND applied by study of the 3746 * remainder of the pattern, and then derecurse; 3747 * *** HACK *** for now just treat as "no information". 3748 * See [perl #56690]. 3749 */ 3750 cl_init(pRExC_state, data->start_class); 3751 } else { 3752 /* AND before and after: combine and continue */ 3753 const int was = (data->start_class->flags & ANYOF_EOS); 3754 3755 cl_and(data->start_class, &intrnl); 3756 if (was) 3757 data->start_class->flags |= ANYOF_EOS; 3758 } 3759 } 3760 } 3761 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY 3762 else { 3763 /* Positive Lookahead/lookbehind 3764 In this case we can do fixed string optimisation, 3765 but we must be careful about it. Note in the case of 3766 lookbehind the positions will be offset by the minimum 3767 length of the pattern, something we won't know about 3768 until after the recurse. 3769 */ 3770 I32 deltanext, fake = 0; 3771 regnode *nscan; 3772 struct regnode_charclass_class intrnl; 3773 int f = 0; 3774 /* We use SAVEFREEPV so that when the full compile 3775 is finished perl will clean up the allocated 3776 minlens when its all done. This was we don't 3777 have to worry about freeing them when we know 3778 they wont be used, which would be a pain. 3779 */ 3780 I32 *minnextp; 3781 Newx( minnextp, 1, I32 ); 3782 SAVEFREEPV(minnextp); 3783 3784 if (data) { 3785 StructCopy(data, &data_fake, scan_data_t); 3786 if ((flags & SCF_DO_SUBSTR) && data->last_found) { 3787 f |= SCF_DO_SUBSTR; 3788 if (scan->flags) 3789 SCAN_COMMIT(pRExC_state, &data_fake,minlenp); 3790 data_fake.last_found=newSVsv(data->last_found); 3791 } 3792 } 3793 else 3794 data_fake.last_closep = &fake; 3795 data_fake.flags = 0; 3796 data_fake.pos_delta = delta; 3797 if (is_inf) 3798 data_fake.flags |= SF_IS_INF; 3799 if ( flags & SCF_DO_STCLASS && !scan->flags 3800 && OP(scan) == IFMATCH ) { /* Lookahead */ 3801 cl_init(pRExC_state, &intrnl); 3802 data_fake.start_class = &intrnl; 3803 f |= SCF_DO_STCLASS_AND; 3804 } 3805 if (flags & SCF_WHILEM_VISITED_POS) 3806 f |= SCF_WHILEM_VISITED_POS; 3807 next = regnext(scan); 3808 nscan = NEXTOPER(NEXTOPER(scan)); 3809 3810 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext, 3811 last, &data_fake, stopparen, recursed, NULL, f,depth+1); 3812 if (scan->flags) { 3813 if (deltanext) { 3814 FAIL("Variable length lookbehind not implemented"); 3815 } 3816 else if (*minnextp > (I32)U8_MAX) { 3817 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX); 3818 } 3819 scan->flags = (U8)*minnextp; 3820 } 3821 3822 *minnextp += min; 3823 3824 if (f & SCF_DO_STCLASS_AND) { 3825 const int was = (data->start_class->flags & ANYOF_EOS); 3826 3827 cl_and(data->start_class, &intrnl); 3828 if (was) 3829 data->start_class->flags |= ANYOF_EOS; 3830 } 3831 if (data) { 3832 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) 3833 pars++; 3834 if (data_fake.flags & SF_HAS_EVAL) 3835 data->flags |= SF_HAS_EVAL; 3836 data->whilem_c = data_fake.whilem_c; 3837 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) { 3838 if (RExC_rx->minlen<*minnextp) 3839 RExC_rx->minlen=*minnextp; 3840 SCAN_COMMIT(pRExC_state, &data_fake, minnextp); 3841 SvREFCNT_dec(data_fake.last_found); 3842 3843 if ( data_fake.minlen_fixed != minlenp ) 3844 { 3845 data->offset_fixed= data_fake.offset_fixed; 3846 data->minlen_fixed= data_fake.minlen_fixed; 3847 data->lookbehind_fixed+= scan->flags; 3848 } 3849 if ( data_fake.minlen_float != minlenp ) 3850 { 3851 data->minlen_float= data_fake.minlen_float; 3852 data->offset_float_min=data_fake.offset_float_min; 3853 data->offset_float_max=data_fake.offset_float_max; 3854 data->lookbehind_float+= scan->flags; 3855 } 3856 } 3857 } 3858 3859 3860 } 3861 #endif 3862 } 3863 else if (OP(scan) == OPEN) { 3864 if (stopparen != (I32)ARG(scan)) 3865 pars++; 3866 } 3867 else if (OP(scan) == CLOSE) { 3868 if (stopparen == (I32)ARG(scan)) { 3869 break; 3870 } 3871 if ((I32)ARG(scan) == is_par) { 3872 next = regnext(scan); 3873 3874 if ( next && (OP(next) != WHILEM) && next < last) 3875 is_par = 0; /* Disable optimization */ 3876 } 3877 if (data) 3878 *(data->last_closep) = ARG(scan); 3879 } 3880 else if (OP(scan) == EVAL) { 3881 if (data) 3882 data->flags |= SF_HAS_EVAL; 3883 } 3884 else if ( PL_regkind[OP(scan)] == ENDLIKE ) { 3885 if (flags & SCF_DO_SUBSTR) { 3886 SCAN_COMMIT(pRExC_state,data,minlenp); 3887 flags &= ~SCF_DO_SUBSTR; 3888 } 3889 if (data && OP(scan)==ACCEPT) { 3890 data->flags |= SCF_SEEN_ACCEPT; 3891 if (stopmin > min) 3892 stopmin = min; 3893 } 3894 } 3895 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */ 3896 { 3897 if (flags & SCF_DO_SUBSTR) { 3898 SCAN_COMMIT(pRExC_state,data,minlenp); 3899 data->longest = &(data->longest_float); 3900 } 3901 is_inf = is_inf_internal = 1; 3902 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ 3903 cl_anything(pRExC_state, data->start_class); 3904 flags &= ~SCF_DO_STCLASS; 3905 } 3906 else if (OP(scan) == GPOS) { 3907 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) && 3908 !(delta || is_inf || (data && data->pos_delta))) 3909 { 3910 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR)) 3911 RExC_rx->extflags |= RXf_ANCH_GPOS; 3912 if (RExC_rx->gofs < (U32)min) 3913 RExC_rx->gofs = min; 3914 } else { 3915 RExC_rx->extflags |= RXf_GPOS_FLOAT; 3916 RExC_rx->gofs = 0; 3917 } 3918 } 3919 #ifdef TRIE_STUDY_OPT 3920 #ifdef FULL_TRIE_STUDY 3921 else if (PL_regkind[OP(scan)] == TRIE) { 3922 /* NOTE - There is similar code to this block above for handling 3923 BRANCH nodes on the initial study. If you change stuff here 3924 check there too. */ 3925 regnode *trie_node= scan; 3926 regnode *tail= regnext(scan); 3927 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ]; 3928 I32 max1 = 0, min1 = I32_MAX; 3929 struct regnode_charclass_class accum; 3930 3931 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */ 3932 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */ 3933 if (flags & SCF_DO_STCLASS) 3934 cl_init_zero(pRExC_state, &accum); 3935 3936 if (!trie->jump) { 3937 min1= trie->minlen; 3938 max1= trie->maxlen; 3939 } else { 3940 const regnode *nextbranch= NULL; 3941 U32 word; 3942 3943 for ( word=1 ; word <= trie->wordcount ; word++) 3944 { 3945 I32 deltanext=0, minnext=0, f = 0, fake; 3946 struct regnode_charclass_class this_class; 3947 3948 data_fake.flags = 0; 3949 if (data) { 3950 data_fake.whilem_c = data->whilem_c; 3951 data_fake.last_closep = data->last_closep; 3952 } 3953 else 3954 data_fake.last_closep = &fake; 3955 data_fake.pos_delta = delta; 3956 if (flags & SCF_DO_STCLASS) { 3957 cl_init(pRExC_state, &this_class); 3958 data_fake.start_class = &this_class; 3959 f = SCF_DO_STCLASS_AND; 3960 } 3961 if (flags & SCF_WHILEM_VISITED_POS) 3962 f |= SCF_WHILEM_VISITED_POS; 3963 3964 if (trie->jump[word]) { 3965 if (!nextbranch) 3966 nextbranch = trie_node + trie->jump[0]; 3967 scan= trie_node + trie->jump[word]; 3968 /* We go from the jump point to the branch that follows 3969 it. Note this means we need the vestigal unused branches 3970 even though they arent otherwise used. 3971 */ 3972 minnext = study_chunk(pRExC_state, &scan, minlenp, 3973 &deltanext, (regnode *)nextbranch, &data_fake, 3974 stopparen, recursed, NULL, f,depth+1); 3975 } 3976 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH) 3977 nextbranch= regnext((regnode*)nextbranch); 3978 3979 if (min1 > (I32)(minnext + trie->minlen)) 3980 min1 = minnext + trie->minlen; 3981 if (max1 < (I32)(minnext + deltanext + trie->maxlen)) 3982 max1 = minnext + deltanext + trie->maxlen; 3983 if (deltanext == I32_MAX) 3984 is_inf = is_inf_internal = 1; 3985 3986 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) 3987 pars++; 3988 if (data_fake.flags & SCF_SEEN_ACCEPT) { 3989 if ( stopmin > min + min1) 3990 stopmin = min + min1; 3991 flags &= ~SCF_DO_SUBSTR; 3992 if (data) 3993 data->flags |= SCF_SEEN_ACCEPT; 3994 } 3995 if (data) { 3996 if (data_fake.flags & SF_HAS_EVAL) 3997 data->flags |= SF_HAS_EVAL; 3998 data->whilem_c = data_fake.whilem_c; 3999 } 4000 if (flags & SCF_DO_STCLASS) 4001 cl_or(pRExC_state, &accum, &this_class); 4002 } 4003 } 4004 if (flags & SCF_DO_SUBSTR) { 4005 data->pos_min += min1; 4006 data->pos_delta += max1 - min1; 4007 if (max1 != min1 || is_inf) 4008 data->longest = &(data->longest_float); 4009 } 4010 min += min1; 4011 delta += max1 - min1; 4012 if (flags & SCF_DO_STCLASS_OR) { 4013 cl_or(pRExC_state, data->start_class, &accum); 4014 if (min1) { 4015 cl_and(data->start_class, and_withp); 4016 flags &= ~SCF_DO_STCLASS; 4017 } 4018 } 4019 else if (flags & SCF_DO_STCLASS_AND) { 4020 if (min1) { 4021 cl_and(data->start_class, &accum); 4022 flags &= ~SCF_DO_STCLASS; 4023 } 4024 else { 4025 /* Switch to OR mode: cache the old value of 4026 * data->start_class */ 4027 INIT_AND_WITHP; 4028 StructCopy(data->start_class, and_withp, 4029 struct regnode_charclass_class); 4030 flags &= ~SCF_DO_STCLASS_AND; 4031 StructCopy(&accum, data->start_class, 4032 struct regnode_charclass_class); 4033 flags |= SCF_DO_STCLASS_OR; 4034 data->start_class->flags |= ANYOF_EOS; 4035 } 4036 } 4037 scan= tail; 4038 continue; 4039 } 4040 #else 4041 else if (PL_regkind[OP(scan)] == TRIE) { 4042 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ]; 4043 U8*bang=NULL; 4044 4045 min += trie->minlen; 4046 delta += (trie->maxlen - trie->minlen); 4047 flags &= ~SCF_DO_STCLASS; /* xxx */ 4048 if (flags & SCF_DO_SUBSTR) { 4049 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */ 4050 data->pos_min += trie->minlen; 4051 data->pos_delta += (trie->maxlen - trie->minlen); 4052 if (trie->maxlen != trie->minlen) 4053 data->longest = &(data->longest_float); 4054 } 4055 if (trie->jump) /* no more substrings -- for now /grr*/ 4056 flags &= ~SCF_DO_SUBSTR; 4057 } 4058 #endif /* old or new */ 4059 #endif /* TRIE_STUDY_OPT */ 4060 4061 /* Else: zero-length, ignore. */ 4062 scan = regnext(scan); 4063 } 4064 if (frame) { 4065 last = frame->last; 4066 scan = frame->next; 4067 stopparen = frame->stop; 4068 frame = frame->prev; 4069 goto fake_study_recurse; 4070 } 4071 4072 finish: 4073 assert(!frame); 4074 DEBUG_STUDYDATA("pre-fin:",data,depth); 4075 4076 *scanp = scan; 4077 *deltap = is_inf_internal ? I32_MAX : delta; 4078 if (flags & SCF_DO_SUBSTR && is_inf) 4079 data->pos_delta = I32_MAX - data->pos_min; 4080 if (is_par > (I32)U8_MAX) 4081 is_par = 0; 4082 if (is_par && pars==1 && data) { 4083 data->flags |= SF_IN_PAR; 4084 data->flags &= ~SF_HAS_PAR; 4085 } 4086 else if (pars && data) { 4087 data->flags |= SF_HAS_PAR; 4088 data->flags &= ~SF_IN_PAR; 4089 } 4090 if (flags & SCF_DO_STCLASS_OR) 4091 cl_and(data->start_class, and_withp); 4092 if (flags & SCF_TRIE_RESTUDY) 4093 data->flags |= SCF_TRIE_RESTUDY; 4094 4095 DEBUG_STUDYDATA("post-fin:",data,depth); 4096 4097 return min < stopmin ? min : stopmin; 4098 } 4099 4100 STATIC U32 4101 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s) 4102 { 4103 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0; 4104 4105 PERL_ARGS_ASSERT_ADD_DATA; 4106 4107 Renewc(RExC_rxi->data, 4108 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1), 4109 char, struct reg_data); 4110 if(count) 4111 Renew(RExC_rxi->data->what, count + n, U8); 4112 else 4113 Newx(RExC_rxi->data->what, n, U8); 4114 RExC_rxi->data->count = count + n; 4115 Copy(s, RExC_rxi->data->what + count, n, U8); 4116 return count; 4117 } 4118 4119 /*XXX: todo make this not included in a non debugging perl */ 4120 #ifndef PERL_IN_XSUB_RE 4121 void 4122 Perl_reginitcolors(pTHX) 4123 { 4124 dVAR; 4125 const char * const s = PerlEnv_getenv("PERL_RE_COLORS"); 4126 if (s) { 4127 char *t = savepv(s); 4128 int i = 0; 4129 PL_colors[0] = t; 4130 while (++i < 6) { 4131 t = strchr(t, '\t'); 4132 if (t) { 4133 *t = '\0'; 4134 PL_colors[i] = ++t; 4135 } 4136 else 4137 PL_colors[i] = t = (char *)""; 4138 } 4139 } else { 4140 int i = 0; 4141 while (i < 6) 4142 PL_colors[i++] = (char *)""; 4143 } 4144 PL_colorset = 1; 4145 } 4146 #endif 4147 4148 4149 #ifdef TRIE_STUDY_OPT 4150 #define CHECK_RESTUDY_GOTO \ 4151 if ( \ 4152 (data.flags & SCF_TRIE_RESTUDY) \ 4153 && ! restudied++ \ 4154 ) goto reStudy 4155 #else 4156 #define CHECK_RESTUDY_GOTO 4157 #endif 4158 4159 /* 4160 - pregcomp - compile a regular expression into internal code 4161 * 4162 * We can't allocate space until we know how big the compiled form will be, 4163 * but we can't compile it (and thus know how big it is) until we've got a 4164 * place to put the code. So we cheat: we compile it twice, once with code 4165 * generation turned off and size counting turned on, and once "for real". 4166 * This also means that we don't allocate space until we are sure that the 4167 * thing really will compile successfully, and we never have to move the 4168 * code and thus invalidate pointers into it. (Note that it has to be in 4169 * one piece because free() must be able to free it all.) [NB: not true in perl] 4170 * 4171 * Beware that the optimization-preparation code in here knows about some 4172 * of the structure of the compiled regexp. [I'll say.] 4173 */ 4174 4175 4176 4177 #ifndef PERL_IN_XSUB_RE 4178 #define RE_ENGINE_PTR &PL_core_reg_engine 4179 #else 4180 extern const struct regexp_engine my_reg_engine; 4181 #define RE_ENGINE_PTR &my_reg_engine 4182 #endif 4183 4184 #ifndef PERL_IN_XSUB_RE 4185 REGEXP * 4186 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags) 4187 { 4188 dVAR; 4189 HV * const table = GvHV(PL_hintgv); 4190 4191 PERL_ARGS_ASSERT_PREGCOMP; 4192 4193 /* Dispatch a request to compile a regexp to correct 4194 regexp engine. */ 4195 if (table) { 4196 SV **ptr= hv_fetchs(table, "regcomp", FALSE); 4197 GET_RE_DEBUG_FLAGS_DECL; 4198 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) { 4199 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr)); 4200 DEBUG_COMPILE_r({ 4201 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n", 4202 SvIV(*ptr)); 4203 }); 4204 return CALLREGCOMP_ENG(eng, pattern, flags); 4205 } 4206 } 4207 return Perl_re_compile(aTHX_ pattern, flags); 4208 } 4209 #endif 4210 4211 REGEXP * 4212 Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags) 4213 { 4214 dVAR; 4215 REGEXP *rx; 4216 struct regexp *r; 4217 register regexp_internal *ri; 4218 STRLEN plen; 4219 char *exp = SvPV(pattern, plen); 4220 char* xend = exp + plen; 4221 regnode *scan; 4222 I32 flags; 4223 I32 minlen = 0; 4224 I32 sawplus = 0; 4225 I32 sawopen = 0; 4226 scan_data_t data; 4227 RExC_state_t RExC_state; 4228 RExC_state_t * const pRExC_state = &RExC_state; 4229 #ifdef TRIE_STUDY_OPT 4230 int restudied= 0; 4231 RExC_state_t copyRExC_state; 4232 #endif 4233 GET_RE_DEBUG_FLAGS_DECL; 4234 4235 PERL_ARGS_ASSERT_RE_COMPILE; 4236 4237 DEBUG_r(if (!PL_colorset) reginitcolors()); 4238 4239 RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern); 4240 4241 DEBUG_COMPILE_r({ 4242 SV *dsv= sv_newmortal(); 4243 RE_PV_QUOTED_DECL(s, RExC_utf8, 4244 dsv, exp, plen, 60); 4245 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n", 4246 PL_colors[4],PL_colors[5],s); 4247 }); 4248 4249 redo_first_pass: 4250 RExC_precomp = exp; 4251 RExC_flags = pm_flags; 4252 RExC_sawback = 0; 4253 4254 RExC_seen = 0; 4255 RExC_seen_zerolen = *exp == '^' ? -1 : 0; 4256 RExC_seen_evals = 0; 4257 RExC_extralen = 0; 4258 4259 /* First pass: determine size, legality. */ 4260 RExC_parse = exp; 4261 RExC_start = exp; 4262 RExC_end = xend; 4263 RExC_naughty = 0; 4264 RExC_npar = 1; 4265 RExC_nestroot = 0; 4266 RExC_size = 0L; 4267 RExC_emit = &PL_regdummy; 4268 RExC_whilem_seen = 0; 4269 RExC_open_parens = NULL; 4270 RExC_close_parens = NULL; 4271 RExC_opend = NULL; 4272 RExC_paren_names = NULL; 4273 #ifdef DEBUGGING 4274 RExC_paren_name_list = NULL; 4275 #endif 4276 RExC_recurse = NULL; 4277 RExC_recurse_count = 0; 4278 4279 #if 0 /* REGC() is (currently) a NOP at the first pass. 4280 * Clever compilers notice this and complain. --jhi */ 4281 REGC((U8)REG_MAGIC, (char*)RExC_emit); 4282 #endif 4283 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n")); 4284 if (reg(pRExC_state, 0, &flags,1) == NULL) { 4285 RExC_precomp = NULL; 4286 return(NULL); 4287 } 4288 if (RExC_utf8 && !RExC_orig_utf8) { 4289 /* It's possible to write a regexp in ascii that represents Unicode 4290 codepoints outside of the byte range, such as via \x{100}. If we 4291 detect such a sequence we have to convert the entire pattern to utf8 4292 and then recompile, as our sizing calculation will have been based 4293 on 1 byte == 1 character, but we will need to use utf8 to encode 4294 at least some part of the pattern, and therefore must convert the whole 4295 thing. 4296 XXX: somehow figure out how to make this less expensive... 4297 -- dmq */ 4298 STRLEN len = plen; 4299 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, 4300 "UTF8 mismatch! Converting to utf8 for resizing and compile\n")); 4301 exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len); 4302 xend = exp + len; 4303 RExC_orig_utf8 = RExC_utf8; 4304 SAVEFREEPV(exp); 4305 goto redo_first_pass; 4306 } 4307 DEBUG_PARSE_r({ 4308 PerlIO_printf(Perl_debug_log, 4309 "Required size %"IVdf" nodes\n" 4310 "Starting second pass (creation)\n", 4311 (IV)RExC_size); 4312 RExC_lastnum=0; 4313 RExC_lastparse=NULL; 4314 }); 4315 /* Small enough for pointer-storage convention? 4316 If extralen==0, this means that we will not need long jumps. */ 4317 if (RExC_size >= 0x10000L && RExC_extralen) 4318 RExC_size += RExC_extralen; 4319 else 4320 RExC_extralen = 0; 4321 if (RExC_whilem_seen > 15) 4322 RExC_whilem_seen = 15; 4323 4324 /* Allocate space and zero-initialize. Note, the two step process 4325 of zeroing when in debug mode, thus anything assigned has to 4326 happen after that */ 4327 rx = (REGEXP*) newSV_type(SVt_REGEXP); 4328 r = (struct regexp*)SvANY(rx); 4329 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), 4330 char, regexp_internal); 4331 if ( r == NULL || ri == NULL ) 4332 FAIL("Regexp out of space"); 4333 #ifdef DEBUGGING 4334 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */ 4335 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char); 4336 #else 4337 /* bulk initialize base fields with 0. */ 4338 Zero(ri, sizeof(regexp_internal), char); 4339 #endif 4340 4341 /* non-zero initialization begins here */ 4342 RXi_SET( r, ri ); 4343 r->engine= RE_ENGINE_PTR; 4344 r->extflags = pm_flags; 4345 { 4346 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY); 4347 bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD); 4348 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT); 4349 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD) 4350 >> RXf_PMf_STD_PMMOD_SHIFT); 4351 const char *fptr = STD_PAT_MODS; /*"msix"*/ 4352 char *p; 4353 const STRLEN wraplen = plen + has_minus + has_p + has_runon 4354 + (sizeof(STD_PAT_MODS) - 1) 4355 + (sizeof("(?:)") - 1); 4356 4357 p = sv_grow(MUTABLE_SV(rx), wraplen + 1); 4358 SvCUR_set(rx, wraplen); 4359 SvPOK_on(rx); 4360 SvFLAGS(rx) |= SvUTF8(pattern); 4361 *p++='('; *p++='?'; 4362 if (has_p) 4363 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/ 4364 { 4365 char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1; 4366 char *colon = r + 1; 4367 char ch; 4368 4369 while((ch = *fptr++)) { 4370 if(reganch & 1) 4371 *p++ = ch; 4372 else 4373 *r-- = ch; 4374 reganch >>= 1; 4375 } 4376 if(has_minus) { 4377 *r = '-'; 4378 p = colon; 4379 } 4380 } 4381 4382 *p++ = ':'; 4383 Copy(RExC_precomp, p, plen, char); 4384 assert ((RX_WRAPPED(rx) - p) < 16); 4385 r->pre_prefix = p - RX_WRAPPED(rx); 4386 p += plen; 4387 if (has_runon) 4388 *p++ = '\n'; 4389 *p++ = ')'; 4390 *p = 0; 4391 } 4392 4393 r->intflags = 0; 4394 r->nparens = RExC_npar - 1; /* set early to validate backrefs */ 4395 4396 if (RExC_seen & REG_SEEN_RECURSE) { 4397 Newxz(RExC_open_parens, RExC_npar,regnode *); 4398 SAVEFREEPV(RExC_open_parens); 4399 Newxz(RExC_close_parens,RExC_npar,regnode *); 4400 SAVEFREEPV(RExC_close_parens); 4401 } 4402 4403 /* Useful during FAIL. */ 4404 #ifdef RE_TRACK_PATTERN_OFFSETS 4405 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */ 4406 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log, 4407 "%s %"UVuf" bytes for offset annotations.\n", 4408 ri->u.offsets ? "Got" : "Couldn't get", 4409 (UV)((2*RExC_size+1) * sizeof(U32)))); 4410 #endif 4411 SetProgLen(ri,RExC_size); 4412 RExC_rx_sv = rx; 4413 RExC_rx = r; 4414 RExC_rxi = ri; 4415 4416 /* Second pass: emit code. */ 4417 RExC_flags = pm_flags; /* don't let top level (?i) bleed */ 4418 RExC_parse = exp; 4419 RExC_end = xend; 4420 RExC_naughty = 0; 4421 RExC_npar = 1; 4422 RExC_emit_start = ri->program; 4423 RExC_emit = ri->program; 4424 RExC_emit_bound = ri->program + RExC_size + 1; 4425 4426 /* Store the count of eval-groups for security checks: */ 4427 RExC_rx->seen_evals = RExC_seen_evals; 4428 REGC((U8)REG_MAGIC, (char*) RExC_emit++); 4429 if (reg(pRExC_state, 0, &flags,1) == NULL) { 4430 ReREFCNT_dec(rx); 4431 return(NULL); 4432 } 4433 /* XXXX To minimize changes to RE engine we always allocate 4434 3-units-long substrs field. */ 4435 Newx(r->substrs, 1, struct reg_substr_data); 4436 if (RExC_recurse_count) { 4437 Newxz(RExC_recurse,RExC_recurse_count,regnode *); 4438 SAVEFREEPV(RExC_recurse); 4439 } 4440 4441 reStudy: 4442 r->minlen = minlen = sawplus = sawopen = 0; 4443 Zero(r->substrs, 1, struct reg_substr_data); 4444 4445 #ifdef TRIE_STUDY_OPT 4446 if (!restudied) { 4447 StructCopy(&zero_scan_data, &data, scan_data_t); 4448 copyRExC_state = RExC_state; 4449 } else { 4450 U32 seen=RExC_seen; 4451 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n")); 4452 4453 RExC_state = copyRExC_state; 4454 if (seen & REG_TOP_LEVEL_BRANCHES) 4455 RExC_seen |= REG_TOP_LEVEL_BRANCHES; 4456 else 4457 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES; 4458 if (data.last_found) { 4459 SvREFCNT_dec(data.longest_fixed); 4460 SvREFCNT_dec(data.longest_float); 4461 SvREFCNT_dec(data.last_found); 4462 } 4463 StructCopy(&zero_scan_data, &data, scan_data_t); 4464 } 4465 #else 4466 StructCopy(&zero_scan_data, &data, scan_data_t); 4467 #endif 4468 4469 /* Dig out information for optimizations. */ 4470 r->extflags = RExC_flags; /* was pm_op */ 4471 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */ 4472 4473 if (UTF) 4474 SvUTF8_on(rx); /* Unicode in it? */ 4475 ri->regstclass = NULL; 4476 if (RExC_naughty >= 10) /* Probably an expensive pattern. */ 4477 r->intflags |= PREGf_NAUGHTY; 4478 scan = ri->program + 1; /* First BRANCH. */ 4479 4480 /* testing for BRANCH here tells us whether there is "must appear" 4481 data in the pattern. If there is then we can use it for optimisations */ 4482 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */ 4483 I32 fake; 4484 STRLEN longest_float_length, longest_fixed_length; 4485 struct regnode_charclass_class ch_class; /* pointed to by data */ 4486 int stclass_flag; 4487 I32 last_close = 0; /* pointed to by data */ 4488 regnode *first= scan; 4489 regnode *first_next= regnext(first); 4490 4491 /* 4492 * Skip introductions and multiplicators >= 1 4493 * so that we can extract the 'meat' of the pattern that must 4494 * match in the large if() sequence following. 4495 * NOTE that EXACT is NOT covered here, as it is normally 4496 * picked up by the optimiser separately. 4497 * 4498 * This is unfortunate as the optimiser isnt handling lookahead 4499 * properly currently. 4500 * 4501 */ 4502 while ((OP(first) == OPEN && (sawopen = 1)) || 4503 /* An OR of *one* alternative - should not happen now. */ 4504 (OP(first) == BRANCH && OP(first_next) != BRANCH) || 4505 /* for now we can't handle lookbehind IFMATCH*/ 4506 (OP(first) == IFMATCH && !first->flags) || 4507 (OP(first) == PLUS) || 4508 (OP(first) == MINMOD) || 4509 /* An {n,m} with n>0 */ 4510 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) || 4511 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END )) 4512 { 4513 /* 4514 * the only op that could be a regnode is PLUS, all the rest 4515 * will be regnode_1 or regnode_2. 4516 * 4517 */ 4518 if (OP(first) == PLUS) 4519 sawplus = 1; 4520 else 4521 first += regarglen[OP(first)]; 4522 4523 first = NEXTOPER(first); 4524 first_next= regnext(first); 4525 } 4526 4527 /* Starting-point info. */ 4528 again: 4529 DEBUG_PEEP("first:",first,0); 4530 /* Ignore EXACT as we deal with it later. */ 4531 if (PL_regkind[OP(first)] == EXACT) { 4532 if (OP(first) == EXACT) 4533 NOOP; /* Empty, get anchored substr later. */ 4534 else if ((OP(first) == EXACTF || OP(first) == EXACTFL)) 4535 ri->regstclass = first; 4536 } 4537 #ifdef TRIE_STCLASS 4538 else if (PL_regkind[OP(first)] == TRIE && 4539 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0) 4540 { 4541 regnode *trie_op; 4542 /* this can happen only on restudy */ 4543 if ( OP(first) == TRIE ) { 4544 struct regnode_1 *trieop = (struct regnode_1 *) 4545 PerlMemShared_calloc(1, sizeof(struct regnode_1)); 4546 StructCopy(first,trieop,struct regnode_1); 4547 trie_op=(regnode *)trieop; 4548 } else { 4549 struct regnode_charclass *trieop = (struct regnode_charclass *) 4550 PerlMemShared_calloc(1, sizeof(struct regnode_charclass)); 4551 StructCopy(first,trieop,struct regnode_charclass); 4552 trie_op=(regnode *)trieop; 4553 } 4554 OP(trie_op)+=2; 4555 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0); 4556 ri->regstclass = trie_op; 4557 } 4558 #endif 4559 else if (strchr((const char*)PL_simple,OP(first))) 4560 ri->regstclass = first; 4561 else if (PL_regkind[OP(first)] == BOUND || 4562 PL_regkind[OP(first)] == NBOUND) 4563 ri->regstclass = first; 4564 else if (PL_regkind[OP(first)] == BOL) { 4565 r->extflags |= (OP(first) == MBOL 4566 ? RXf_ANCH_MBOL 4567 : (OP(first) == SBOL 4568 ? RXf_ANCH_SBOL 4569 : RXf_ANCH_BOL)); 4570 first = NEXTOPER(first); 4571 goto again; 4572 } 4573 else if (OP(first) == GPOS) { 4574 r->extflags |= RXf_ANCH_GPOS; 4575 first = NEXTOPER(first); 4576 goto again; 4577 } 4578 else if ((!sawopen || !RExC_sawback) && 4579 (OP(first) == STAR && 4580 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) && 4581 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL)) 4582 { 4583 /* turn .* into ^.* with an implied $*=1 */ 4584 const int type = 4585 (OP(NEXTOPER(first)) == REG_ANY) 4586 ? RXf_ANCH_MBOL 4587 : RXf_ANCH_SBOL; 4588 r->extflags |= type; 4589 r->intflags |= PREGf_IMPLICIT; 4590 first = NEXTOPER(first); 4591 goto again; 4592 } 4593 if (sawplus && (!sawopen || !RExC_sawback) 4594 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */ 4595 /* x+ must match at the 1st pos of run of x's */ 4596 r->intflags |= PREGf_SKIP; 4597 4598 /* Scan is after the zeroth branch, first is atomic matcher. */ 4599 #ifdef TRIE_STUDY_OPT 4600 DEBUG_PARSE_r( 4601 if (!restudied) 4602 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n", 4603 (IV)(first - scan + 1)) 4604 ); 4605 #else 4606 DEBUG_PARSE_r( 4607 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n", 4608 (IV)(first - scan + 1)) 4609 ); 4610 #endif 4611 4612 4613 /* 4614 * If there's something expensive in the r.e., find the 4615 * longest literal string that must appear and make it the 4616 * regmust. Resolve ties in favor of later strings, since 4617 * the regstart check works with the beginning of the r.e. 4618 * and avoiding duplication strengthens checking. Not a 4619 * strong reason, but sufficient in the absence of others. 4620 * [Now we resolve ties in favor of the earlier string if 4621 * it happens that c_offset_min has been invalidated, since the 4622 * earlier string may buy us something the later one won't.] 4623 */ 4624 4625 data.longest_fixed = newSVpvs(""); 4626 data.longest_float = newSVpvs(""); 4627 data.last_found = newSVpvs(""); 4628 data.longest = &(data.longest_fixed); 4629 first = scan; 4630 if (!ri->regstclass) { 4631 cl_init(pRExC_state, &ch_class); 4632 data.start_class = &ch_class; 4633 stclass_flag = SCF_DO_STCLASS_AND; 4634 } else /* XXXX Check for BOUND? */ 4635 stclass_flag = 0; 4636 data.last_closep = &last_close; 4637 4638 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */ 4639 &data, -1, NULL, NULL, 4640 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0); 4641 4642 4643 CHECK_RESTUDY_GOTO; 4644 4645 4646 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed) 4647 && data.last_start_min == 0 && data.last_end > 0 4648 && !RExC_seen_zerolen 4649 && !(RExC_seen & REG_SEEN_VERBARG) 4650 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS))) 4651 r->extflags |= RXf_CHECK_ALL; 4652 scan_commit(pRExC_state, &data,&minlen,0); 4653 SvREFCNT_dec(data.last_found); 4654 4655 /* Note that code very similar to this but for anchored string 4656 follows immediately below, changes may need to be made to both. 4657 Be careful. 4658 */ 4659 longest_float_length = CHR_SVLEN(data.longest_float); 4660 if (longest_float_length 4661 || (data.flags & SF_FL_BEFORE_EOL 4662 && (!(data.flags & SF_FL_BEFORE_MEOL) 4663 || (RExC_flags & RXf_PMf_MULTILINE)))) 4664 { 4665 I32 t,ml; 4666 4667 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */ 4668 && data.offset_fixed == data.offset_float_min 4669 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)) 4670 goto remove_float; /* As in (a)+. */ 4671 4672 /* copy the information about the longest float from the reg_scan_data 4673 over to the program. */ 4674 if (SvUTF8(data.longest_float)) { 4675 r->float_utf8 = data.longest_float; 4676 r->float_substr = NULL; 4677 } else { 4678 r->float_substr = data.longest_float; 4679 r->float_utf8 = NULL; 4680 } 4681 /* float_end_shift is how many chars that must be matched that 4682 follow this item. We calculate it ahead of time as once the 4683 lookbehind offset is added in we lose the ability to correctly 4684 calculate it.*/ 4685 ml = data.minlen_float ? *(data.minlen_float) 4686 : (I32)longest_float_length; 4687 r->float_end_shift = ml - data.offset_float_min 4688 - longest_float_length + (SvTAIL(data.longest_float) != 0) 4689 + data.lookbehind_float; 4690 r->float_min_offset = data.offset_float_min - data.lookbehind_float; 4691 r->float_max_offset = data.offset_float_max; 4692 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */ 4693 r->float_max_offset -= data.lookbehind_float; 4694 4695 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */ 4696 && (!(data.flags & SF_FL_BEFORE_MEOL) 4697 || (RExC_flags & RXf_PMf_MULTILINE))); 4698 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0); 4699 } 4700 else { 4701 remove_float: 4702 r->float_substr = r->float_utf8 = NULL; 4703 SvREFCNT_dec(data.longest_float); 4704 longest_float_length = 0; 4705 } 4706 4707 /* Note that code very similar to this but for floating string 4708 is immediately above, changes may need to be made to both. 4709 Be careful. 4710 */ 4711 longest_fixed_length = CHR_SVLEN(data.longest_fixed); 4712 if (longest_fixed_length 4713 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */ 4714 && (!(data.flags & SF_FIX_BEFORE_MEOL) 4715 || (RExC_flags & RXf_PMf_MULTILINE)))) 4716 { 4717 I32 t,ml; 4718 4719 /* copy the information about the longest fixed 4720 from the reg_scan_data over to the program. */ 4721 if (SvUTF8(data.longest_fixed)) { 4722 r->anchored_utf8 = data.longest_fixed; 4723 r->anchored_substr = NULL; 4724 } else { 4725 r->anchored_substr = data.longest_fixed; 4726 r->anchored_utf8 = NULL; 4727 } 4728 /* fixed_end_shift is how many chars that must be matched that 4729 follow this item. We calculate it ahead of time as once the 4730 lookbehind offset is added in we lose the ability to correctly 4731 calculate it.*/ 4732 ml = data.minlen_fixed ? *(data.minlen_fixed) 4733 : (I32)longest_fixed_length; 4734 r->anchored_end_shift = ml - data.offset_fixed 4735 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0) 4736 + data.lookbehind_fixed; 4737 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed; 4738 4739 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */ 4740 && (!(data.flags & SF_FIX_BEFORE_MEOL) 4741 || (RExC_flags & RXf_PMf_MULTILINE))); 4742 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0); 4743 } 4744 else { 4745 r->anchored_substr = r->anchored_utf8 = NULL; 4746 SvREFCNT_dec(data.longest_fixed); 4747 longest_fixed_length = 0; 4748 } 4749 if (ri->regstclass 4750 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY)) 4751 ri->regstclass = NULL; 4752 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset) 4753 && stclass_flag 4754 && !(data.start_class->flags & ANYOF_EOS) 4755 && !cl_is_anything(data.start_class)) 4756 { 4757 const U32 n = add_data(pRExC_state, 1, "f"); 4758 4759 Newx(RExC_rxi->data->data[n], 1, 4760 struct regnode_charclass_class); 4761 StructCopy(data.start_class, 4762 (struct regnode_charclass_class*)RExC_rxi->data->data[n], 4763 struct regnode_charclass_class); 4764 ri->regstclass = (regnode*)RExC_rxi->data->data[n]; 4765 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */ 4766 DEBUG_COMPILE_r({ SV *sv = sv_newmortal(); 4767 regprop(r, sv, (regnode*)data.start_class); 4768 PerlIO_printf(Perl_debug_log, 4769 "synthetic stclass \"%s\".\n", 4770 SvPVX_const(sv));}); 4771 } 4772 4773 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */ 4774 if (longest_fixed_length > longest_float_length) { 4775 r->check_end_shift = r->anchored_end_shift; 4776 r->check_substr = r->anchored_substr; 4777 r->check_utf8 = r->anchored_utf8; 4778 r->check_offset_min = r->check_offset_max = r->anchored_offset; 4779 if (r->extflags & RXf_ANCH_SINGLE) 4780 r->extflags |= RXf_NOSCAN; 4781 } 4782 else { 4783 r->check_end_shift = r->float_end_shift; 4784 r->check_substr = r->float_substr; 4785 r->check_utf8 = r->float_utf8; 4786 r->check_offset_min = r->float_min_offset; 4787 r->check_offset_max = r->float_max_offset; 4788 } 4789 /* XXXX Currently intuiting is not compatible with ANCH_GPOS. 4790 This should be changed ASAP! */ 4791 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) { 4792 r->extflags |= RXf_USE_INTUIT; 4793 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8)) 4794 r->extflags |= RXf_INTUIT_TAIL; 4795 } 4796 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere) 4797 if ( (STRLEN)minlen < longest_float_length ) 4798 minlen= longest_float_length; 4799 if ( (STRLEN)minlen < longest_fixed_length ) 4800 minlen= longest_fixed_length; 4801 */ 4802 } 4803 else { 4804 /* Several toplevels. Best we can is to set minlen. */ 4805 I32 fake; 4806 struct regnode_charclass_class ch_class; 4807 I32 last_close = 0; 4808 4809 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n")); 4810 4811 scan = ri->program + 1; 4812 cl_init(pRExC_state, &ch_class); 4813 data.start_class = &ch_class; 4814 data.last_closep = &last_close; 4815 4816 4817 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size, 4818 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0); 4819 4820 CHECK_RESTUDY_GOTO; 4821 4822 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8 4823 = r->float_substr = r->float_utf8 = NULL; 4824 if (!(data.start_class->flags & ANYOF_EOS) 4825 && !cl_is_anything(data.start_class)) 4826 { 4827 const U32 n = add_data(pRExC_state, 1, "f"); 4828 4829 Newx(RExC_rxi->data->data[n], 1, 4830 struct regnode_charclass_class); 4831 StructCopy(data.start_class, 4832 (struct regnode_charclass_class*)RExC_rxi->data->data[n], 4833 struct regnode_charclass_class); 4834 ri->regstclass = (regnode*)RExC_rxi->data->data[n]; 4835 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */ 4836 DEBUG_COMPILE_r({ SV* sv = sv_newmortal(); 4837 regprop(r, sv, (regnode*)data.start_class); 4838 PerlIO_printf(Perl_debug_log, 4839 "synthetic stclass \"%s\".\n", 4840 SvPVX_const(sv));}); 4841 } 4842 } 4843 4844 /* Guard against an embedded (?=) or (?<=) with a longer minlen than 4845 the "real" pattern. */ 4846 DEBUG_OPTIMISE_r({ 4847 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n", 4848 (IV)minlen, (IV)r->minlen); 4849 }); 4850 r->minlenret = minlen; 4851 if (r->minlen < minlen) 4852 r->minlen = minlen; 4853 4854 if (RExC_seen & REG_SEEN_GPOS) 4855 r->extflags |= RXf_GPOS_SEEN; 4856 if (RExC_seen & REG_SEEN_LOOKBEHIND) 4857 r->extflags |= RXf_LOOKBEHIND_SEEN; 4858 if (RExC_seen & REG_SEEN_EVAL) 4859 r->extflags |= RXf_EVAL_SEEN; 4860 if (RExC_seen & REG_SEEN_CANY) 4861 r->extflags |= RXf_CANY_SEEN; 4862 if (RExC_seen & REG_SEEN_VERBARG) 4863 r->intflags |= PREGf_VERBARG_SEEN; 4864 if (RExC_seen & REG_SEEN_CUTGROUP) 4865 r->intflags |= PREGf_CUTGROUP_SEEN; 4866 if (RExC_paren_names) 4867 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names)); 4868 else 4869 RXp_PAREN_NAMES(r) = NULL; 4870 4871 #ifdef STUPID_PATTERN_CHECKS 4872 if (RX_PRELEN(rx) == 0) 4873 r->extflags |= RXf_NULL; 4874 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ') 4875 /* XXX: this should happen BEFORE we compile */ 4876 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); 4877 else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3)) 4878 r->extflags |= RXf_WHITE; 4879 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^') 4880 r->extflags |= RXf_START_ONLY; 4881 #else 4882 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ') 4883 /* XXX: this should happen BEFORE we compile */ 4884 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); 4885 else { 4886 regnode *first = ri->program + 1; 4887 U8 fop = OP(first); 4888 U8 nop = OP(NEXTOPER(first)); 4889 4890 if (PL_regkind[fop] == NOTHING && nop == END) 4891 r->extflags |= RXf_NULL; 4892 else if (PL_regkind[fop] == BOL && nop == END) 4893 r->extflags |= RXf_START_ONLY; 4894 else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END) 4895 r->extflags |= RXf_WHITE; 4896 } 4897 #endif 4898 #ifdef DEBUGGING 4899 if (RExC_paren_names) { 4900 ri->name_list_idx = add_data( pRExC_state, 1, "p" ); 4901 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list); 4902 } else 4903 #endif 4904 ri->name_list_idx = 0; 4905 4906 if (RExC_recurse_count) { 4907 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) { 4908 const regnode *scan = RExC_recurse[RExC_recurse_count-1]; 4909 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan ); 4910 } 4911 } 4912 Newxz(r->offs, RExC_npar, regexp_paren_pair); 4913 /* assume we don't need to swap parens around before we match */ 4914 4915 DEBUG_DUMP_r({ 4916 PerlIO_printf(Perl_debug_log,"Final program:\n"); 4917 regdump(r); 4918 }); 4919 #ifdef RE_TRACK_PATTERN_OFFSETS 4920 DEBUG_OFFSETS_r(if (ri->u.offsets) { 4921 const U32 len = ri->u.offsets[0]; 4922 U32 i; 4923 GET_RE_DEBUG_FLAGS_DECL; 4924 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]); 4925 for (i = 1; i <= len; i++) { 4926 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2]) 4927 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ", 4928 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]); 4929 } 4930 PerlIO_printf(Perl_debug_log, "\n"); 4931 }); 4932 #endif 4933 return rx; 4934 } 4935 4936 #undef RE_ENGINE_PTR 4937 4938 4939 SV* 4940 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value, 4941 const U32 flags) 4942 { 4943 PERL_ARGS_ASSERT_REG_NAMED_BUFF; 4944 4945 PERL_UNUSED_ARG(value); 4946 4947 if (flags & RXapif_FETCH) { 4948 return reg_named_buff_fetch(rx, key, flags); 4949 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) { 4950 Perl_croak(aTHX_ "%s", PL_no_modify); 4951 return NULL; 4952 } else if (flags & RXapif_EXISTS) { 4953 return reg_named_buff_exists(rx, key, flags) 4954 ? &PL_sv_yes 4955 : &PL_sv_no; 4956 } else if (flags & RXapif_REGNAMES) { 4957 return reg_named_buff_all(rx, flags); 4958 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) { 4959 return reg_named_buff_scalar(rx, flags); 4960 } else { 4961 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags); 4962 return NULL; 4963 } 4964 } 4965 4966 SV* 4967 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey, 4968 const U32 flags) 4969 { 4970 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER; 4971 PERL_UNUSED_ARG(lastkey); 4972 4973 if (flags & RXapif_FIRSTKEY) 4974 return reg_named_buff_firstkey(rx, flags); 4975 else if (flags & RXapif_NEXTKEY) 4976 return reg_named_buff_nextkey(rx, flags); 4977 else { 4978 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags); 4979 return NULL; 4980 } 4981 } 4982 4983 SV* 4984 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv, 4985 const U32 flags) 4986 { 4987 AV *retarray = NULL; 4988 SV *ret; 4989 struct regexp *const rx = (struct regexp *)SvANY(r); 4990 4991 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH; 4992 4993 if (flags & RXapif_ALL) 4994 retarray=newAV(); 4995 4996 if (rx && RXp_PAREN_NAMES(rx)) { 4997 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 ); 4998 if (he_str) { 4999 IV i; 5000 SV* sv_dat=HeVAL(he_str); 5001 I32 *nums=(I32*)SvPVX(sv_dat); 5002 for ( i=0; i<SvIVX(sv_dat); i++ ) { 5003 if ((I32)(rx->nparens) >= nums[i] 5004 && rx->offs[nums[i]].start != -1 5005 && rx->offs[nums[i]].end != -1) 5006 { 5007 ret = newSVpvs(""); 5008 CALLREG_NUMBUF_FETCH(r,nums[i],ret); 5009 if (!retarray) 5010 return ret; 5011 } else { 5012 ret = newSVsv(&PL_sv_undef); 5013 } 5014 if (retarray) 5015 av_push(retarray, ret); 5016 } 5017 if (retarray) 5018 return newRV_noinc(MUTABLE_SV(retarray)); 5019 } 5020 } 5021 return NULL; 5022 } 5023 5024 bool 5025 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key, 5026 const U32 flags) 5027 { 5028 struct regexp *const rx = (struct regexp *)SvANY(r); 5029 5030 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS; 5031 5032 if (rx && RXp_PAREN_NAMES(rx)) { 5033 if (flags & RXapif_ALL) { 5034 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0); 5035 } else { 5036 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags); 5037 if (sv) { 5038 SvREFCNT_dec(sv); 5039 return TRUE; 5040 } else { 5041 return FALSE; 5042 } 5043 } 5044 } else { 5045 return FALSE; 5046 } 5047 } 5048 5049 SV* 5050 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags) 5051 { 5052 struct regexp *const rx = (struct regexp *)SvANY(r); 5053 5054 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY; 5055 5056 if ( rx && RXp_PAREN_NAMES(rx) ) { 5057 (void)hv_iterinit(RXp_PAREN_NAMES(rx)); 5058 5059 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY); 5060 } else { 5061 return FALSE; 5062 } 5063 } 5064 5065 SV* 5066 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags) 5067 { 5068 struct regexp *const rx = (struct regexp *)SvANY(r); 5069 GET_RE_DEBUG_FLAGS_DECL; 5070 5071 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY; 5072 5073 if (rx && RXp_PAREN_NAMES(rx)) { 5074 HV *hv = RXp_PAREN_NAMES(rx); 5075 HE *temphe; 5076 while ( (temphe = hv_iternext_flags(hv,0)) ) { 5077 IV i; 5078 IV parno = 0; 5079 SV* sv_dat = HeVAL(temphe); 5080 I32 *nums = (I32*)SvPVX(sv_dat); 5081 for ( i = 0; i < SvIVX(sv_dat); i++ ) { 5082 if ((I32)(rx->lastparen) >= nums[i] && 5083 rx->offs[nums[i]].start != -1 && 5084 rx->offs[nums[i]].end != -1) 5085 { 5086 parno = nums[i]; 5087 break; 5088 } 5089 } 5090 if (parno || flags & RXapif_ALL) { 5091 return newSVhek(HeKEY_hek(temphe)); 5092 } 5093 } 5094 } 5095 return NULL; 5096 } 5097 5098 SV* 5099 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags) 5100 { 5101 SV *ret; 5102 AV *av; 5103 I32 length; 5104 struct regexp *const rx = (struct regexp *)SvANY(r); 5105 5106 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR; 5107 5108 if (rx && RXp_PAREN_NAMES(rx)) { 5109 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) { 5110 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx))); 5111 } else if (flags & RXapif_ONE) { 5112 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES)); 5113 av = MUTABLE_AV(SvRV(ret)); 5114 length = av_len(av); 5115 SvREFCNT_dec(ret); 5116 return newSViv(length + 1); 5117 } else { 5118 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags); 5119 return NULL; 5120 } 5121 } 5122 return &PL_sv_undef; 5123 } 5124 5125 SV* 5126 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags) 5127 { 5128 struct regexp *const rx = (struct regexp *)SvANY(r); 5129 AV *av = newAV(); 5130 5131 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL; 5132 5133 if (rx && RXp_PAREN_NAMES(rx)) { 5134 HV *hv= RXp_PAREN_NAMES(rx); 5135 HE *temphe; 5136 (void)hv_iterinit(hv); 5137 while ( (temphe = hv_iternext_flags(hv,0)) ) { 5138 IV i; 5139 IV parno = 0; 5140 SV* sv_dat = HeVAL(temphe); 5141 I32 *nums = (I32*)SvPVX(sv_dat); 5142 for ( i = 0; i < SvIVX(sv_dat); i++ ) { 5143 if ((I32)(rx->lastparen) >= nums[i] && 5144 rx->offs[nums[i]].start != -1 && 5145 rx->offs[nums[i]].end != -1) 5146 { 5147 parno = nums[i]; 5148 break; 5149 } 5150 } 5151 if (parno || flags & RXapif_ALL) { 5152 av_push(av, newSVhek(HeKEY_hek(temphe))); 5153 } 5154 } 5155 } 5156 5157 return newRV_noinc(MUTABLE_SV(av)); 5158 } 5159 5160 void 5161 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, 5162 SV * const sv) 5163 { 5164 struct regexp *const rx = (struct regexp *)SvANY(r); 5165 char *s = NULL; 5166 I32 i = 0; 5167 I32 s1, t1; 5168 5169 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH; 5170 5171 if (!rx->subbeg) { 5172 sv_setsv(sv,&PL_sv_undef); 5173 return; 5174 } 5175 else 5176 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) { 5177 /* $` */ 5178 i = rx->offs[0].start; 5179 s = rx->subbeg; 5180 } 5181 else 5182 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) { 5183 /* $' */ 5184 s = rx->subbeg + rx->offs[0].end; 5185 i = rx->sublen - rx->offs[0].end; 5186 } 5187 else 5188 if ( 0 <= paren && paren <= (I32)rx->nparens && 5189 (s1 = rx->offs[paren].start) != -1 && 5190 (t1 = rx->offs[paren].end) != -1) 5191 { 5192 /* $& $1 ... */ 5193 i = t1 - s1; 5194 s = rx->subbeg + s1; 5195 } else { 5196 sv_setsv(sv,&PL_sv_undef); 5197 return; 5198 } 5199 assert(rx->sublen >= (s - rx->subbeg) + i ); 5200 if (i >= 0) { 5201 const int oldtainted = PL_tainted; 5202 TAINT_NOT; 5203 sv_setpvn(sv, s, i); 5204 PL_tainted = oldtainted; 5205 if ( (rx->extflags & RXf_CANY_SEEN) 5206 ? (RXp_MATCH_UTF8(rx) 5207 && (!i || is_utf8_string((U8*)s, i))) 5208 : (RXp_MATCH_UTF8(rx)) ) 5209 { 5210 SvUTF8_on(sv); 5211 } 5212 else 5213 SvUTF8_off(sv); 5214 if (PL_tainting) { 5215 if (RXp_MATCH_TAINTED(rx)) { 5216 if (SvTYPE(sv) >= SVt_PVMG) { 5217 MAGIC* const mg = SvMAGIC(sv); 5218 MAGIC* mgt; 5219 PL_tainted = 1; 5220 SvMAGIC_set(sv, mg->mg_moremagic); 5221 SvTAINT(sv); 5222 if ((mgt = SvMAGIC(sv))) { 5223 mg->mg_moremagic = mgt; 5224 SvMAGIC_set(sv, mg); 5225 } 5226 } else { 5227 PL_tainted = 1; 5228 SvTAINT(sv); 5229 } 5230 } else 5231 SvTAINTED_off(sv); 5232 } 5233 } else { 5234 sv_setsv(sv,&PL_sv_undef); 5235 return; 5236 } 5237 } 5238 5239 void 5240 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren, 5241 SV const * const value) 5242 { 5243 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE; 5244 5245 PERL_UNUSED_ARG(rx); 5246 PERL_UNUSED_ARG(paren); 5247 PERL_UNUSED_ARG(value); 5248 5249 if (!PL_localizing) 5250 Perl_croak(aTHX_ "%s", PL_no_modify); 5251 } 5252 5253 I32 5254 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, 5255 const I32 paren) 5256 { 5257 struct regexp *const rx = (struct regexp *)SvANY(r); 5258 I32 i; 5259 I32 s1, t1; 5260 5261 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH; 5262 5263 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */ 5264 switch (paren) { 5265 /* $` / ${^PREMATCH} */ 5266 case RX_BUFF_IDX_PREMATCH: 5267 if (rx->offs[0].start != -1) { 5268 i = rx->offs[0].start; 5269 if (i > 0) { 5270 s1 = 0; 5271 t1 = i; 5272 goto getlen; 5273 } 5274 } 5275 return 0; 5276 /* $' / ${^POSTMATCH} */ 5277 case RX_BUFF_IDX_POSTMATCH: 5278 if (rx->offs[0].end != -1) { 5279 i = rx->sublen - rx->offs[0].end; 5280 if (i > 0) { 5281 s1 = rx->offs[0].end; 5282 t1 = rx->sublen; 5283 goto getlen; 5284 } 5285 } 5286 return 0; 5287 /* $& / ${^MATCH}, $1, $2, ... */ 5288 default: 5289 if (paren <= (I32)rx->nparens && 5290 (s1 = rx->offs[paren].start) != -1 && 5291 (t1 = rx->offs[paren].end) != -1) 5292 { 5293 i = t1 - s1; 5294 goto getlen; 5295 } else { 5296 if (ckWARN(WARN_UNINITIALIZED)) 5297 report_uninit((const SV *)sv); 5298 return 0; 5299 } 5300 } 5301 getlen: 5302 if (i > 0 && RXp_MATCH_UTF8(rx)) { 5303 const char * const s = rx->subbeg + s1; 5304 const U8 *ep; 5305 STRLEN el; 5306 5307 i = t1 - s1; 5308 if (is_utf8_string_loclen((U8*)s, i, &ep, &el)) 5309 i = el; 5310 } 5311 return i; 5312 } 5313 5314 SV* 5315 Perl_reg_qr_package(pTHX_ REGEXP * const rx) 5316 { 5317 PERL_ARGS_ASSERT_REG_QR_PACKAGE; 5318 PERL_UNUSED_ARG(rx); 5319 if (0) 5320 return NULL; 5321 else 5322 return newSVpvs("Regexp"); 5323 } 5324 5325 /* Scans the name of a named buffer from the pattern. 5326 * If flags is REG_RSN_RETURN_NULL returns null. 5327 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name 5328 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding 5329 * to the parsed name as looked up in the RExC_paren_names hash. 5330 * If there is an error throws a vFAIL().. type exception. 5331 */ 5332 5333 #define REG_RSN_RETURN_NULL 0 5334 #define REG_RSN_RETURN_NAME 1 5335 #define REG_RSN_RETURN_DATA 2 5336 5337 STATIC SV* 5338 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) 5339 { 5340 char *name_start = RExC_parse; 5341 5342 PERL_ARGS_ASSERT_REG_SCAN_NAME; 5343 5344 if (isIDFIRST_lazy_if(RExC_parse, UTF)) { 5345 /* skip IDFIRST by using do...while */ 5346 if (UTF) 5347 do { 5348 RExC_parse += UTF8SKIP(RExC_parse); 5349 } while (isALNUM_utf8((U8*)RExC_parse)); 5350 else 5351 do { 5352 RExC_parse++; 5353 } while (isALNUM(*RExC_parse)); 5354 } 5355 5356 if ( flags ) { 5357 SV* sv_name 5358 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start), 5359 SVs_TEMP | (UTF ? SVf_UTF8 : 0)); 5360 if ( flags == REG_RSN_RETURN_NAME) 5361 return sv_name; 5362 else if (flags==REG_RSN_RETURN_DATA) { 5363 HE *he_str = NULL; 5364 SV *sv_dat = NULL; 5365 if ( ! sv_name ) /* should not happen*/ 5366 Perl_croak(aTHX_ "panic: no svname in reg_scan_name"); 5367 if (RExC_paren_names) 5368 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 ); 5369 if ( he_str ) 5370 sv_dat = HeVAL(he_str); 5371 if ( ! sv_dat ) 5372 vFAIL("Reference to nonexistent named group"); 5373 return sv_dat; 5374 } 5375 else { 5376 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name"); 5377 } 5378 /* NOT REACHED */ 5379 } 5380 return NULL; 5381 } 5382 5383 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \ 5384 int rem=(int)(RExC_end - RExC_parse); \ 5385 int cut; \ 5386 int num; \ 5387 int iscut=0; \ 5388 if (rem>10) { \ 5389 rem=10; \ 5390 iscut=1; \ 5391 } \ 5392 cut=10-rem; \ 5393 if (RExC_lastparse!=RExC_parse) \ 5394 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \ 5395 rem, RExC_parse, \ 5396 cut + 4, \ 5397 iscut ? "..." : "<" \ 5398 ); \ 5399 else \ 5400 PerlIO_printf(Perl_debug_log,"%16s",""); \ 5401 \ 5402 if (SIZE_ONLY) \ 5403 num = RExC_size + 1; \ 5404 else \ 5405 num=REG_NODE_NUM(RExC_emit); \ 5406 if (RExC_lastnum!=num) \ 5407 PerlIO_printf(Perl_debug_log,"|%4d",num); \ 5408 else \ 5409 PerlIO_printf(Perl_debug_log,"|%4s",""); \ 5410 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \ 5411 (int)((depth*2)), "", \ 5412 (funcname) \ 5413 ); \ 5414 RExC_lastnum=num; \ 5415 RExC_lastparse=RExC_parse; \ 5416 }) 5417 5418 5419 5420 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \ 5421 DEBUG_PARSE_MSG((funcname)); \ 5422 PerlIO_printf(Perl_debug_log,"%4s","\n"); \ 5423 }) 5424 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \ 5425 DEBUG_PARSE_MSG((funcname)); \ 5426 PerlIO_printf(Perl_debug_log,fmt "\n",args); \ 5427 }) 5428 /* 5429 - reg - regular expression, i.e. main body or parenthesized thing 5430 * 5431 * Caller must absorb opening parenthesis. 5432 * 5433 * Combining parenthesis handling with the base level of regular expression 5434 * is a trifle forced, but the need to tie the tails of the branches to what 5435 * follows makes it hard to avoid. 5436 */ 5437 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1) 5438 #ifdef DEBUGGING 5439 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1) 5440 #else 5441 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1) 5442 #endif 5443 5444 STATIC regnode * 5445 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) 5446 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */ 5447 { 5448 dVAR; 5449 register regnode *ret; /* Will be the head of the group. */ 5450 register regnode *br; 5451 register regnode *lastbr; 5452 register regnode *ender = NULL; 5453 register I32 parno = 0; 5454 I32 flags; 5455 U32 oregflags = RExC_flags; 5456 bool have_branch = 0; 5457 bool is_open = 0; 5458 I32 freeze_paren = 0; 5459 I32 after_freeze = 0; 5460 5461 /* for (?g), (?gc), and (?o) warnings; warning 5462 about (?c) will warn about (?g) -- japhy */ 5463 5464 #define WASTED_O 0x01 5465 #define WASTED_G 0x02 5466 #define WASTED_C 0x04 5467 #define WASTED_GC (0x02|0x04) 5468 I32 wastedflags = 0x00; 5469 5470 char * parse_start = RExC_parse; /* MJD */ 5471 char * const oregcomp_parse = RExC_parse; 5472 5473 GET_RE_DEBUG_FLAGS_DECL; 5474 5475 PERL_ARGS_ASSERT_REG; 5476 DEBUG_PARSE("reg "); 5477 5478 *flagp = 0; /* Tentatively. */ 5479 5480 5481 /* Make an OPEN node, if parenthesized. */ 5482 if (paren) { 5483 if ( *RExC_parse == '*') { /* (*VERB:ARG) */ 5484 char *start_verb = RExC_parse; 5485 STRLEN verb_len = 0; 5486 char *start_arg = NULL; 5487 unsigned char op = 0; 5488 int argok = 1; 5489 int internal_argval = 0; /* internal_argval is only useful if !argok */ 5490 while ( *RExC_parse && *RExC_parse != ')' ) { 5491 if ( *RExC_parse == ':' ) { 5492 start_arg = RExC_parse + 1; 5493 break; 5494 } 5495 RExC_parse++; 5496 } 5497 ++start_verb; 5498 verb_len = RExC_parse - start_verb; 5499 if ( start_arg ) { 5500 RExC_parse++; 5501 while ( *RExC_parse && *RExC_parse != ')' ) 5502 RExC_parse++; 5503 if ( *RExC_parse != ')' ) 5504 vFAIL("Unterminated verb pattern argument"); 5505 if ( RExC_parse == start_arg ) 5506 start_arg = NULL; 5507 } else { 5508 if ( *RExC_parse != ')' ) 5509 vFAIL("Unterminated verb pattern"); 5510 } 5511 5512 switch ( *start_verb ) { 5513 case 'A': /* (*ACCEPT) */ 5514 if ( memEQs(start_verb,verb_len,"ACCEPT") ) { 5515 op = ACCEPT; 5516 internal_argval = RExC_nestroot; 5517 } 5518 break; 5519 case 'C': /* (*COMMIT) */ 5520 if ( memEQs(start_verb,verb_len,"COMMIT") ) 5521 op = COMMIT; 5522 break; 5523 case 'F': /* (*FAIL) */ 5524 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) { 5525 op = OPFAIL; 5526 argok = 0; 5527 } 5528 break; 5529 case ':': /* (*:NAME) */ 5530 case 'M': /* (*MARK:NAME) */ 5531 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) { 5532 op = MARKPOINT; 5533 argok = -1; 5534 } 5535 break; 5536 case 'P': /* (*PRUNE) */ 5537 if ( memEQs(start_verb,verb_len,"PRUNE") ) 5538 op = PRUNE; 5539 break; 5540 case 'S': /* (*SKIP) */ 5541 if ( memEQs(start_verb,verb_len,"SKIP") ) 5542 op = SKIP; 5543 break; 5544 case 'T': /* (*THEN) */ 5545 /* [19:06] <TimToady> :: is then */ 5546 if ( memEQs(start_verb,verb_len,"THEN") ) { 5547 op = CUTGROUP; 5548 RExC_seen |= REG_SEEN_CUTGROUP; 5549 } 5550 break; 5551 } 5552 if ( ! op ) { 5553 RExC_parse++; 5554 vFAIL3("Unknown verb pattern '%.*s'", 5555 verb_len, start_verb); 5556 } 5557 if ( argok ) { 5558 if ( start_arg && internal_argval ) { 5559 vFAIL3("Verb pattern '%.*s' may not have an argument", 5560 verb_len, start_verb); 5561 } else if ( argok < 0 && !start_arg ) { 5562 vFAIL3("Verb pattern '%.*s' has a mandatory argument", 5563 verb_len, start_verb); 5564 } else { 5565 ret = reganode(pRExC_state, op, internal_argval); 5566 if ( ! internal_argval && ! SIZE_ONLY ) { 5567 if (start_arg) { 5568 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg); 5569 ARG(ret) = add_data( pRExC_state, 1, "S" ); 5570 RExC_rxi->data->data[ARG(ret)]=(void*)sv; 5571 ret->flags = 0; 5572 } else { 5573 ret->flags = 1; 5574 } 5575 } 5576 } 5577 if (!internal_argval) 5578 RExC_seen |= REG_SEEN_VERBARG; 5579 } else if ( start_arg ) { 5580 vFAIL3("Verb pattern '%.*s' may not have an argument", 5581 verb_len, start_verb); 5582 } else { 5583 ret = reg_node(pRExC_state, op); 5584 } 5585 nextchar(pRExC_state); 5586 return ret; 5587 } else 5588 if (*RExC_parse == '?') { /* (?...) */ 5589 bool is_logical = 0; 5590 const char * const seqstart = RExC_parse; 5591 5592 RExC_parse++; 5593 paren = *RExC_parse++; 5594 ret = NULL; /* For look-ahead/behind. */ 5595 switch (paren) { 5596 5597 case 'P': /* (?P...) variants for those used to PCRE/Python */ 5598 paren = *RExC_parse++; 5599 if ( paren == '<') /* (?P<...>) named capture */ 5600 goto named_capture; 5601 else if (paren == '>') { /* (?P>name) named recursion */ 5602 goto named_recursion; 5603 } 5604 else if (paren == '=') { /* (?P=...) named backref */ 5605 /* this pretty much dupes the code for \k<NAME> in regatom(), if 5606 you change this make sure you change that */ 5607 char* name_start = RExC_parse; 5608 U32 num = 0; 5609 SV *sv_dat = reg_scan_name(pRExC_state, 5610 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); 5611 if (RExC_parse == name_start || *RExC_parse != ')') 5612 vFAIL2("Sequence %.3s... not terminated",parse_start); 5613 5614 if (!SIZE_ONLY) { 5615 num = add_data( pRExC_state, 1, "S" ); 5616 RExC_rxi->data->data[num]=(void*)sv_dat; 5617 SvREFCNT_inc_simple_void(sv_dat); 5618 } 5619 RExC_sawback = 1; 5620 ret = reganode(pRExC_state, 5621 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF), 5622 num); 5623 *flagp |= HASWIDTH; 5624 5625 Set_Node_Offset(ret, parse_start+1); 5626 Set_Node_Cur_Length(ret); /* MJD */ 5627 5628 nextchar(pRExC_state); 5629 return ret; 5630 } 5631 RExC_parse++; 5632 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); 5633 /*NOTREACHED*/ 5634 case '<': /* (?<...) */ 5635 if (*RExC_parse == '!') 5636 paren = ','; 5637 else if (*RExC_parse != '=') 5638 named_capture: 5639 { /* (?<...>) */ 5640 char *name_start; 5641 SV *svname; 5642 paren= '>'; 5643 case '\'': /* (?'...') */ 5644 name_start= RExC_parse; 5645 svname = reg_scan_name(pRExC_state, 5646 SIZE_ONLY ? /* reverse test from the others */ 5647 REG_RSN_RETURN_NAME : 5648 REG_RSN_RETURN_NULL); 5649 if (RExC_parse == name_start) { 5650 RExC_parse++; 5651 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); 5652 /*NOTREACHED*/ 5653 } 5654 if (*RExC_parse != paren) 5655 vFAIL2("Sequence (?%c... not terminated", 5656 paren=='>' ? '<' : paren); 5657 if (SIZE_ONLY) { 5658 HE *he_str; 5659 SV *sv_dat = NULL; 5660 if (!svname) /* shouldnt happen */ 5661 Perl_croak(aTHX_ 5662 "panic: reg_scan_name returned NULL"); 5663 if (!RExC_paren_names) { 5664 RExC_paren_names= newHV(); 5665 sv_2mortal(MUTABLE_SV(RExC_paren_names)); 5666 #ifdef DEBUGGING 5667 RExC_paren_name_list= newAV(); 5668 sv_2mortal(MUTABLE_SV(RExC_paren_name_list)); 5669 #endif 5670 } 5671 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 ); 5672 if ( he_str ) 5673 sv_dat = HeVAL(he_str); 5674 if ( ! sv_dat ) { 5675 /* croak baby croak */ 5676 Perl_croak(aTHX_ 5677 "panic: paren_name hash element allocation failed"); 5678 } else if ( SvPOK(sv_dat) ) { 5679 /* (?|...) can mean we have dupes so scan to check 5680 its already been stored. Maybe a flag indicating 5681 we are inside such a construct would be useful, 5682 but the arrays are likely to be quite small, so 5683 for now we punt -- dmq */ 5684 IV count = SvIV(sv_dat); 5685 I32 *pv = (I32*)SvPVX(sv_dat); 5686 IV i; 5687 for ( i = 0 ; i < count ; i++ ) { 5688 if ( pv[i] == RExC_npar ) { 5689 count = 0; 5690 break; 5691 } 5692 } 5693 if ( count ) { 5694 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1); 5695 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32)); 5696 pv[count] = RExC_npar; 5697 SvIV_set(sv_dat, SvIVX(sv_dat) + 1); 5698 } 5699 } else { 5700 (void)SvUPGRADE(sv_dat,SVt_PVNV); 5701 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32)); 5702 SvIOK_on(sv_dat); 5703 SvIV_set(sv_dat, 1); 5704 } 5705 #ifdef DEBUGGING 5706 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname))) 5707 SvREFCNT_dec(svname); 5708 #endif 5709 5710 /*sv_dump(sv_dat);*/ 5711 } 5712 nextchar(pRExC_state); 5713 paren = 1; 5714 goto capturing_parens; 5715 } 5716 RExC_seen |= REG_SEEN_LOOKBEHIND; 5717 RExC_parse++; 5718 case '=': /* (?=...) */ 5719 RExC_seen_zerolen++; 5720 break; 5721 case '!': /* (?!...) */ 5722 RExC_seen_zerolen++; 5723 if (*RExC_parse == ')') { 5724 ret=reg_node(pRExC_state, OPFAIL); 5725 nextchar(pRExC_state); 5726 return ret; 5727 } 5728 break; 5729 case '|': /* (?|...) */ 5730 /* branch reset, behave like a (?:...) except that 5731 buffers in alternations share the same numbers */ 5732 paren = ':'; 5733 after_freeze = freeze_paren = RExC_npar; 5734 break; 5735 case ':': /* (?:...) */ 5736 case '>': /* (?>...) */ 5737 break; 5738 case '$': /* (?$...) */ 5739 case '@': /* (?@...) */ 5740 vFAIL2("Sequence (?%c...) not implemented", (int)paren); 5741 break; 5742 case '#': /* (?#...) */ 5743 while (*RExC_parse && *RExC_parse != ')') 5744 RExC_parse++; 5745 if (*RExC_parse != ')') 5746 FAIL("Sequence (?#... not terminated"); 5747 nextchar(pRExC_state); 5748 *flagp = TRYAGAIN; 5749 return NULL; 5750 case '0' : /* (?0) */ 5751 case 'R' : /* (?R) */ 5752 if (*RExC_parse != ')') 5753 FAIL("Sequence (?R) not terminated"); 5754 ret = reg_node(pRExC_state, GOSTART); 5755 *flagp |= POSTPONED; 5756 nextchar(pRExC_state); 5757 return ret; 5758 /*notreached*/ 5759 { /* named and numeric backreferences */ 5760 I32 num; 5761 case '&': /* (?&NAME) */ 5762 parse_start = RExC_parse - 1; 5763 named_recursion: 5764 { 5765 SV *sv_dat = reg_scan_name(pRExC_state, 5766 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); 5767 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0; 5768 } 5769 goto gen_recurse_regop; 5770 /* NOT REACHED */ 5771 case '+': 5772 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) { 5773 RExC_parse++; 5774 vFAIL("Illegal pattern"); 5775 } 5776 goto parse_recursion; 5777 /* NOT REACHED*/ 5778 case '-': /* (?-1) */ 5779 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) { 5780 RExC_parse--; /* rewind to let it be handled later */ 5781 goto parse_flags; 5782 } 5783 /*FALLTHROUGH */ 5784 case '1': case '2': case '3': case '4': /* (?1) */ 5785 case '5': case '6': case '7': case '8': case '9': 5786 RExC_parse--; 5787 parse_recursion: 5788 num = atoi(RExC_parse); 5789 parse_start = RExC_parse - 1; /* MJD */ 5790 if (*RExC_parse == '-') 5791 RExC_parse++; 5792 while (isDIGIT(*RExC_parse)) 5793 RExC_parse++; 5794 if (*RExC_parse!=')') 5795 vFAIL("Expecting close bracket"); 5796 5797 gen_recurse_regop: 5798 if ( paren == '-' ) { 5799 /* 5800 Diagram of capture buffer numbering. 5801 Top line is the normal capture buffer numbers 5802 Botton line is the negative indexing as from 5803 the X (the (?-2)) 5804 5805 + 1 2 3 4 5 X 6 7 5806 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/ 5807 - 5 4 3 2 1 X x x 5808 5809 */ 5810 num = RExC_npar + num; 5811 if (num < 1) { 5812 RExC_parse++; 5813 vFAIL("Reference to nonexistent group"); 5814 } 5815 } else if ( paren == '+' ) { 5816 num = RExC_npar + num - 1; 5817 } 5818 5819 ret = reganode(pRExC_state, GOSUB, num); 5820 if (!SIZE_ONLY) { 5821 if (num > (I32)RExC_rx->nparens) { 5822 RExC_parse++; 5823 vFAIL("Reference to nonexistent group"); 5824 } 5825 ARG2L_SET( ret, RExC_recurse_count++); 5826 RExC_emit++; 5827 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, 5828 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret))); 5829 } else { 5830 RExC_size++; 5831 } 5832 RExC_seen |= REG_SEEN_RECURSE; 5833 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */ 5834 Set_Node_Offset(ret, parse_start); /* MJD */ 5835 5836 *flagp |= POSTPONED; 5837 nextchar(pRExC_state); 5838 return ret; 5839 } /* named and numeric backreferences */ 5840 /* NOT REACHED */ 5841 5842 case '?': /* (??...) */ 5843 is_logical = 1; 5844 if (*RExC_parse != '{') { 5845 RExC_parse++; 5846 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); 5847 /*NOTREACHED*/ 5848 } 5849 *flagp |= POSTPONED; 5850 paren = *RExC_parse++; 5851 /* FALL THROUGH */ 5852 case '{': /* (?{...}) */ 5853 { 5854 I32 count = 1; 5855 U32 n = 0; 5856 char c; 5857 char *s = RExC_parse; 5858 5859 RExC_seen_zerolen++; 5860 RExC_seen |= REG_SEEN_EVAL; 5861 while (count && (c = *RExC_parse)) { 5862 if (c == '\\') { 5863 if (RExC_parse[1]) 5864 RExC_parse++; 5865 } 5866 else if (c == '{') 5867 count++; 5868 else if (c == '}') 5869 count--; 5870 RExC_parse++; 5871 } 5872 if (*RExC_parse != ')') { 5873 RExC_parse = s; 5874 vFAIL("Sequence (?{...}) not terminated or not {}-balanced"); 5875 } 5876 if (!SIZE_ONLY) { 5877 PAD *pad; 5878 OP_4tree *sop, *rop; 5879 SV * const sv = newSVpvn(s, RExC_parse - 1 - s); 5880 5881 ENTER; 5882 Perl_save_re_context(aTHX); 5883 rop = sv_compile_2op(sv, &sop, "re", &pad); 5884 sop->op_private |= OPpREFCOUNTED; 5885 /* re_dup will OpREFCNT_inc */ 5886 OpREFCNT_set(sop, 1); 5887 LEAVE; 5888 5889 n = add_data(pRExC_state, 3, "nop"); 5890 RExC_rxi->data->data[n] = (void*)rop; 5891 RExC_rxi->data->data[n+1] = (void*)sop; 5892 RExC_rxi->data->data[n+2] = (void*)pad; 5893 SvREFCNT_dec(sv); 5894 } 5895 else { /* First pass */ 5896 if (PL_reginterp_cnt < ++RExC_seen_evals 5897 && IN_PERL_RUNTIME) 5898 /* No compiled RE interpolated, has runtime 5899 components ===> unsafe. */ 5900 FAIL("Eval-group not allowed at runtime, use re 'eval'"); 5901 if (PL_tainting && PL_tainted) 5902 FAIL("Eval-group in insecure regular expression"); 5903 #if PERL_VERSION > 8 5904 if (IN_PERL_COMPILETIME) 5905 PL_cv_has_eval = 1; 5906 #endif 5907 } 5908 5909 nextchar(pRExC_state); 5910 if (is_logical) { 5911 ret = reg_node(pRExC_state, LOGICAL); 5912 if (!SIZE_ONLY) 5913 ret->flags = 2; 5914 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n)); 5915 /* deal with the length of this later - MJD */ 5916 return ret; 5917 } 5918 ret = reganode(pRExC_state, EVAL, n); 5919 Set_Node_Length(ret, RExC_parse - parse_start + 1); 5920 Set_Node_Offset(ret, parse_start); 5921 return ret; 5922 } 5923 case '(': /* (?(?{...})...) and (?(?=...)...) */ 5924 { 5925 int is_define= 0; 5926 if (RExC_parse[0] == '?') { /* (?(?...)) */ 5927 if (RExC_parse[1] == '=' || RExC_parse[1] == '!' 5928 || RExC_parse[1] == '<' 5929 || RExC_parse[1] == '{') { /* Lookahead or eval. */ 5930 I32 flag; 5931 5932 ret = reg_node(pRExC_state, LOGICAL); 5933 if (!SIZE_ONLY) 5934 ret->flags = 1; 5935 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1)); 5936 goto insert_if; 5937 } 5938 } 5939 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */ 5940 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */ 5941 { 5942 char ch = RExC_parse[0] == '<' ? '>' : '\''; 5943 char *name_start= RExC_parse++; 5944 U32 num = 0; 5945 SV *sv_dat=reg_scan_name(pRExC_state, 5946 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); 5947 if (RExC_parse == name_start || *RExC_parse != ch) 5948 vFAIL2("Sequence (?(%c... not terminated", 5949 (ch == '>' ? '<' : ch)); 5950 RExC_parse++; 5951 if (!SIZE_ONLY) { 5952 num = add_data( pRExC_state, 1, "S" ); 5953 RExC_rxi->data->data[num]=(void*)sv_dat; 5954 SvREFCNT_inc_simple_void(sv_dat); 5955 } 5956 ret = reganode(pRExC_state,NGROUPP,num); 5957 goto insert_if_check_paren; 5958 } 5959 else if (RExC_parse[0] == 'D' && 5960 RExC_parse[1] == 'E' && 5961 RExC_parse[2] == 'F' && 5962 RExC_parse[3] == 'I' && 5963 RExC_parse[4] == 'N' && 5964 RExC_parse[5] == 'E') 5965 { 5966 ret = reganode(pRExC_state,DEFINEP,0); 5967 RExC_parse +=6 ; 5968 is_define = 1; 5969 goto insert_if_check_paren; 5970 } 5971 else if (RExC_parse[0] == 'R') { 5972 RExC_parse++; 5973 parno = 0; 5974 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) { 5975 parno = atoi(RExC_parse++); 5976 while (isDIGIT(*RExC_parse)) 5977 RExC_parse++; 5978 } else if (RExC_parse[0] == '&') { 5979 SV *sv_dat; 5980 RExC_parse++; 5981 sv_dat = reg_scan_name(pRExC_state, 5982 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); 5983 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0; 5984 } 5985 ret = reganode(pRExC_state,INSUBP,parno); 5986 goto insert_if_check_paren; 5987 } 5988 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) { 5989 /* (?(1)...) */ 5990 char c; 5991 parno = atoi(RExC_parse++); 5992 5993 while (isDIGIT(*RExC_parse)) 5994 RExC_parse++; 5995 ret = reganode(pRExC_state, GROUPP, parno); 5996 5997 insert_if_check_paren: 5998 if ((c = *nextchar(pRExC_state)) != ')') 5999 vFAIL("Switch condition not recognized"); 6000 insert_if: 6001 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0)); 6002 br = regbranch(pRExC_state, &flags, 1,depth+1); 6003 if (br == NULL) 6004 br = reganode(pRExC_state, LONGJMP, 0); 6005 else 6006 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0)); 6007 c = *nextchar(pRExC_state); 6008 if (flags&HASWIDTH) 6009 *flagp |= HASWIDTH; 6010 if (c == '|') { 6011 if (is_define) 6012 vFAIL("(?(DEFINE)....) does not allow branches"); 6013 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */ 6014 regbranch(pRExC_state, &flags, 1,depth+1); 6015 REGTAIL(pRExC_state, ret, lastbr); 6016 if (flags&HASWIDTH) 6017 *flagp |= HASWIDTH; 6018 c = *nextchar(pRExC_state); 6019 } 6020 else 6021 lastbr = NULL; 6022 if (c != ')') 6023 vFAIL("Switch (?(condition)... contains too many branches"); 6024 ender = reg_node(pRExC_state, TAIL); 6025 REGTAIL(pRExC_state, br, ender); 6026 if (lastbr) { 6027 REGTAIL(pRExC_state, lastbr, ender); 6028 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); 6029 } 6030 else 6031 REGTAIL(pRExC_state, ret, ender); 6032 RExC_size++; /* XXX WHY do we need this?!! 6033 For large programs it seems to be required 6034 but I can't figure out why. -- dmq*/ 6035 return ret; 6036 } 6037 else { 6038 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse); 6039 } 6040 } 6041 case 0: 6042 RExC_parse--; /* for vFAIL to print correctly */ 6043 vFAIL("Sequence (? incomplete"); 6044 break; 6045 default: 6046 --RExC_parse; 6047 parse_flags: /* (?i) */ 6048 { 6049 U32 posflags = 0, negflags = 0; 6050 U32 *flagsp = &posflags; 6051 6052 while (*RExC_parse) { 6053 /* && strchr("iogcmsx", *RExC_parse) */ 6054 /* (?g), (?gc) and (?o) are useless here 6055 and must be globally applied -- japhy */ 6056 switch (*RExC_parse) { 6057 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp); 6058 case ONCE_PAT_MOD: /* 'o' */ 6059 case GLOBAL_PAT_MOD: /* 'g' */ 6060 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { 6061 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G; 6062 if (! (wastedflags & wflagbit) ) { 6063 wastedflags |= wflagbit; 6064 vWARN5( 6065 RExC_parse + 1, 6066 "Useless (%s%c) - %suse /%c modifier", 6067 flagsp == &negflags ? "?-" : "?", 6068 *RExC_parse, 6069 flagsp == &negflags ? "don't " : "", 6070 *RExC_parse 6071 ); 6072 } 6073 } 6074 break; 6075 6076 case CONTINUE_PAT_MOD: /* 'c' */ 6077 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { 6078 if (! (wastedflags & WASTED_C) ) { 6079 wastedflags |= WASTED_GC; 6080 vWARN3( 6081 RExC_parse + 1, 6082 "Useless (%sc) - %suse /gc modifier", 6083 flagsp == &negflags ? "?-" : "?", 6084 flagsp == &negflags ? "don't " : "" 6085 ); 6086 } 6087 } 6088 break; 6089 case KEEPCOPY_PAT_MOD: /* 'p' */ 6090 if (flagsp == &negflags) { 6091 if (SIZE_ONLY) 6092 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)"); 6093 } else { 6094 *flagsp |= RXf_PMf_KEEPCOPY; 6095 } 6096 break; 6097 case '-': 6098 if (flagsp == &negflags) { 6099 RExC_parse++; 6100 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); 6101 /*NOTREACHED*/ 6102 } 6103 flagsp = &negflags; 6104 wastedflags = 0; /* reset so (?g-c) warns twice */ 6105 break; 6106 case ':': 6107 paren = ':'; 6108 /*FALLTHROUGH*/ 6109 case ')': 6110 RExC_flags |= posflags; 6111 RExC_flags &= ~negflags; 6112 if (paren != ':') { 6113 oregflags |= posflags; 6114 oregflags &= ~negflags; 6115 } 6116 nextchar(pRExC_state); 6117 if (paren != ':') { 6118 *flagp = TRYAGAIN; 6119 return NULL; 6120 } else { 6121 ret = NULL; 6122 goto parse_rest; 6123 } 6124 /*NOTREACHED*/ 6125 default: 6126 RExC_parse++; 6127 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); 6128 /*NOTREACHED*/ 6129 } 6130 ++RExC_parse; 6131 } 6132 }} /* one for the default block, one for the switch */ 6133 } 6134 else { /* (...) */ 6135 capturing_parens: 6136 parno = RExC_npar; 6137 RExC_npar++; 6138 6139 ret = reganode(pRExC_state, OPEN, parno); 6140 if (!SIZE_ONLY ){ 6141 if (!RExC_nestroot) 6142 RExC_nestroot = parno; 6143 if (RExC_seen & REG_SEEN_RECURSE 6144 && !RExC_open_parens[parno-1]) 6145 { 6146 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, 6147 "Setting open paren #%"IVdf" to %d\n", 6148 (IV)parno, REG_NODE_NUM(ret))); 6149 RExC_open_parens[parno-1]= ret; 6150 } 6151 } 6152 Set_Node_Length(ret, 1); /* MJD */ 6153 Set_Node_Offset(ret, RExC_parse); /* MJD */ 6154 is_open = 1; 6155 } 6156 } 6157 else /* ! paren */ 6158 ret = NULL; 6159 6160 parse_rest: 6161 /* Pick up the branches, linking them together. */ 6162 parse_start = RExC_parse; /* MJD */ 6163 br = regbranch(pRExC_state, &flags, 1,depth+1); 6164 6165 if (freeze_paren) { 6166 if (RExC_npar > after_freeze) 6167 after_freeze = RExC_npar; 6168 RExC_npar = freeze_paren; 6169 } 6170 6171 /* branch_len = (paren != 0); */ 6172 6173 if (br == NULL) 6174 return(NULL); 6175 if (*RExC_parse == '|') { 6176 if (!SIZE_ONLY && RExC_extralen) { 6177 reginsert(pRExC_state, BRANCHJ, br, depth+1); 6178 } 6179 else { /* MJD */ 6180 reginsert(pRExC_state, BRANCH, br, depth+1); 6181 Set_Node_Length(br, paren != 0); 6182 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start); 6183 } 6184 have_branch = 1; 6185 if (SIZE_ONLY) 6186 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */ 6187 } 6188 else if (paren == ':') { 6189 *flagp |= flags&SIMPLE; 6190 } 6191 if (is_open) { /* Starts with OPEN. */ 6192 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */ 6193 } 6194 else if (paren != '?') /* Not Conditional */ 6195 ret = br; 6196 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED); 6197 lastbr = br; 6198 while (*RExC_parse == '|') { 6199 if (!SIZE_ONLY && RExC_extralen) { 6200 ender = reganode(pRExC_state, LONGJMP,0); 6201 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */ 6202 } 6203 if (SIZE_ONLY) 6204 RExC_extralen += 2; /* Account for LONGJMP. */ 6205 nextchar(pRExC_state); 6206 if (freeze_paren) { 6207 if (RExC_npar > after_freeze) 6208 after_freeze = RExC_npar; 6209 RExC_npar = freeze_paren; 6210 } 6211 br = regbranch(pRExC_state, &flags, 0, depth+1); 6212 6213 if (br == NULL) 6214 return(NULL); 6215 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */ 6216 lastbr = br; 6217 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED); 6218 } 6219 6220 if (have_branch || paren != ':') { 6221 /* Make a closing node, and hook it on the end. */ 6222 switch (paren) { 6223 case ':': 6224 ender = reg_node(pRExC_state, TAIL); 6225 break; 6226 case 1: 6227 ender = reganode(pRExC_state, CLOSE, parno); 6228 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) { 6229 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, 6230 "Setting close paren #%"IVdf" to %d\n", 6231 (IV)parno, REG_NODE_NUM(ender))); 6232 RExC_close_parens[parno-1]= ender; 6233 if (RExC_nestroot == parno) 6234 RExC_nestroot = 0; 6235 } 6236 Set_Node_Offset(ender,RExC_parse+1); /* MJD */ 6237 Set_Node_Length(ender,1); /* MJD */ 6238 break; 6239 case '<': 6240 case ',': 6241 case '=': 6242 case '!': 6243 *flagp &= ~HASWIDTH; 6244 /* FALL THROUGH */ 6245 case '>': 6246 ender = reg_node(pRExC_state, SUCCEED); 6247 break; 6248 case 0: 6249 ender = reg_node(pRExC_state, END); 6250 if (!SIZE_ONLY) { 6251 assert(!RExC_opend); /* there can only be one! */ 6252 RExC_opend = ender; 6253 } 6254 break; 6255 } 6256 REGTAIL(pRExC_state, lastbr, ender); 6257 6258 if (have_branch && !SIZE_ONLY) { 6259 if (depth==1) 6260 RExC_seen |= REG_TOP_LEVEL_BRANCHES; 6261 6262 /* Hook the tails of the branches to the closing node. */ 6263 for (br = ret; br; br = regnext(br)) { 6264 const U8 op = PL_regkind[OP(br)]; 6265 if (op == BRANCH) { 6266 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender); 6267 } 6268 else if (op == BRANCHJ) { 6269 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender); 6270 } 6271 } 6272 } 6273 } 6274 6275 { 6276 const char *p; 6277 static const char parens[] = "=!<,>"; 6278 6279 if (paren && (p = strchr(parens, paren))) { 6280 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH; 6281 int flag = (p - parens) > 1; 6282 6283 if (paren == '>') 6284 node = SUSPEND, flag = 0; 6285 reginsert(pRExC_state, node,ret, depth+1); 6286 Set_Node_Cur_Length(ret); 6287 Set_Node_Offset(ret, parse_start + 1); 6288 ret->flags = flag; 6289 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL)); 6290 } 6291 } 6292 6293 /* Check for proper termination. */ 6294 if (paren) { 6295 RExC_flags = oregflags; 6296 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') { 6297 RExC_parse = oregcomp_parse; 6298 vFAIL("Unmatched ("); 6299 } 6300 } 6301 else if (!paren && RExC_parse < RExC_end) { 6302 if (*RExC_parse == ')') { 6303 RExC_parse++; 6304 vFAIL("Unmatched )"); 6305 } 6306 else 6307 FAIL("Junk on end of regexp"); /* "Can't happen". */ 6308 /* NOTREACHED */ 6309 } 6310 if (after_freeze) 6311 RExC_npar = after_freeze; 6312 return(ret); 6313 } 6314 6315 /* 6316 - regbranch - one alternative of an | operator 6317 * 6318 * Implements the concatenation operator. 6319 */ 6320 STATIC regnode * 6321 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) 6322 { 6323 dVAR; 6324 register regnode *ret; 6325 register regnode *chain = NULL; 6326 register regnode *latest; 6327 I32 flags = 0, c = 0; 6328 GET_RE_DEBUG_FLAGS_DECL; 6329 6330 PERL_ARGS_ASSERT_REGBRANCH; 6331 6332 DEBUG_PARSE("brnc"); 6333 6334 if (first) 6335 ret = NULL; 6336 else { 6337 if (!SIZE_ONLY && RExC_extralen) 6338 ret = reganode(pRExC_state, BRANCHJ,0); 6339 else { 6340 ret = reg_node(pRExC_state, BRANCH); 6341 Set_Node_Length(ret, 1); 6342 } 6343 } 6344 6345 if (!first && SIZE_ONLY) 6346 RExC_extralen += 1; /* BRANCHJ */ 6347 6348 *flagp = WORST; /* Tentatively. */ 6349 6350 RExC_parse--; 6351 nextchar(pRExC_state); 6352 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') { 6353 flags &= ~TRYAGAIN; 6354 latest = regpiece(pRExC_state, &flags,depth+1); 6355 if (latest == NULL) { 6356 if (flags & TRYAGAIN) 6357 continue; 6358 return(NULL); 6359 } 6360 else if (ret == NULL) 6361 ret = latest; 6362 *flagp |= flags&(HASWIDTH|POSTPONED); 6363 if (chain == NULL) /* First piece. */ 6364 *flagp |= flags&SPSTART; 6365 else { 6366 RExC_naughty++; 6367 REGTAIL(pRExC_state, chain, latest); 6368 } 6369 chain = latest; 6370 c++; 6371 } 6372 if (chain == NULL) { /* Loop ran zero times. */ 6373 chain = reg_node(pRExC_state, NOTHING); 6374 if (ret == NULL) 6375 ret = chain; 6376 } 6377 if (c == 1) { 6378 *flagp |= flags&SIMPLE; 6379 } 6380 6381 return ret; 6382 } 6383 6384 /* 6385 - regpiece - something followed by possible [*+?] 6386 * 6387 * Note that the branching code sequences used for ? and the general cases 6388 * of * and + are somewhat optimized: they use the same NOTHING node as 6389 * both the endmarker for their branch list and the body of the last branch. 6390 * It might seem that this node could be dispensed with entirely, but the 6391 * endmarker role is not redundant. 6392 */ 6393 STATIC regnode * 6394 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) 6395 { 6396 dVAR; 6397 register regnode *ret; 6398 register char op; 6399 register char *next; 6400 I32 flags; 6401 const char * const origparse = RExC_parse; 6402 I32 min; 6403 I32 max = REG_INFTY; 6404 char *parse_start; 6405 const char *maxpos = NULL; 6406 GET_RE_DEBUG_FLAGS_DECL; 6407 6408 PERL_ARGS_ASSERT_REGPIECE; 6409 6410 DEBUG_PARSE("piec"); 6411 6412 ret = regatom(pRExC_state, &flags,depth+1); 6413 if (ret == NULL) { 6414 if (flags & TRYAGAIN) 6415 *flagp |= TRYAGAIN; 6416 return(NULL); 6417 } 6418 6419 op = *RExC_parse; 6420 6421 if (op == '{' && regcurly(RExC_parse)) { 6422 maxpos = NULL; 6423 parse_start = RExC_parse; /* MJD */ 6424 next = RExC_parse + 1; 6425 while (isDIGIT(*next) || *next == ',') { 6426 if (*next == ',') { 6427 if (maxpos) 6428 break; 6429 else 6430 maxpos = next; 6431 } 6432 next++; 6433 } 6434 if (*next == '}') { /* got one */ 6435 if (!maxpos) 6436 maxpos = next; 6437 RExC_parse++; 6438 min = atoi(RExC_parse); 6439 if (*maxpos == ',') 6440 maxpos++; 6441 else 6442 maxpos = RExC_parse; 6443 max = atoi(maxpos); 6444 if (!max && *maxpos != '0') 6445 max = REG_INFTY; /* meaning "infinity" */ 6446 else if (max >= REG_INFTY) 6447 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1); 6448 RExC_parse = next; 6449 nextchar(pRExC_state); 6450 6451 do_curly: 6452 if ((flags&SIMPLE)) { 6453 RExC_naughty += 2 + RExC_naughty / 2; 6454 reginsert(pRExC_state, CURLY, ret, depth+1); 6455 Set_Node_Offset(ret, parse_start+1); /* MJD */ 6456 Set_Node_Cur_Length(ret); 6457 } 6458 else { 6459 regnode * const w = reg_node(pRExC_state, WHILEM); 6460 6461 w->flags = 0; 6462 REGTAIL(pRExC_state, ret, w); 6463 if (!SIZE_ONLY && RExC_extralen) { 6464 reginsert(pRExC_state, LONGJMP,ret, depth+1); 6465 reginsert(pRExC_state, NOTHING,ret, depth+1); 6466 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */ 6467 } 6468 reginsert(pRExC_state, CURLYX,ret, depth+1); 6469 /* MJD hk */ 6470 Set_Node_Offset(ret, parse_start+1); 6471 Set_Node_Length(ret, 6472 op == '{' ? (RExC_parse - parse_start) : 1); 6473 6474 if (!SIZE_ONLY && RExC_extralen) 6475 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */ 6476 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING)); 6477 if (SIZE_ONLY) 6478 RExC_whilem_seen++, RExC_extralen += 3; 6479 RExC_naughty += 4 + RExC_naughty; /* compound interest */ 6480 } 6481 ret->flags = 0; 6482 6483 if (min > 0) 6484 *flagp = WORST; 6485 if (max > 0) 6486 *flagp |= HASWIDTH; 6487 if (max < min) 6488 vFAIL("Can't do {n,m} with n > m"); 6489 if (!SIZE_ONLY) { 6490 ARG1_SET(ret, (U16)min); 6491 ARG2_SET(ret, (U16)max); 6492 } 6493 6494 goto nest_check; 6495 } 6496 } 6497 6498 if (!ISMULT1(op)) { 6499 *flagp = flags; 6500 return(ret); 6501 } 6502 6503 #if 0 /* Now runtime fix should be reliable. */ 6504 6505 /* if this is reinstated, don't forget to put this back into perldiag: 6506 6507 =item Regexp *+ operand could be empty at {#} in regex m/%s/ 6508 6509 (F) The part of the regexp subject to either the * or + quantifier 6510 could match an empty string. The {#} shows in the regular 6511 expression about where the problem was discovered. 6512 6513 */ 6514 6515 if (!(flags&HASWIDTH) && op != '?') 6516 vFAIL("Regexp *+ operand could be empty"); 6517 #endif 6518 6519 parse_start = RExC_parse; 6520 nextchar(pRExC_state); 6521 6522 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH); 6523 6524 if (op == '*' && (flags&SIMPLE)) { 6525 reginsert(pRExC_state, STAR, ret, depth+1); 6526 ret->flags = 0; 6527 RExC_naughty += 4; 6528 } 6529 else if (op == '*') { 6530 min = 0; 6531 goto do_curly; 6532 } 6533 else if (op == '+' && (flags&SIMPLE)) { 6534 reginsert(pRExC_state, PLUS, ret, depth+1); 6535 ret->flags = 0; 6536 RExC_naughty += 3; 6537 } 6538 else if (op == '+') { 6539 min = 1; 6540 goto do_curly; 6541 } 6542 else if (op == '?') { 6543 min = 0; max = 1; 6544 goto do_curly; 6545 } 6546 nest_check: 6547 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) { 6548 ckWARN3reg(RExC_parse, 6549 "%.*s matches null string many times", 6550 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0), 6551 origparse); 6552 } 6553 6554 if (RExC_parse < RExC_end && *RExC_parse == '?') { 6555 nextchar(pRExC_state); 6556 reginsert(pRExC_state, MINMOD, ret, depth+1); 6557 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE); 6558 } 6559 #ifndef REG_ALLOW_MINMOD_SUSPEND 6560 else 6561 #endif 6562 if (RExC_parse < RExC_end && *RExC_parse == '+') { 6563 regnode *ender; 6564 nextchar(pRExC_state); 6565 ender = reg_node(pRExC_state, SUCCEED); 6566 REGTAIL(pRExC_state, ret, ender); 6567 reginsert(pRExC_state, SUSPEND, ret, depth+1); 6568 ret->flags = 0; 6569 ender = reg_node(pRExC_state, TAIL); 6570 REGTAIL(pRExC_state, ret, ender); 6571 /*ret= ender;*/ 6572 } 6573 6574 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) { 6575 RExC_parse++; 6576 vFAIL("Nested quantifiers"); 6577 } 6578 6579 return(ret); 6580 } 6581 6582 6583 /* reg_namedseq(pRExC_state,UVp) 6584 6585 This is expected to be called by a parser routine that has 6586 recognized '\N' and needs to handle the rest. RExC_parse is 6587 expected to point at the first char following the N at the time 6588 of the call. 6589 6590 The \N may be inside (indicated by valuep not being NULL) or outside a 6591 character class. 6592 6593 \N may begin either a named sequence, or if outside a character class, mean 6594 to match a non-newline. For non single-quoted regexes, the tokenizer has 6595 attempted to decide which, and in the case of a named sequence converted it 6596 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...}, 6597 where c1... are the characters in the sequence. For single-quoted regexes, 6598 the tokenizer passes the \N sequence through unchanged; this code will not 6599 attempt to determine this nor expand those. The net effect is that if the 6600 beginning of the passed-in pattern isn't '{U+' or there is no '}', it 6601 signals that this \N occurrence means to match a non-newline. 6602 6603 Only the \N{U+...} form should occur in a character class, for the same 6604 reason that '.' inside a character class means to just match a period: it 6605 just doesn't make sense. 6606 6607 If valuep is non-null then it is assumed that we are parsing inside 6608 of a charclass definition and the first codepoint in the resolved 6609 string is returned via *valuep and the routine will return NULL. 6610 In this mode if a multichar string is returned from the charnames 6611 handler, a warning will be issued, and only the first char in the 6612 sequence will be examined. If the string returned is zero length 6613 then the value of *valuep is undefined and NON-NULL will 6614 be returned to indicate failure. (This will NOT be a valid pointer 6615 to a regnode.) 6616 6617 If valuep is null then it is assumed that we are parsing normal text and a 6618 new EXACT node is inserted into the program containing the resolved string, 6619 and a pointer to the new node is returned. But if the string is zero length 6620 a NOTHING node is emitted instead. 6621 6622 On success RExC_parse is set to the char following the endbrace. 6623 Parsing failures will generate a fatal error via vFAIL(...) 6624 */ 6625 STATIC regnode * 6626 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp) 6627 { 6628 char * endbrace; /* '}' following the name */ 6629 regnode *ret = NULL; 6630 #ifdef DEBUGGING 6631 char* parse_start = RExC_parse - 2; /* points to the '\N' */ 6632 #endif 6633 char* p; 6634 6635 GET_RE_DEBUG_FLAGS_DECL; 6636 6637 PERL_ARGS_ASSERT_REG_NAMEDSEQ; 6638 6639 GET_RE_DEBUG_FLAGS; 6640 6641 /* The [^\n] meaning of \N ignores spaces and comments under the /x 6642 * modifier. The other meaning does not */ 6643 p = (RExC_flags & RXf_PMf_EXTENDED) 6644 ? regwhite( pRExC_state, RExC_parse ) 6645 : RExC_parse; 6646 6647 /* Disambiguate between \N meaning a named character versus \N meaning 6648 * [^\n]. The former is assumed when it can't be the latter. */ 6649 if (*p != '{' || regcurly(p)) { 6650 RExC_parse = p; 6651 if (valuep) { 6652 /* no bare \N in a charclass */ 6653 vFAIL("\\N in a character class must be a named character: \\N{...}"); 6654 } 6655 nextchar(pRExC_state); 6656 ret = reg_node(pRExC_state, REG_ANY); 6657 *flagp |= HASWIDTH|SIMPLE; 6658 RExC_naughty++; 6659 RExC_parse--; 6660 Set_Node_Length(ret, 1); /* MJD */ 6661 return ret; 6662 } 6663 6664 /* Here, we have decided it should be a named sequence */ 6665 6666 /* The test above made sure that the next real character is a '{', but 6667 * under the /x modifier, it could be separated by space (or a comment and 6668 * \n) and this is not allowed (for consistency with \x{...} and the 6669 * tokenizer handling of \N{NAME}). */ 6670 if (*RExC_parse != '{') { 6671 vFAIL("Missing braces on \\N{}"); 6672 } 6673 6674 RExC_parse++; /* Skip past the '{' */ 6675 6676 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */ 6677 || ! (endbrace == RExC_parse /* nothing between the {} */ 6678 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */ 6679 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */ 6680 { 6681 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */ 6682 vFAIL("\\N{NAME} must be resolved by the lexer"); 6683 } 6684 6685 if (endbrace == RExC_parse) { /* empty: \N{} */ 6686 if (! valuep) { 6687 RExC_parse = endbrace + 1; 6688 return reg_node(pRExC_state,NOTHING); 6689 } 6690 6691 if (SIZE_ONLY) { 6692 ckWARNreg(RExC_parse, 6693 "Ignoring zero length \\N{} in character class" 6694 ); 6695 RExC_parse = endbrace + 1; 6696 } 6697 *valuep = 0; 6698 return (regnode *) &RExC_parse; /* Invalid regnode pointer */ 6699 } 6700 6701 RExC_utf8 = 1; /* named sequences imply Unicode semantics */ 6702 RExC_parse += 2; /* Skip past the 'U+' */ 6703 6704 if (valuep) { /* In a bracketed char class */ 6705 /* We only pay attention to the first char of 6706 multichar strings being returned. I kinda wonder 6707 if this makes sense as it does change the behaviour 6708 from earlier versions, OTOH that behaviour was broken 6709 as well. XXX Solution is to recharacterize as 6710 [rest-of-class]|multi1|multi2... */ 6711 6712 STRLEN length_of_hex; 6713 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES 6714 | PERL_SCAN_DISALLOW_PREFIX 6715 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0); 6716 6717 char * endchar = RExC_parse + strcspn(RExC_parse, ".}"); 6718 if (endchar < endbrace) { 6719 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class"); 6720 } 6721 6722 length_of_hex = (STRLEN)(endchar - RExC_parse); 6723 *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL); 6724 6725 /* The tokenizer should have guaranteed validity, but it's possible to 6726 * bypass it by using single quoting, so check */ 6727 if (length_of_hex == 0 6728 || length_of_hex != (STRLEN)(endchar - RExC_parse) ) 6729 { 6730 RExC_parse += length_of_hex; /* Includes all the valid */ 6731 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */ 6732 ? UTF8SKIP(RExC_parse) 6733 : 1; 6734 /* Guard against malformed utf8 */ 6735 if (RExC_parse >= endchar) RExC_parse = endchar; 6736 vFAIL("Invalid hexadecimal number in \\N{U+...}"); 6737 } 6738 6739 RExC_parse = endbrace + 1; 6740 if (endchar == endbrace) return NULL; 6741 6742 ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */ 6743 } 6744 else { /* Not a char class */ 6745 char *s; /* String to put in generated EXACT node */ 6746 STRLEN len = 0; /* Its current length */ 6747 char *endchar; /* Points to '.' or '}' ending cur char in the input 6748 stream */ 6749 6750 ret = reg_node(pRExC_state, 6751 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT)); 6752 s= STRING(ret); 6753 6754 /* Exact nodes can hold only a U8 length's of text = 255. Loop through 6755 * the input which is of the form now 'c1.c2.c3...}' until find the 6756 * ending brace or exeed length 255. The characters that exceed this 6757 * limit are dropped. The limit could be relaxed should it become 6758 * desirable by reparsing this as (?:\N{NAME}), so could generate 6759 * multiple EXACT nodes, as is done for just regular input. But this 6760 * is primarily a named character, and not intended to be a huge long 6761 * string, so 255 bytes should be good enough */ 6762 while (1) { 6763 STRLEN length_of_hex; 6764 I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES 6765 | PERL_SCAN_DISALLOW_PREFIX 6766 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0); 6767 UV cp; /* Ord of current character */ 6768 6769 /* Code points are separated by dots. If none, there is only one 6770 * code point, and is terminated by the brace */ 6771 endchar = RExC_parse + strcspn(RExC_parse, ".}"); 6772 6773 /* The values are Unicode even on EBCDIC machines */ 6774 length_of_hex = (STRLEN)(endchar - RExC_parse); 6775 cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL); 6776 if ( length_of_hex == 0 6777 || length_of_hex != (STRLEN)(endchar - RExC_parse) ) 6778 { 6779 RExC_parse += length_of_hex; /* Includes all the valid */ 6780 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */ 6781 ? UTF8SKIP(RExC_parse) 6782 : 1; 6783 /* Guard against malformed utf8 */ 6784 if (RExC_parse >= endchar) RExC_parse = endchar; 6785 vFAIL("Invalid hexadecimal number in \\N{U+...}"); 6786 } 6787 6788 if (! FOLD) { /* Not folding, just append to the string */ 6789 STRLEN unilen; 6790 6791 /* Quit before adding this character if would exceed limit */ 6792 if (len + UNISKIP(cp) > U8_MAX) break; 6793 6794 unilen = reguni(pRExC_state, cp, s); 6795 if (unilen > 0) { 6796 s += unilen; 6797 len += unilen; 6798 } 6799 } else { /* Folding, output the folded equivalent */ 6800 STRLEN foldlen,numlen; 6801 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf; 6802 cp = toFOLD_uni(cp, tmpbuf, &foldlen); 6803 6804 /* Quit before exceeding size limit */ 6805 if (len + foldlen > U8_MAX) break; 6806 6807 for (foldbuf = tmpbuf; 6808 foldlen; 6809 foldlen -= numlen) 6810 { 6811 cp = utf8_to_uvchr(foldbuf, &numlen); 6812 if (numlen > 0) { 6813 const STRLEN unilen = reguni(pRExC_state, cp, s); 6814 s += unilen; 6815 len += unilen; 6816 /* In EBCDIC the numlen and unilen can differ. */ 6817 foldbuf += numlen; 6818 if (numlen >= foldlen) 6819 break; 6820 } 6821 else 6822 break; /* "Can't happen." */ 6823 } 6824 } 6825 6826 /* Point to the beginning of the next character in the sequence. */ 6827 RExC_parse = endchar + 1; 6828 6829 /* Quit if no more characters */ 6830 if (RExC_parse >= endbrace) break; 6831 } 6832 6833 6834 if (SIZE_ONLY) { 6835 if (RExC_parse < endbrace) { 6836 ckWARNreg(RExC_parse - 1, 6837 "Using just the first characters returned by \\N{}"); 6838 } 6839 6840 RExC_size += STR_SZ(len); 6841 } else { 6842 STR_LEN(ret) = len; 6843 RExC_emit += STR_SZ(len); 6844 } 6845 6846 RExC_parse = endbrace + 1; 6847 6848 *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail 6849 with malformed in t/re/pat_advanced.t */ 6850 RExC_parse --; 6851 Set_Node_Cur_Length(ret); /* MJD */ 6852 nextchar(pRExC_state); 6853 } 6854 6855 return ret; 6856 } 6857 6858 6859 /* 6860 * reg_recode 6861 * 6862 * It returns the code point in utf8 for the value in *encp. 6863 * value: a code value in the source encoding 6864 * encp: a pointer to an Encode object 6865 * 6866 * If the result from Encode is not a single character, 6867 * it returns U+FFFD (Replacement character) and sets *encp to NULL. 6868 */ 6869 STATIC UV 6870 S_reg_recode(pTHX_ const char value, SV **encp) 6871 { 6872 STRLEN numlen = 1; 6873 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP); 6874 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv); 6875 const STRLEN newlen = SvCUR(sv); 6876 UV uv = UNICODE_REPLACEMENT; 6877 6878 PERL_ARGS_ASSERT_REG_RECODE; 6879 6880 if (newlen) 6881 uv = SvUTF8(sv) 6882 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT) 6883 : *(U8*)s; 6884 6885 if (!newlen || numlen != newlen) { 6886 uv = UNICODE_REPLACEMENT; 6887 *encp = NULL; 6888 } 6889 return uv; 6890 } 6891 6892 6893 /* 6894 - regatom - the lowest level 6895 6896 Try to identify anything special at the start of the pattern. If there 6897 is, then handle it as required. This may involve generating a single regop, 6898 such as for an assertion; or it may involve recursing, such as to 6899 handle a () structure. 6900 6901 If the string doesn't start with something special then we gobble up 6902 as much literal text as we can. 6903 6904 Once we have been able to handle whatever type of thing started the 6905 sequence, we return. 6906 6907 Note: we have to be careful with escapes, as they can be both literal 6908 and special, and in the case of \10 and friends can either, depending 6909 on context. Specifically there are two seperate switches for handling 6910 escape sequences, with the one for handling literal escapes requiring 6911 a dummy entry for all of the special escapes that are actually handled 6912 by the other. 6913 */ 6914 6915 STATIC regnode * 6916 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) 6917 { 6918 dVAR; 6919 register regnode *ret = NULL; 6920 I32 flags; 6921 char *parse_start = RExC_parse; 6922 GET_RE_DEBUG_FLAGS_DECL; 6923 DEBUG_PARSE("atom"); 6924 *flagp = WORST; /* Tentatively. */ 6925 6926 PERL_ARGS_ASSERT_REGATOM; 6927 6928 tryagain: 6929 switch ((U8)*RExC_parse) { 6930 case '^': 6931 RExC_seen_zerolen++; 6932 nextchar(pRExC_state); 6933 if (RExC_flags & RXf_PMf_MULTILINE) 6934 ret = reg_node(pRExC_state, MBOL); 6935 else if (RExC_flags & RXf_PMf_SINGLELINE) 6936 ret = reg_node(pRExC_state, SBOL); 6937 else 6938 ret = reg_node(pRExC_state, BOL); 6939 Set_Node_Length(ret, 1); /* MJD */ 6940 break; 6941 case '$': 6942 nextchar(pRExC_state); 6943 if (*RExC_parse) 6944 RExC_seen_zerolen++; 6945 if (RExC_flags & RXf_PMf_MULTILINE) 6946 ret = reg_node(pRExC_state, MEOL); 6947 else if (RExC_flags & RXf_PMf_SINGLELINE) 6948 ret = reg_node(pRExC_state, SEOL); 6949 else 6950 ret = reg_node(pRExC_state, EOL); 6951 Set_Node_Length(ret, 1); /* MJD */ 6952 break; 6953 case '.': 6954 nextchar(pRExC_state); 6955 if (RExC_flags & RXf_PMf_SINGLELINE) 6956 ret = reg_node(pRExC_state, SANY); 6957 else 6958 ret = reg_node(pRExC_state, REG_ANY); 6959 *flagp |= HASWIDTH|SIMPLE; 6960 RExC_naughty++; 6961 Set_Node_Length(ret, 1); /* MJD */ 6962 break; 6963 case '[': 6964 { 6965 char * const oregcomp_parse = ++RExC_parse; 6966 ret = regclass(pRExC_state,depth+1); 6967 if (*RExC_parse != ']') { 6968 RExC_parse = oregcomp_parse; 6969 vFAIL("Unmatched ["); 6970 } 6971 nextchar(pRExC_state); 6972 *flagp |= HASWIDTH|SIMPLE; 6973 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */ 6974 break; 6975 } 6976 case '(': 6977 nextchar(pRExC_state); 6978 ret = reg(pRExC_state, 1, &flags,depth+1); 6979 if (ret == NULL) { 6980 if (flags & TRYAGAIN) { 6981 if (RExC_parse == RExC_end) { 6982 /* Make parent create an empty node if needed. */ 6983 *flagp |= TRYAGAIN; 6984 return(NULL); 6985 } 6986 goto tryagain; 6987 } 6988 return(NULL); 6989 } 6990 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED); 6991 break; 6992 case '|': 6993 case ')': 6994 if (flags & TRYAGAIN) { 6995 *flagp |= TRYAGAIN; 6996 return NULL; 6997 } 6998 vFAIL("Internal urp"); 6999 /* Supposed to be caught earlier. */ 7000 break; 7001 case '{': 7002 if (!regcurly(RExC_parse)) { 7003 RExC_parse++; 7004 goto defchar; 7005 } 7006 /* FALL THROUGH */ 7007 case '?': 7008 case '+': 7009 case '*': 7010 RExC_parse++; 7011 vFAIL("Quantifier follows nothing"); 7012 break; 7013 case 0xDF: 7014 case 0xC3: 7015 case 0xCE: 7016 do_foldchar: 7017 if (!LOC && FOLD) { 7018 U32 len,cp; 7019 len=0; /* silence a spurious compiler warning */ 7020 if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) { 7021 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */ 7022 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */ 7023 ret = reganode(pRExC_state, FOLDCHAR, cp); 7024 Set_Node_Length(ret, 1); /* MJD */ 7025 nextchar(pRExC_state); /* kill whitespace under /x */ 7026 return ret; 7027 } 7028 } 7029 goto outer_default; 7030 case '\\': 7031 /* Special Escapes 7032 7033 This switch handles escape sequences that resolve to some kind 7034 of special regop and not to literal text. Escape sequnces that 7035 resolve to literal text are handled below in the switch marked 7036 "Literal Escapes". 7037 7038 Every entry in this switch *must* have a corresponding entry 7039 in the literal escape switch. However, the opposite is not 7040 required, as the default for this switch is to jump to the 7041 literal text handling code. 7042 */ 7043 switch ((U8)*++RExC_parse) { 7044 case 0xDF: 7045 case 0xC3: 7046 case 0xCE: 7047 goto do_foldchar; 7048 /* Special Escapes */ 7049 case 'A': 7050 RExC_seen_zerolen++; 7051 ret = reg_node(pRExC_state, SBOL); 7052 *flagp |= SIMPLE; 7053 goto finish_meta_pat; 7054 case 'G': 7055 ret = reg_node(pRExC_state, GPOS); 7056 RExC_seen |= REG_SEEN_GPOS; 7057 *flagp |= SIMPLE; 7058 goto finish_meta_pat; 7059 case 'K': 7060 RExC_seen_zerolen++; 7061 ret = reg_node(pRExC_state, KEEPS); 7062 *flagp |= SIMPLE; 7063 /* XXX:dmq : disabling in-place substitution seems to 7064 * be necessary here to avoid cases of memory corruption, as 7065 * with: C<$_="x" x 80; s/x\K/y/> -- rgs 7066 */ 7067 RExC_seen |= REG_SEEN_LOOKBEHIND; 7068 goto finish_meta_pat; 7069 case 'Z': 7070 ret = reg_node(pRExC_state, SEOL); 7071 *flagp |= SIMPLE; 7072 RExC_seen_zerolen++; /* Do not optimize RE away */ 7073 goto finish_meta_pat; 7074 case 'z': 7075 ret = reg_node(pRExC_state, EOS); 7076 *flagp |= SIMPLE; 7077 RExC_seen_zerolen++; /* Do not optimize RE away */ 7078 goto finish_meta_pat; 7079 case 'C': 7080 ret = reg_node(pRExC_state, CANY); 7081 RExC_seen |= REG_SEEN_CANY; 7082 *flagp |= HASWIDTH|SIMPLE; 7083 goto finish_meta_pat; 7084 case 'X': 7085 ret = reg_node(pRExC_state, CLUMP); 7086 *flagp |= HASWIDTH; 7087 goto finish_meta_pat; 7088 case 'w': 7089 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM)); 7090 *flagp |= HASWIDTH|SIMPLE; 7091 goto finish_meta_pat; 7092 case 'W': 7093 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM)); 7094 *flagp |= HASWIDTH|SIMPLE; 7095 goto finish_meta_pat; 7096 case 'b': 7097 RExC_seen_zerolen++; 7098 RExC_seen |= REG_SEEN_LOOKBEHIND; 7099 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND)); 7100 *flagp |= SIMPLE; 7101 goto finish_meta_pat; 7102 case 'B': 7103 RExC_seen_zerolen++; 7104 RExC_seen |= REG_SEEN_LOOKBEHIND; 7105 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND)); 7106 *flagp |= SIMPLE; 7107 goto finish_meta_pat; 7108 case 's': 7109 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE)); 7110 *flagp |= HASWIDTH|SIMPLE; 7111 goto finish_meta_pat; 7112 case 'S': 7113 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE)); 7114 *flagp |= HASWIDTH|SIMPLE; 7115 goto finish_meta_pat; 7116 case 'd': 7117 ret = reg_node(pRExC_state, DIGIT); 7118 *flagp |= HASWIDTH|SIMPLE; 7119 goto finish_meta_pat; 7120 case 'D': 7121 ret = reg_node(pRExC_state, NDIGIT); 7122 *flagp |= HASWIDTH|SIMPLE; 7123 goto finish_meta_pat; 7124 case 'R': 7125 ret = reg_node(pRExC_state, LNBREAK); 7126 *flagp |= HASWIDTH|SIMPLE; 7127 goto finish_meta_pat; 7128 case 'h': 7129 ret = reg_node(pRExC_state, HORIZWS); 7130 *flagp |= HASWIDTH|SIMPLE; 7131 goto finish_meta_pat; 7132 case 'H': 7133 ret = reg_node(pRExC_state, NHORIZWS); 7134 *flagp |= HASWIDTH|SIMPLE; 7135 goto finish_meta_pat; 7136 case 'v': 7137 ret = reg_node(pRExC_state, VERTWS); 7138 *flagp |= HASWIDTH|SIMPLE; 7139 goto finish_meta_pat; 7140 case 'V': 7141 ret = reg_node(pRExC_state, NVERTWS); 7142 *flagp |= HASWIDTH|SIMPLE; 7143 finish_meta_pat: 7144 nextchar(pRExC_state); 7145 Set_Node_Length(ret, 2); /* MJD */ 7146 break; 7147 case 'p': 7148 case 'P': 7149 { 7150 char* const oldregxend = RExC_end; 7151 #ifdef DEBUGGING 7152 char* parse_start = RExC_parse - 2; 7153 #endif 7154 7155 if (RExC_parse[1] == '{') { 7156 /* a lovely hack--pretend we saw [\pX] instead */ 7157 RExC_end = strchr(RExC_parse, '}'); 7158 if (!RExC_end) { 7159 const U8 c = (U8)*RExC_parse; 7160 RExC_parse += 2; 7161 RExC_end = oldregxend; 7162 vFAIL2("Missing right brace on \\%c{}", c); 7163 } 7164 RExC_end++; 7165 } 7166 else { 7167 RExC_end = RExC_parse + 2; 7168 if (RExC_end > oldregxend) 7169 RExC_end = oldregxend; 7170 } 7171 RExC_parse--; 7172 7173 ret = regclass(pRExC_state,depth+1); 7174 7175 RExC_end = oldregxend; 7176 RExC_parse--; 7177 7178 Set_Node_Offset(ret, parse_start + 2); 7179 Set_Node_Cur_Length(ret); 7180 nextchar(pRExC_state); 7181 *flagp |= HASWIDTH|SIMPLE; 7182 } 7183 break; 7184 case 'N': 7185 /* Handle \N and \N{NAME} here and not below because it can be 7186 multicharacter. join_exact() will join them up later on. 7187 Also this makes sure that things like /\N{BLAH}+/ and 7188 \N{BLAH} being multi char Just Happen. dmq*/ 7189 ++RExC_parse; 7190 ret= reg_namedseq(pRExC_state, NULL, flagp); 7191 break; 7192 case 'k': /* Handle \k<NAME> and \k'NAME' */ 7193 parse_named_seq: 7194 { 7195 char ch= RExC_parse[1]; 7196 if (ch != '<' && ch != '\'' && ch != '{') { 7197 RExC_parse++; 7198 vFAIL2("Sequence %.2s... not terminated",parse_start); 7199 } else { 7200 /* this pretty much dupes the code for (?P=...) in reg(), if 7201 you change this make sure you change that */ 7202 char* name_start = (RExC_parse += 2); 7203 U32 num = 0; 7204 SV *sv_dat = reg_scan_name(pRExC_state, 7205 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); 7206 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\''; 7207 if (RExC_parse == name_start || *RExC_parse != ch) 7208 vFAIL2("Sequence %.3s... not terminated",parse_start); 7209 7210 if (!SIZE_ONLY) { 7211 num = add_data( pRExC_state, 1, "S" ); 7212 RExC_rxi->data->data[num]=(void*)sv_dat; 7213 SvREFCNT_inc_simple_void(sv_dat); 7214 } 7215 7216 RExC_sawback = 1; 7217 ret = reganode(pRExC_state, 7218 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF), 7219 num); 7220 *flagp |= HASWIDTH; 7221 7222 /* override incorrect value set in reganode MJD */ 7223 Set_Node_Offset(ret, parse_start+1); 7224 Set_Node_Cur_Length(ret); /* MJD */ 7225 nextchar(pRExC_state); 7226 7227 } 7228 break; 7229 } 7230 case 'g': 7231 case '1': case '2': case '3': case '4': 7232 case '5': case '6': case '7': case '8': case '9': 7233 { 7234 I32 num; 7235 bool isg = *RExC_parse == 'g'; 7236 bool isrel = 0; 7237 bool hasbrace = 0; 7238 if (isg) { 7239 RExC_parse++; 7240 if (*RExC_parse == '{') { 7241 RExC_parse++; 7242 hasbrace = 1; 7243 } 7244 if (*RExC_parse == '-') { 7245 RExC_parse++; 7246 isrel = 1; 7247 } 7248 if (hasbrace && !isDIGIT(*RExC_parse)) { 7249 if (isrel) RExC_parse--; 7250 RExC_parse -= 2; 7251 goto parse_named_seq; 7252 } } 7253 num = atoi(RExC_parse); 7254 if (isg && num == 0) 7255 vFAIL("Reference to invalid group 0"); 7256 if (isrel) { 7257 num = RExC_npar - num; 7258 if (num < 1) 7259 vFAIL("Reference to nonexistent or unclosed group"); 7260 } 7261 if (!isg && num > 9 && num >= RExC_npar) 7262 goto defchar; 7263 else { 7264 char * const parse_start = RExC_parse - 1; /* MJD */ 7265 while (isDIGIT(*RExC_parse)) 7266 RExC_parse++; 7267 if (parse_start == RExC_parse - 1) 7268 vFAIL("Unterminated \\g... pattern"); 7269 if (hasbrace) { 7270 if (*RExC_parse != '}') 7271 vFAIL("Unterminated \\g{...} pattern"); 7272 RExC_parse++; 7273 } 7274 if (!SIZE_ONLY) { 7275 if (num > (I32)RExC_rx->nparens) 7276 vFAIL("Reference to nonexistent group"); 7277 } 7278 RExC_sawback = 1; 7279 ret = reganode(pRExC_state, 7280 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF), 7281 num); 7282 *flagp |= HASWIDTH; 7283 7284 /* override incorrect value set in reganode MJD */ 7285 Set_Node_Offset(ret, parse_start+1); 7286 Set_Node_Cur_Length(ret); /* MJD */ 7287 RExC_parse--; 7288 nextchar(pRExC_state); 7289 } 7290 } 7291 break; 7292 case '\0': 7293 if (RExC_parse >= RExC_end) 7294 FAIL("Trailing \\"); 7295 /* FALL THROUGH */ 7296 default: 7297 /* Do not generate "unrecognized" warnings here, we fall 7298 back into the quick-grab loop below */ 7299 parse_start--; 7300 goto defchar; 7301 } 7302 break; 7303 7304 case '#': 7305 if (RExC_flags & RXf_PMf_EXTENDED) { 7306 if ( reg_skipcomment( pRExC_state ) ) 7307 goto tryagain; 7308 } 7309 /* FALL THROUGH */ 7310 7311 default: 7312 outer_default:{ 7313 register STRLEN len; 7314 register UV ender; 7315 register char *p; 7316 char *s; 7317 STRLEN foldlen; 7318 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf; 7319 7320 parse_start = RExC_parse - 1; 7321 7322 RExC_parse++; 7323 7324 defchar: 7325 ender = 0; 7326 ret = reg_node(pRExC_state, 7327 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT)); 7328 s = STRING(ret); 7329 for (len = 0, p = RExC_parse - 1; 7330 len < 127 && p < RExC_end; 7331 len++) 7332 { 7333 char * const oldp = p; 7334 7335 if (RExC_flags & RXf_PMf_EXTENDED) 7336 p = regwhite( pRExC_state, p ); 7337 switch ((U8)*p) { 7338 case 0xDF: 7339 case 0xC3: 7340 case 0xCE: 7341 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF)) 7342 goto normal_default; 7343 case '^': 7344 case '$': 7345 case '.': 7346 case '[': 7347 case '(': 7348 case ')': 7349 case '|': 7350 goto loopdone; 7351 case '\\': 7352 /* Literal Escapes Switch 7353 7354 This switch is meant to handle escape sequences that 7355 resolve to a literal character. 7356 7357 Every escape sequence that represents something 7358 else, like an assertion or a char class, is handled 7359 in the switch marked 'Special Escapes' above in this 7360 routine, but also has an entry here as anything that 7361 isn't explicitly mentioned here will be treated as 7362 an unescaped equivalent literal. 7363 */ 7364 7365 switch ((U8)*++p) { 7366 /* These are all the special escapes. */ 7367 case 0xDF: 7368 case 0xC3: 7369 case 0xCE: 7370 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF)) 7371 goto normal_default; 7372 case 'A': /* Start assertion */ 7373 case 'b': case 'B': /* Word-boundary assertion*/ 7374 case 'C': /* Single char !DANGEROUS! */ 7375 case 'd': case 'D': /* digit class */ 7376 case 'g': case 'G': /* generic-backref, pos assertion */ 7377 case 'h': case 'H': /* HORIZWS */ 7378 case 'k': case 'K': /* named backref, keep marker */ 7379 case 'N': /* named char sequence */ 7380 case 'p': case 'P': /* Unicode property */ 7381 case 'R': /* LNBREAK */ 7382 case 's': case 'S': /* space class */ 7383 case 'v': case 'V': /* VERTWS */ 7384 case 'w': case 'W': /* word class */ 7385 case 'X': /* eXtended Unicode "combining character sequence" */ 7386 case 'z': case 'Z': /* End of line/string assertion */ 7387 --p; 7388 goto loopdone; 7389 7390 /* Anything after here is an escape that resolves to a 7391 literal. (Except digits, which may or may not) 7392 */ 7393 case 'n': 7394 ender = '\n'; 7395 p++; 7396 break; 7397 case 'r': 7398 ender = '\r'; 7399 p++; 7400 break; 7401 case 't': 7402 ender = '\t'; 7403 p++; 7404 break; 7405 case 'f': 7406 ender = '\f'; 7407 p++; 7408 break; 7409 case 'e': 7410 ender = ASCII_TO_NATIVE('\033'); 7411 p++; 7412 break; 7413 case 'a': 7414 ender = ASCII_TO_NATIVE('\007'); 7415 p++; 7416 break; 7417 case 'x': 7418 if (*++p == '{') { 7419 char* const e = strchr(p, '}'); 7420 7421 if (!e) { 7422 RExC_parse = p + 1; 7423 vFAIL("Missing right brace on \\x{}"); 7424 } 7425 else { 7426 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES 7427 | PERL_SCAN_DISALLOW_PREFIX; 7428 STRLEN numlen = e - p - 1; 7429 ender = grok_hex(p + 1, &numlen, &flags, NULL); 7430 if (ender > 0xff) 7431 RExC_utf8 = 1; 7432 p = e + 1; 7433 } 7434 } 7435 else { 7436 I32 flags = PERL_SCAN_DISALLOW_PREFIX; 7437 STRLEN numlen = 2; 7438 ender = grok_hex(p, &numlen, &flags, NULL); 7439 p += numlen; 7440 } 7441 if (PL_encoding && ender < 0x100) 7442 goto recode_encoding; 7443 break; 7444 case 'c': 7445 p++; 7446 ender = UCHARAT(p++); 7447 ender = toCTRL(ender); 7448 break; 7449 case '0': case '1': case '2': case '3':case '4': 7450 case '5': case '6': case '7': case '8':case '9': 7451 if (*p == '0' || 7452 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) { 7453 I32 flags = 0; 7454 STRLEN numlen = 3; 7455 ender = grok_oct(p, &numlen, &flags, NULL); 7456 7457 /* An octal above 0xff is interpreted differently 7458 * depending on if the re is in utf8 or not. If it 7459 * is in utf8, the value will be itself, otherwise 7460 * it is interpreted as modulo 0x100. It has been 7461 * decided to discourage the use of octal above the 7462 * single-byte range. For now, warn only when 7463 * it ends up modulo */ 7464 if (SIZE_ONLY && ender >= 0x100 7465 && ! UTF && ! PL_encoding) { 7466 ckWARNregdep(p, "Use of octal value above 377 is deprecated"); 7467 } 7468 p += numlen; 7469 } 7470 else { 7471 --p; 7472 goto loopdone; 7473 } 7474 if (PL_encoding && ender < 0x100) 7475 goto recode_encoding; 7476 break; 7477 recode_encoding: 7478 { 7479 SV* enc = PL_encoding; 7480 ender = reg_recode((const char)(U8)ender, &enc); 7481 if (!enc && SIZE_ONLY) 7482 ckWARNreg(p, "Invalid escape in the specified encoding"); 7483 RExC_utf8 = 1; 7484 } 7485 break; 7486 case '\0': 7487 if (p >= RExC_end) 7488 FAIL("Trailing \\"); 7489 /* FALL THROUGH */ 7490 default: 7491 if (!SIZE_ONLY&& isALPHA(*p)) 7492 ckWARN2reg(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p)); 7493 goto normal_default; 7494 } 7495 break; 7496 default: 7497 normal_default: 7498 if (UTF8_IS_START(*p) && UTF) { 7499 STRLEN numlen; 7500 ender = utf8n_to_uvchr((U8*)p, RExC_end - p, 7501 &numlen, UTF8_ALLOW_DEFAULT); 7502 p += numlen; 7503 } 7504 else 7505 ender = *p++; 7506 break; 7507 } 7508 if ( RExC_flags & RXf_PMf_EXTENDED) 7509 p = regwhite( pRExC_state, p ); 7510 if (UTF && FOLD) { 7511 /* Prime the casefolded buffer. */ 7512 ender = toFOLD_uni(ender, tmpbuf, &foldlen); 7513 } 7514 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */ 7515 if (len) 7516 p = oldp; 7517 else if (UTF) { 7518 if (FOLD) { 7519 /* Emit all the Unicode characters. */ 7520 STRLEN numlen; 7521 for (foldbuf = tmpbuf; 7522 foldlen; 7523 foldlen -= numlen) { 7524 ender = utf8_to_uvchr(foldbuf, &numlen); 7525 if (numlen > 0) { 7526 const STRLEN unilen = reguni(pRExC_state, ender, s); 7527 s += unilen; 7528 len += unilen; 7529 /* In EBCDIC the numlen 7530 * and unilen can differ. */ 7531 foldbuf += numlen; 7532 if (numlen >= foldlen) 7533 break; 7534 } 7535 else 7536 break; /* "Can't happen." */ 7537 } 7538 } 7539 else { 7540 const STRLEN unilen = reguni(pRExC_state, ender, s); 7541 if (unilen > 0) { 7542 s += unilen; 7543 len += unilen; 7544 } 7545 } 7546 } 7547 else { 7548 len++; 7549 REGC((char)ender, s++); 7550 } 7551 break; 7552 } 7553 if (UTF) { 7554 if (FOLD) { 7555 /* Emit all the Unicode characters. */ 7556 STRLEN numlen; 7557 for (foldbuf = tmpbuf; 7558 foldlen; 7559 foldlen -= numlen) { 7560 ender = utf8_to_uvchr(foldbuf, &numlen); 7561 if (numlen > 0) { 7562 const STRLEN unilen = reguni(pRExC_state, ender, s); 7563 len += unilen; 7564 s += unilen; 7565 /* In EBCDIC the numlen 7566 * and unilen can differ. */ 7567 foldbuf += numlen; 7568 if (numlen >= foldlen) 7569 break; 7570 } 7571 else 7572 break; 7573 } 7574 } 7575 else { 7576 const STRLEN unilen = reguni(pRExC_state, ender, s); 7577 if (unilen > 0) { 7578 s += unilen; 7579 len += unilen; 7580 } 7581 } 7582 len--; 7583 } 7584 else 7585 REGC((char)ender, s++); 7586 } 7587 loopdone: 7588 RExC_parse = p - 1; 7589 Set_Node_Cur_Length(ret); /* MJD */ 7590 nextchar(pRExC_state); 7591 { 7592 /* len is STRLEN which is unsigned, need to copy to signed */ 7593 IV iv = len; 7594 if (iv < 0) 7595 vFAIL("Internal disaster"); 7596 } 7597 if (len > 0) 7598 *flagp |= HASWIDTH; 7599 if (len == 1 && UNI_IS_INVARIANT(ender)) 7600 *flagp |= SIMPLE; 7601 7602 if (SIZE_ONLY) 7603 RExC_size += STR_SZ(len); 7604 else { 7605 STR_LEN(ret) = len; 7606 RExC_emit += STR_SZ(len); 7607 } 7608 } 7609 break; 7610 } 7611 7612 return(ret); 7613 } 7614 7615 STATIC char * 7616 S_regwhite( RExC_state_t *pRExC_state, char *p ) 7617 { 7618 const char *e = RExC_end; 7619 7620 PERL_ARGS_ASSERT_REGWHITE; 7621 7622 while (p < e) { 7623 if (isSPACE(*p)) 7624 ++p; 7625 else if (*p == '#') { 7626 bool ended = 0; 7627 do { 7628 if (*p++ == '\n') { 7629 ended = 1; 7630 break; 7631 } 7632 } while (p < e); 7633 if (!ended) 7634 RExC_seen |= REG_SEEN_RUN_ON_COMMENT; 7635 } 7636 else 7637 break; 7638 } 7639 return p; 7640 } 7641 7642 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]]. 7643 Character classes ([:foo:]) can also be negated ([:^foo:]). 7644 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise. 7645 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed, 7646 but trigger failures because they are currently unimplemented. */ 7647 7648 #define POSIXCC_DONE(c) ((c) == ':') 7649 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.') 7650 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c)) 7651 7652 STATIC I32 7653 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value) 7654 { 7655 dVAR; 7656 I32 namedclass = OOB_NAMEDCLASS; 7657 7658 PERL_ARGS_ASSERT_REGPPOSIXCC; 7659 7660 if (value == '[' && RExC_parse + 1 < RExC_end && 7661 /* I smell either [: or [= or [. -- POSIX has been here, right? */ 7662 POSIXCC(UCHARAT(RExC_parse))) { 7663 const char c = UCHARAT(RExC_parse); 7664 char* const s = RExC_parse++; 7665 7666 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c) 7667 RExC_parse++; 7668 if (RExC_parse == RExC_end) 7669 /* Grandfather lone [:, [=, [. */ 7670 RExC_parse = s; 7671 else { 7672 const char* const t = RExC_parse++; /* skip over the c */ 7673 assert(*t == c); 7674 7675 if (UCHARAT(RExC_parse) == ']') { 7676 const char *posixcc = s + 1; 7677 RExC_parse++; /* skip over the ending ] */ 7678 7679 if (*s == ':') { 7680 const I32 complement = *posixcc == '^' ? *posixcc++ : 0; 7681 const I32 skip = t - posixcc; 7682 7683 /* Initially switch on the length of the name. */ 7684 switch (skip) { 7685 case 4: 7686 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */ 7687 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM; 7688 break; 7689 case 5: 7690 /* Names all of length 5. */ 7691 /* alnum alpha ascii blank cntrl digit graph lower 7692 print punct space upper */ 7693 /* Offset 4 gives the best switch position. */ 7694 switch (posixcc[4]) { 7695 case 'a': 7696 if (memEQ(posixcc, "alph", 4)) /* alpha */ 7697 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA; 7698 break; 7699 case 'e': 7700 if (memEQ(posixcc, "spac", 4)) /* space */ 7701 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC; 7702 break; 7703 case 'h': 7704 if (memEQ(posixcc, "grap", 4)) /* graph */ 7705 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH; 7706 break; 7707 case 'i': 7708 if (memEQ(posixcc, "asci", 4)) /* ascii */ 7709 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII; 7710 break; 7711 case 'k': 7712 if (memEQ(posixcc, "blan", 4)) /* blank */ 7713 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK; 7714 break; 7715 case 'l': 7716 if (memEQ(posixcc, "cntr", 4)) /* cntrl */ 7717 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL; 7718 break; 7719 case 'm': 7720 if (memEQ(posixcc, "alnu", 4)) /* alnum */ 7721 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC; 7722 break; 7723 case 'r': 7724 if (memEQ(posixcc, "lowe", 4)) /* lower */ 7725 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER; 7726 else if (memEQ(posixcc, "uppe", 4)) /* upper */ 7727 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER; 7728 break; 7729 case 't': 7730 if (memEQ(posixcc, "digi", 4)) /* digit */ 7731 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT; 7732 else if (memEQ(posixcc, "prin", 4)) /* print */ 7733 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT; 7734 else if (memEQ(posixcc, "punc", 4)) /* punct */ 7735 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT; 7736 break; 7737 } 7738 break; 7739 case 6: 7740 if (memEQ(posixcc, "xdigit", 6)) 7741 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT; 7742 break; 7743 } 7744 7745 if (namedclass == OOB_NAMEDCLASS) 7746 Simple_vFAIL3("POSIX class [:%.*s:] unknown", 7747 t - s - 1, s + 1); 7748 assert (posixcc[skip] == ':'); 7749 assert (posixcc[skip+1] == ']'); 7750 } else if (!SIZE_ONLY) { 7751 /* [[=foo=]] and [[.foo.]] are still future. */ 7752 7753 /* adjust RExC_parse so the warning shows after 7754 the class closes */ 7755 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']') 7756 RExC_parse++; 7757 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c); 7758 } 7759 } else { 7760 /* Maternal grandfather: 7761 * "[:" ending in ":" but not in ":]" */ 7762 RExC_parse = s; 7763 } 7764 } 7765 } 7766 7767 return namedclass; 7768 } 7769 7770 STATIC void 7771 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state) 7772 { 7773 dVAR; 7774 7775 PERL_ARGS_ASSERT_CHECKPOSIXCC; 7776 7777 if (POSIXCC(UCHARAT(RExC_parse))) { 7778 const char *s = RExC_parse; 7779 const char c = *s++; 7780 7781 while (isALNUM(*s)) 7782 s++; 7783 if (*s && c == *s && s[1] == ']') { 7784 ckWARN3reg(s+2, 7785 "POSIX syntax [%c %c] belongs inside character classes", 7786 c, c); 7787 7788 /* [[=foo=]] and [[.foo.]] are still future. */ 7789 if (POSIXCC_NOTYET(c)) { 7790 /* adjust RExC_parse so the error shows after 7791 the class closes */ 7792 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']') 7793 NOOP; 7794 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c); 7795 } 7796 } 7797 } 7798 } 7799 7800 7801 #define _C_C_T_(NAME,TEST,WORD) \ 7802 ANYOF_##NAME: \ 7803 if (LOC) \ 7804 ANYOF_CLASS_SET(ret, ANYOF_##NAME); \ 7805 else { \ 7806 for (value = 0; value < 256; value++) \ 7807 if (TEST) \ 7808 ANYOF_BITMAP_SET(ret, value); \ 7809 } \ 7810 yesno = '+'; \ 7811 what = WORD; \ 7812 break; \ 7813 case ANYOF_N##NAME: \ 7814 if (LOC) \ 7815 ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \ 7816 else { \ 7817 for (value = 0; value < 256; value++) \ 7818 if (!TEST) \ 7819 ANYOF_BITMAP_SET(ret, value); \ 7820 } \ 7821 yesno = '!'; \ 7822 what = WORD; \ 7823 break 7824 7825 #define _C_C_T_NOLOC_(NAME,TEST,WORD) \ 7826 ANYOF_##NAME: \ 7827 for (value = 0; value < 256; value++) \ 7828 if (TEST) \ 7829 ANYOF_BITMAP_SET(ret, value); \ 7830 yesno = '+'; \ 7831 what = WORD; \ 7832 break; \ 7833 case ANYOF_N##NAME: \ 7834 for (value = 0; value < 256; value++) \ 7835 if (!TEST) \ 7836 ANYOF_BITMAP_SET(ret, value); \ 7837 yesno = '!'; \ 7838 what = WORD; \ 7839 break 7840 7841 /* 7842 We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test 7843 so that it is possible to override the option here without having to 7844 rebuild the entire core. as we are required to do if we change regcomp.h 7845 which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined. 7846 */ 7847 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS 7848 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS 7849 #endif 7850 7851 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS 7852 #define POSIX_CC_UNI_NAME(CCNAME) CCNAME 7853 #else 7854 #define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME 7855 #endif 7856 7857 /* 7858 parse a class specification and produce either an ANYOF node that 7859 matches the pattern or if the pattern matches a single char only and 7860 that char is < 256 and we are case insensitive then we produce an 7861 EXACT node instead. 7862 */ 7863 7864 STATIC regnode * 7865 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) 7866 { 7867 dVAR; 7868 register UV nextvalue; 7869 register IV prevvalue = OOB_UNICODE; 7870 register IV range = 0; 7871 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */ 7872 register regnode *ret; 7873 STRLEN numlen; 7874 IV namedclass; 7875 char *rangebegin = NULL; 7876 bool need_class = 0; 7877 SV *listsv = NULL; 7878 UV n; 7879 bool optimize_invert = TRUE; 7880 AV* unicode_alternate = NULL; 7881 #ifdef EBCDIC 7882 UV literal_endpoint = 0; 7883 #endif 7884 UV stored = 0; /* number of chars stored in the class */ 7885 7886 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in 7887 case we need to change the emitted regop to an EXACT. */ 7888 const char * orig_parse = RExC_parse; 7889 GET_RE_DEBUG_FLAGS_DECL; 7890 7891 PERL_ARGS_ASSERT_REGCLASS; 7892 #ifndef DEBUGGING 7893 PERL_UNUSED_ARG(depth); 7894 #endif 7895 7896 DEBUG_PARSE("clas"); 7897 7898 /* Assume we are going to generate an ANYOF node. */ 7899 ret = reganode(pRExC_state, ANYOF, 0); 7900 7901 if (!SIZE_ONLY) 7902 ANYOF_FLAGS(ret) = 0; 7903 7904 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */ 7905 RExC_naughty++; 7906 RExC_parse++; 7907 if (!SIZE_ONLY) 7908 ANYOF_FLAGS(ret) |= ANYOF_INVERT; 7909 } 7910 7911 if (SIZE_ONLY) { 7912 RExC_size += ANYOF_SKIP; 7913 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */ 7914 } 7915 else { 7916 RExC_emit += ANYOF_SKIP; 7917 if (FOLD) 7918 ANYOF_FLAGS(ret) |= ANYOF_FOLD; 7919 if (LOC) 7920 ANYOF_FLAGS(ret) |= ANYOF_LOCALE; 7921 ANYOF_BITMAP_ZERO(ret); 7922 listsv = newSVpvs("# comment\n"); 7923 } 7924 7925 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0; 7926 7927 if (!SIZE_ONLY && POSIXCC(nextvalue)) 7928 checkposixcc(pRExC_state); 7929 7930 /* allow 1st char to be ] (allowing it to be - is dealt with later) */ 7931 if (UCHARAT(RExC_parse) == ']') 7932 goto charclassloop; 7933 7934 parseit: 7935 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') { 7936 7937 charclassloop: 7938 7939 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */ 7940 7941 if (!range) 7942 rangebegin = RExC_parse; 7943 if (UTF) { 7944 value = utf8n_to_uvchr((U8*)RExC_parse, 7945 RExC_end - RExC_parse, 7946 &numlen, UTF8_ALLOW_DEFAULT); 7947 RExC_parse += numlen; 7948 } 7949 else 7950 value = UCHARAT(RExC_parse++); 7951 7952 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0; 7953 if (value == '[' && POSIXCC(nextvalue)) 7954 namedclass = regpposixcc(pRExC_state, value); 7955 else if (value == '\\') { 7956 if (UTF) { 7957 value = utf8n_to_uvchr((U8*)RExC_parse, 7958 RExC_end - RExC_parse, 7959 &numlen, UTF8_ALLOW_DEFAULT); 7960 RExC_parse += numlen; 7961 } 7962 else 7963 value = UCHARAT(RExC_parse++); 7964 /* Some compilers cannot handle switching on 64-bit integer 7965 * values, therefore value cannot be an UV. Yes, this will 7966 * be a problem later if we want switch on Unicode. 7967 * A similar issue a little bit later when switching on 7968 * namedclass. --jhi */ 7969 switch ((I32)value) { 7970 case 'w': namedclass = ANYOF_ALNUM; break; 7971 case 'W': namedclass = ANYOF_NALNUM; break; 7972 case 's': namedclass = ANYOF_SPACE; break; 7973 case 'S': namedclass = ANYOF_NSPACE; break; 7974 case 'd': namedclass = ANYOF_DIGIT; break; 7975 case 'D': namedclass = ANYOF_NDIGIT; break; 7976 case 'v': namedclass = ANYOF_VERTWS; break; 7977 case 'V': namedclass = ANYOF_NVERTWS; break; 7978 case 'h': namedclass = ANYOF_HORIZWS; break; 7979 case 'H': namedclass = ANYOF_NHORIZWS; break; 7980 case 'N': /* Handle \N{NAME} in class */ 7981 { 7982 /* We only pay attention to the first char of 7983 multichar strings being returned. I kinda wonder 7984 if this makes sense as it does change the behaviour 7985 from earlier versions, OTOH that behaviour was broken 7986 as well. */ 7987 UV v; /* value is register so we cant & it /grrr */ 7988 if (reg_namedseq(pRExC_state, &v, NULL)) { 7989 goto parseit; 7990 } 7991 value= v; 7992 } 7993 break; 7994 case 'p': 7995 case 'P': 7996 { 7997 char *e; 7998 if (RExC_parse >= RExC_end) 7999 vFAIL2("Empty \\%c{}", (U8)value); 8000 if (*RExC_parse == '{') { 8001 const U8 c = (U8)value; 8002 e = strchr(RExC_parse++, '}'); 8003 if (!e) 8004 vFAIL2("Missing right brace on \\%c{}", c); 8005 while (isSPACE(UCHARAT(RExC_parse))) 8006 RExC_parse++; 8007 if (e == RExC_parse) 8008 vFAIL2("Empty \\%c{}", c); 8009 n = e - RExC_parse; 8010 while (isSPACE(UCHARAT(RExC_parse + n - 1))) 8011 n--; 8012 } 8013 else { 8014 e = RExC_parse; 8015 n = 1; 8016 } 8017 if (!SIZE_ONLY) { 8018 if (UCHARAT(RExC_parse) == '^') { 8019 RExC_parse++; 8020 n--; 8021 value = value == 'p' ? 'P' : 'p'; /* toggle */ 8022 while (isSPACE(UCHARAT(RExC_parse))) { 8023 RExC_parse++; 8024 n--; 8025 } 8026 } 8027 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n", 8028 (value=='p' ? '+' : '!'), (int)n, RExC_parse); 8029 } 8030 RExC_parse = e + 1; 8031 ANYOF_FLAGS(ret) |= ANYOF_UNICODE; 8032 namedclass = ANYOF_MAX; /* no official name, but it's named */ 8033 } 8034 break; 8035 case 'n': value = '\n'; break; 8036 case 'r': value = '\r'; break; 8037 case 't': value = '\t'; break; 8038 case 'f': value = '\f'; break; 8039 case 'b': value = '\b'; break; 8040 case 'e': value = ASCII_TO_NATIVE('\033');break; 8041 case 'a': value = ASCII_TO_NATIVE('\007');break; 8042 case 'x': 8043 if (*RExC_parse == '{') { 8044 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES 8045 | PERL_SCAN_DISALLOW_PREFIX; 8046 char * const e = strchr(RExC_parse++, '}'); 8047 if (!e) 8048 vFAIL("Missing right brace on \\x{}"); 8049 8050 numlen = e - RExC_parse; 8051 value = grok_hex(RExC_parse, &numlen, &flags, NULL); 8052 RExC_parse = e + 1; 8053 } 8054 else { 8055 I32 flags = PERL_SCAN_DISALLOW_PREFIX; 8056 numlen = 2; 8057 value = grok_hex(RExC_parse, &numlen, &flags, NULL); 8058 RExC_parse += numlen; 8059 } 8060 if (PL_encoding && value < 0x100) 8061 goto recode_encoding; 8062 break; 8063 case 'c': 8064 value = UCHARAT(RExC_parse++); 8065 value = toCTRL(value); 8066 break; 8067 case '0': case '1': case '2': case '3': case '4': 8068 case '5': case '6': case '7': case '8': case '9': 8069 { 8070 I32 flags = 0; 8071 numlen = 3; 8072 value = grok_oct(--RExC_parse, &numlen, &flags, NULL); 8073 RExC_parse += numlen; 8074 if (PL_encoding && value < 0x100) 8075 goto recode_encoding; 8076 break; 8077 } 8078 recode_encoding: 8079 { 8080 SV* enc = PL_encoding; 8081 value = reg_recode((const char)(U8)value, &enc); 8082 if (!enc && SIZE_ONLY) 8083 ckWARNreg(RExC_parse, 8084 "Invalid escape in the specified encoding"); 8085 break; 8086 } 8087 default: 8088 if (!SIZE_ONLY && isALPHA(value)) 8089 ckWARN2reg(RExC_parse, 8090 "Unrecognized escape \\%c in character class passed through", 8091 (int)value); 8092 break; 8093 } 8094 } /* end of \blah */ 8095 #ifdef EBCDIC 8096 else 8097 literal_endpoint++; 8098 #endif 8099 8100 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */ 8101 8102 if (!SIZE_ONLY && !need_class) 8103 ANYOF_CLASS_ZERO(ret); 8104 8105 need_class = 1; 8106 8107 /* a bad range like a-\d, a-[:digit:] ? */ 8108 if (range) { 8109 if (!SIZE_ONLY) { 8110 const int w = 8111 RExC_parse >= rangebegin ? 8112 RExC_parse - rangebegin : 0; 8113 ckWARN4reg(RExC_parse, 8114 "False [] range \"%*.*s\"", 8115 w, w, rangebegin); 8116 8117 if (prevvalue < 256) { 8118 ANYOF_BITMAP_SET(ret, prevvalue); 8119 ANYOF_BITMAP_SET(ret, '-'); 8120 } 8121 else { 8122 ANYOF_FLAGS(ret) |= ANYOF_UNICODE; 8123 Perl_sv_catpvf(aTHX_ listsv, 8124 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-'); 8125 } 8126 } 8127 8128 range = 0; /* this was not a true range */ 8129 } 8130 8131 8132 8133 if (!SIZE_ONLY) { 8134 const char *what = NULL; 8135 char yesno = 0; 8136 8137 if (namedclass > OOB_NAMEDCLASS) 8138 optimize_invert = FALSE; 8139 /* Possible truncation here but in some 64-bit environments 8140 * the compiler gets heartburn about switch on 64-bit values. 8141 * A similar issue a little earlier when switching on value. 8142 * --jhi */ 8143 switch ((I32)namedclass) { 8144 8145 case _C_C_T_(ALNUMC, isALNUMC(value), POSIX_CC_UNI_NAME("Alnum")); 8146 case _C_C_T_(ALPHA, isALPHA(value), POSIX_CC_UNI_NAME("Alpha")); 8147 case _C_C_T_(BLANK, isBLANK(value), POSIX_CC_UNI_NAME("Blank")); 8148 case _C_C_T_(CNTRL, isCNTRL(value), POSIX_CC_UNI_NAME("Cntrl")); 8149 case _C_C_T_(GRAPH, isGRAPH(value), POSIX_CC_UNI_NAME("Graph")); 8150 case _C_C_T_(LOWER, isLOWER(value), POSIX_CC_UNI_NAME("Lower")); 8151 case _C_C_T_(PRINT, isPRINT(value), POSIX_CC_UNI_NAME("Print")); 8152 case _C_C_T_(PSXSPC, isPSXSPC(value), POSIX_CC_UNI_NAME("Space")); 8153 case _C_C_T_(PUNCT, isPUNCT(value), POSIX_CC_UNI_NAME("Punct")); 8154 case _C_C_T_(UPPER, isUPPER(value), POSIX_CC_UNI_NAME("Upper")); 8155 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS 8156 case _C_C_T_(ALNUM, isALNUM(value), "Word"); 8157 case _C_C_T_(SPACE, isSPACE(value), "SpacePerl"); 8158 #else 8159 case _C_C_T_(SPACE, isSPACE(value), "PerlSpace"); 8160 case _C_C_T_(ALNUM, isALNUM(value), "PerlWord"); 8161 #endif 8162 case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit"); 8163 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace"); 8164 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace"); 8165 case ANYOF_ASCII: 8166 if (LOC) 8167 ANYOF_CLASS_SET(ret, ANYOF_ASCII); 8168 else { 8169 #ifndef EBCDIC 8170 for (value = 0; value < 128; value++) 8171 ANYOF_BITMAP_SET(ret, value); 8172 #else /* EBCDIC */ 8173 for (value = 0; value < 256; value++) { 8174 if (isASCII(value)) 8175 ANYOF_BITMAP_SET(ret, value); 8176 } 8177 #endif /* EBCDIC */ 8178 } 8179 yesno = '+'; 8180 what = "ASCII"; 8181 break; 8182 case ANYOF_NASCII: 8183 if (LOC) 8184 ANYOF_CLASS_SET(ret, ANYOF_NASCII); 8185 else { 8186 #ifndef EBCDIC 8187 for (value = 128; value < 256; value++) 8188 ANYOF_BITMAP_SET(ret, value); 8189 #else /* EBCDIC */ 8190 for (value = 0; value < 256; value++) { 8191 if (!isASCII(value)) 8192 ANYOF_BITMAP_SET(ret, value); 8193 } 8194 #endif /* EBCDIC */ 8195 } 8196 yesno = '!'; 8197 what = "ASCII"; 8198 break; 8199 case ANYOF_DIGIT: 8200 if (LOC) 8201 ANYOF_CLASS_SET(ret, ANYOF_DIGIT); 8202 else { 8203 /* consecutive digits assumed */ 8204 for (value = '0'; value <= '9'; value++) 8205 ANYOF_BITMAP_SET(ret, value); 8206 } 8207 yesno = '+'; 8208 what = POSIX_CC_UNI_NAME("Digit"); 8209 break; 8210 case ANYOF_NDIGIT: 8211 if (LOC) 8212 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT); 8213 else { 8214 /* consecutive digits assumed */ 8215 for (value = 0; value < '0'; value++) 8216 ANYOF_BITMAP_SET(ret, value); 8217 for (value = '9' + 1; value < 256; value++) 8218 ANYOF_BITMAP_SET(ret, value); 8219 } 8220 yesno = '!'; 8221 what = POSIX_CC_UNI_NAME("Digit"); 8222 break; 8223 case ANYOF_MAX: 8224 /* this is to handle \p and \P */ 8225 break; 8226 default: 8227 vFAIL("Invalid [::] class"); 8228 break; 8229 } 8230 if (what) { 8231 /* Strings such as "+utf8::isWord\n" */ 8232 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what); 8233 } 8234 if (LOC) 8235 ANYOF_FLAGS(ret) |= ANYOF_CLASS; 8236 continue; 8237 } 8238 } /* end of namedclass \blah */ 8239 8240 if (range) { 8241 if (prevvalue > (IV)value) /* b-a */ { 8242 const int w = RExC_parse - rangebegin; 8243 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin); 8244 range = 0; /* not a valid range */ 8245 } 8246 } 8247 else { 8248 prevvalue = value; /* save the beginning of the range */ 8249 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end && 8250 RExC_parse[1] != ']') { 8251 RExC_parse++; 8252 8253 /* a bad range like \w-, [:word:]- ? */ 8254 if (namedclass > OOB_NAMEDCLASS) { 8255 if (ckWARN(WARN_REGEXP)) { 8256 const int w = 8257 RExC_parse >= rangebegin ? 8258 RExC_parse - rangebegin : 0; 8259 vWARN4(RExC_parse, 8260 "False [] range \"%*.*s\"", 8261 w, w, rangebegin); 8262 } 8263 if (!SIZE_ONLY) 8264 ANYOF_BITMAP_SET(ret, '-'); 8265 } else 8266 range = 1; /* yeah, it's a range! */ 8267 continue; /* but do it the next time */ 8268 } 8269 } 8270 8271 /* now is the next time */ 8272 /*stored += (value - prevvalue + 1);*/ 8273 if (!SIZE_ONLY) { 8274 if (prevvalue < 256) { 8275 const IV ceilvalue = value < 256 ? value : 255; 8276 IV i; 8277 #ifdef EBCDIC 8278 /* In EBCDIC [\x89-\x91] should include 8279 * the \x8e but [i-j] should not. */ 8280 if (literal_endpoint == 2 && 8281 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) || 8282 (isUPPER(prevvalue) && isUPPER(ceilvalue)))) 8283 { 8284 if (isLOWER(prevvalue)) { 8285 for (i = prevvalue; i <= ceilvalue; i++) 8286 if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) { 8287 stored++; 8288 ANYOF_BITMAP_SET(ret, i); 8289 } 8290 } else { 8291 for (i = prevvalue; i <= ceilvalue; i++) 8292 if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) { 8293 stored++; 8294 ANYOF_BITMAP_SET(ret, i); 8295 } 8296 } 8297 } 8298 else 8299 #endif 8300 for (i = prevvalue; i <= ceilvalue; i++) { 8301 if (!ANYOF_BITMAP_TEST(ret,i)) { 8302 stored++; 8303 ANYOF_BITMAP_SET(ret, i); 8304 } 8305 } 8306 } 8307 if (value > 255 || UTF) { 8308 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue); 8309 const UV natvalue = NATIVE_TO_UNI(value); 8310 stored+=2; /* can't optimize this class */ 8311 ANYOF_FLAGS(ret) |= ANYOF_UNICODE; 8312 if (prevnatvalue < natvalue) { /* what about > ? */ 8313 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n", 8314 prevnatvalue, natvalue); 8315 } 8316 else if (prevnatvalue == natvalue) { 8317 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue); 8318 if (FOLD) { 8319 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; 8320 STRLEN foldlen; 8321 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen); 8322 8323 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */ 8324 if (RExC_precomp[0] == ':' && 8325 RExC_precomp[1] == '[' && 8326 (f == 0xDF || f == 0x92)) { 8327 f = NATIVE_TO_UNI(f); 8328 } 8329 #endif 8330 /* If folding and foldable and a single 8331 * character, insert also the folded version 8332 * to the charclass. */ 8333 if (f != value) { 8334 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */ 8335 if ((RExC_precomp[0] == ':' && 8336 RExC_precomp[1] == '[' && 8337 (f == 0xA2 && 8338 (value == 0xFB05 || value == 0xFB06))) ? 8339 foldlen == ((STRLEN)UNISKIP(f) - 1) : 8340 foldlen == (STRLEN)UNISKIP(f) ) 8341 #else 8342 if (foldlen == (STRLEN)UNISKIP(f)) 8343 #endif 8344 Perl_sv_catpvf(aTHX_ listsv, 8345 "%04"UVxf"\n", f); 8346 else { 8347 /* Any multicharacter foldings 8348 * require the following transform: 8349 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst) 8350 * where E folds into "pq" and F folds 8351 * into "rst", all other characters 8352 * fold to single characters. We save 8353 * away these multicharacter foldings, 8354 * to be later saved as part of the 8355 * additional "s" data. */ 8356 SV *sv; 8357 8358 if (!unicode_alternate) 8359 unicode_alternate = newAV(); 8360 sv = newSVpvn_utf8((char*)foldbuf, foldlen, 8361 TRUE); 8362 av_push(unicode_alternate, sv); 8363 } 8364 } 8365 8366 /* If folding and the value is one of the Greek 8367 * sigmas insert a few more sigmas to make the 8368 * folding rules of the sigmas to work right. 8369 * Note that not all the possible combinations 8370 * are handled here: some of them are handled 8371 * by the standard folding rules, and some of 8372 * them (literal or EXACTF cases) are handled 8373 * during runtime in regexec.c:S_find_byclass(). */ 8374 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) { 8375 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", 8376 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA); 8377 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", 8378 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA); 8379 } 8380 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA) 8381 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", 8382 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA); 8383 } 8384 } 8385 } 8386 #ifdef EBCDIC 8387 literal_endpoint = 0; 8388 #endif 8389 } 8390 8391 range = 0; /* this range (if it was one) is done now */ 8392 } 8393 8394 if (need_class) { 8395 ANYOF_FLAGS(ret) |= ANYOF_LARGE; 8396 if (SIZE_ONLY) 8397 RExC_size += ANYOF_CLASS_ADD_SKIP; 8398 else 8399 RExC_emit += ANYOF_CLASS_ADD_SKIP; 8400 } 8401 8402 8403 if (SIZE_ONLY) 8404 return ret; 8405 /****** !SIZE_ONLY AFTER HERE *********/ 8406 8407 if( stored == 1 && (value < 128 || (value < 256 && !UTF)) 8408 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) ) 8409 ) { 8410 /* optimize single char class to an EXACT node 8411 but *only* when its not a UTF/high char */ 8412 const char * cur_parse= RExC_parse; 8413 RExC_emit = (regnode *)orig_emit; 8414 RExC_parse = (char *)orig_parse; 8415 ret = reg_node(pRExC_state, 8416 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT)); 8417 RExC_parse = (char *)cur_parse; 8418 *STRING(ret)= (char)value; 8419 STR_LEN(ret)= 1; 8420 RExC_emit += STR_SZ(1); 8421 SvREFCNT_dec(listsv); 8422 return ret; 8423 } 8424 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */ 8425 if ( /* If the only flag is folding (plus possibly inversion). */ 8426 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD) 8427 ) { 8428 for (value = 0; value < 256; ++value) { 8429 if (ANYOF_BITMAP_TEST(ret, value)) { 8430 UV fold = PL_fold[value]; 8431 8432 if (fold != value) 8433 ANYOF_BITMAP_SET(ret, fold); 8434 } 8435 } 8436 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD; 8437 } 8438 8439 /* optimize inverted simple patterns (e.g. [^a-z]) */ 8440 if (optimize_invert && 8441 /* If the only flag is inversion. */ 8442 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) { 8443 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value) 8444 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL; 8445 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL; 8446 } 8447 { 8448 AV * const av = newAV(); 8449 SV *rv; 8450 /* The 0th element stores the character class description 8451 * in its textual form: used later (regexec.c:Perl_regclass_swash()) 8452 * to initialize the appropriate swash (which gets stored in 8453 * the 1st element), and also useful for dumping the regnode. 8454 * The 2nd element stores the multicharacter foldings, 8455 * used later (regexec.c:S_reginclass()). */ 8456 av_store(av, 0, listsv); 8457 av_store(av, 1, NULL); 8458 av_store(av, 2, MUTABLE_SV(unicode_alternate)); 8459 rv = newRV_noinc(MUTABLE_SV(av)); 8460 n = add_data(pRExC_state, 1, "s"); 8461 RExC_rxi->data->data[n] = (void*)rv; 8462 ARG_SET(ret, n); 8463 } 8464 return ret; 8465 } 8466 #undef _C_C_T_ 8467 8468 8469 /* reg_skipcomment() 8470 8471 Absorbs an /x style # comments from the input stream. 8472 Returns true if there is more text remaining in the stream. 8473 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment 8474 terminates the pattern without including a newline. 8475 8476 Note its the callers responsibility to ensure that we are 8477 actually in /x mode 8478 8479 */ 8480 8481 STATIC bool 8482 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state) 8483 { 8484 bool ended = 0; 8485 8486 PERL_ARGS_ASSERT_REG_SKIPCOMMENT; 8487 8488 while (RExC_parse < RExC_end) 8489 if (*RExC_parse++ == '\n') { 8490 ended = 1; 8491 break; 8492 } 8493 if (!ended) { 8494 /* we ran off the end of the pattern without ending 8495 the comment, so we have to add an \n when wrapping */ 8496 RExC_seen |= REG_SEEN_RUN_ON_COMMENT; 8497 return 0; 8498 } else 8499 return 1; 8500 } 8501 8502 /* nextchar() 8503 8504 Advance that parse position, and optionall absorbs 8505 "whitespace" from the inputstream. 8506 8507 Without /x "whitespace" means (?#...) style comments only, 8508 with /x this means (?#...) and # comments and whitespace proper. 8509 8510 Returns the RExC_parse point from BEFORE the scan occurs. 8511 8512 This is the /x friendly way of saying RExC_parse++. 8513 */ 8514 8515 STATIC char* 8516 S_nextchar(pTHX_ RExC_state_t *pRExC_state) 8517 { 8518 char* const retval = RExC_parse++; 8519 8520 PERL_ARGS_ASSERT_NEXTCHAR; 8521 8522 for (;;) { 8523 if (*RExC_parse == '(' && RExC_parse[1] == '?' && 8524 RExC_parse[2] == '#') { 8525 while (*RExC_parse != ')') { 8526 if (RExC_parse == RExC_end) 8527 FAIL("Sequence (?#... not terminated"); 8528 RExC_parse++; 8529 } 8530 RExC_parse++; 8531 continue; 8532 } 8533 if (RExC_flags & RXf_PMf_EXTENDED) { 8534 if (isSPACE(*RExC_parse)) { 8535 RExC_parse++; 8536 continue; 8537 } 8538 else if (*RExC_parse == '#') { 8539 if ( reg_skipcomment( pRExC_state ) ) 8540 continue; 8541 } 8542 } 8543 return retval; 8544 } 8545 } 8546 8547 /* 8548 - reg_node - emit a node 8549 */ 8550 STATIC regnode * /* Location. */ 8551 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) 8552 { 8553 dVAR; 8554 register regnode *ptr; 8555 regnode * const ret = RExC_emit; 8556 GET_RE_DEBUG_FLAGS_DECL; 8557 8558 PERL_ARGS_ASSERT_REG_NODE; 8559 8560 if (SIZE_ONLY) { 8561 SIZE_ALIGN(RExC_size); 8562 RExC_size += 1; 8563 return(ret); 8564 } 8565 if (RExC_emit >= RExC_emit_bound) 8566 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op); 8567 8568 NODE_ALIGN_FILL(ret); 8569 ptr = ret; 8570 FILL_ADVANCE_NODE(ptr, op); 8571 #ifdef RE_TRACK_PATTERN_OFFSETS 8572 if (RExC_offsets) { /* MJD */ 8573 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n", 8574 "reg_node", __LINE__, 8575 PL_reg_name[op], 8576 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] 8577 ? "Overwriting end of array!\n" : "OK", 8578 (UV)(RExC_emit - RExC_emit_start), 8579 (UV)(RExC_parse - RExC_start), 8580 (UV)RExC_offsets[0])); 8581 Set_Node_Offset(RExC_emit, RExC_parse + (op == END)); 8582 } 8583 #endif 8584 RExC_emit = ptr; 8585 return(ret); 8586 } 8587 8588 /* 8589 - reganode - emit a node with an argument 8590 */ 8591 STATIC regnode * /* Location. */ 8592 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) 8593 { 8594 dVAR; 8595 register regnode *ptr; 8596 regnode * const ret = RExC_emit; 8597 GET_RE_DEBUG_FLAGS_DECL; 8598 8599 PERL_ARGS_ASSERT_REGANODE; 8600 8601 if (SIZE_ONLY) { 8602 SIZE_ALIGN(RExC_size); 8603 RExC_size += 2; 8604 /* 8605 We can't do this: 8606 8607 assert(2==regarglen[op]+1); 8608 8609 Anything larger than this has to allocate the extra amount. 8610 If we changed this to be: 8611 8612 RExC_size += (1 + regarglen[op]); 8613 8614 then it wouldn't matter. Its not clear what side effect 8615 might come from that so its not done so far. 8616 -- dmq 8617 */ 8618 return(ret); 8619 } 8620 if (RExC_emit >= RExC_emit_bound) 8621 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op); 8622 8623 NODE_ALIGN_FILL(ret); 8624 ptr = ret; 8625 FILL_ADVANCE_NODE_ARG(ptr, op, arg); 8626 #ifdef RE_TRACK_PATTERN_OFFSETS 8627 if (RExC_offsets) { /* MJD */ 8628 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n", 8629 "reganode", 8630 __LINE__, 8631 PL_reg_name[op], 8632 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ? 8633 "Overwriting end of array!\n" : "OK", 8634 (UV)(RExC_emit - RExC_emit_start), 8635 (UV)(RExC_parse - RExC_start), 8636 (UV)RExC_offsets[0])); 8637 Set_Cur_Node_Offset; 8638 } 8639 #endif 8640 RExC_emit = ptr; 8641 return(ret); 8642 } 8643 8644 /* 8645 - reguni - emit (if appropriate) a Unicode character 8646 */ 8647 STATIC STRLEN 8648 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s) 8649 { 8650 dVAR; 8651 8652 PERL_ARGS_ASSERT_REGUNI; 8653 8654 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s); 8655 } 8656 8657 /* 8658 - reginsert - insert an operator in front of already-emitted operand 8659 * 8660 * Means relocating the operand. 8661 */ 8662 STATIC void 8663 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) 8664 { 8665 dVAR; 8666 register regnode *src; 8667 register regnode *dst; 8668 register regnode *place; 8669 const int offset = regarglen[(U8)op]; 8670 const int size = NODE_STEP_REGNODE + offset; 8671 GET_RE_DEBUG_FLAGS_DECL; 8672 8673 PERL_ARGS_ASSERT_REGINSERT; 8674 PERL_UNUSED_ARG(depth); 8675 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */ 8676 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]); 8677 if (SIZE_ONLY) { 8678 RExC_size += size; 8679 return; 8680 } 8681 8682 src = RExC_emit; 8683 RExC_emit += size; 8684 dst = RExC_emit; 8685 if (RExC_open_parens) { 8686 int paren; 8687 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/ 8688 for ( paren=0 ; paren < RExC_npar ; paren++ ) { 8689 if ( RExC_open_parens[paren] >= opnd ) { 8690 /*DEBUG_PARSE_FMT("open"," - %d",size);*/ 8691 RExC_open_parens[paren] += size; 8692 } else { 8693 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/ 8694 } 8695 if ( RExC_close_parens[paren] >= opnd ) { 8696 /*DEBUG_PARSE_FMT("close"," - %d",size);*/ 8697 RExC_close_parens[paren] += size; 8698 } else { 8699 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/ 8700 } 8701 } 8702 } 8703 8704 while (src > opnd) { 8705 StructCopy(--src, --dst, regnode); 8706 #ifdef RE_TRACK_PATTERN_OFFSETS 8707 if (RExC_offsets) { /* MJD 20010112 */ 8708 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n", 8709 "reg_insert", 8710 __LINE__, 8711 PL_reg_name[op], 8712 (UV)(dst - RExC_emit_start) > RExC_offsets[0] 8713 ? "Overwriting end of array!\n" : "OK", 8714 (UV)(src - RExC_emit_start), 8715 (UV)(dst - RExC_emit_start), 8716 (UV)RExC_offsets[0])); 8717 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src)); 8718 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src)); 8719 } 8720 #endif 8721 } 8722 8723 8724 place = opnd; /* Op node, where operand used to be. */ 8725 #ifdef RE_TRACK_PATTERN_OFFSETS 8726 if (RExC_offsets) { /* MJD */ 8727 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n", 8728 "reginsert", 8729 __LINE__, 8730 PL_reg_name[op], 8731 (UV)(place - RExC_emit_start) > RExC_offsets[0] 8732 ? "Overwriting end of array!\n" : "OK", 8733 (UV)(place - RExC_emit_start), 8734 (UV)(RExC_parse - RExC_start), 8735 (UV)RExC_offsets[0])); 8736 Set_Node_Offset(place, RExC_parse); 8737 Set_Node_Length(place, 1); 8738 } 8739 #endif 8740 src = NEXTOPER(place); 8741 FILL_ADVANCE_NODE(place, op); 8742 Zero(src, offset, regnode); 8743 } 8744 8745 /* 8746 - regtail - set the next-pointer at the end of a node chain of p to val. 8747 - SEE ALSO: regtail_study 8748 */ 8749 /* TODO: All three parms should be const */ 8750 STATIC void 8751 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth) 8752 { 8753 dVAR; 8754 register regnode *scan; 8755 GET_RE_DEBUG_FLAGS_DECL; 8756 8757 PERL_ARGS_ASSERT_REGTAIL; 8758 #ifndef DEBUGGING 8759 PERL_UNUSED_ARG(depth); 8760 #endif 8761 8762 if (SIZE_ONLY) 8763 return; 8764 8765 /* Find last node. */ 8766 scan = p; 8767 for (;;) { 8768 regnode * const temp = regnext(scan); 8769 DEBUG_PARSE_r({ 8770 SV * const mysv=sv_newmortal(); 8771 DEBUG_PARSE_MSG((scan==p ? "tail" : "")); 8772 regprop(RExC_rx, mysv, scan); 8773 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n", 8774 SvPV_nolen_const(mysv), REG_NODE_NUM(scan), 8775 (temp == NULL ? "->" : ""), 8776 (temp == NULL ? PL_reg_name[OP(val)] : "") 8777 ); 8778 }); 8779 if (temp == NULL) 8780 break; 8781 scan = temp; 8782 } 8783 8784 if (reg_off_by_arg[OP(scan)]) { 8785 ARG_SET(scan, val - scan); 8786 } 8787 else { 8788 NEXT_OFF(scan) = val - scan; 8789 } 8790 } 8791 8792 #ifdef DEBUGGING 8793 /* 8794 - regtail_study - set the next-pointer at the end of a node chain of p to val. 8795 - Look for optimizable sequences at the same time. 8796 - currently only looks for EXACT chains. 8797 8798 This is expermental code. The idea is to use this routine to perform 8799 in place optimizations on branches and groups as they are constructed, 8800 with the long term intention of removing optimization from study_chunk so 8801 that it is purely analytical. 8802 8803 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used 8804 to control which is which. 8805 8806 */ 8807 /* TODO: All four parms should be const */ 8808 8809 STATIC U8 8810 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth) 8811 { 8812 dVAR; 8813 register regnode *scan; 8814 U8 exact = PSEUDO; 8815 #ifdef EXPERIMENTAL_INPLACESCAN 8816 I32 min = 0; 8817 #endif 8818 GET_RE_DEBUG_FLAGS_DECL; 8819 8820 PERL_ARGS_ASSERT_REGTAIL_STUDY; 8821 8822 8823 if (SIZE_ONLY) 8824 return exact; 8825 8826 /* Find last node. */ 8827 8828 scan = p; 8829 for (;;) { 8830 regnode * const temp = regnext(scan); 8831 #ifdef EXPERIMENTAL_INPLACESCAN 8832 if (PL_regkind[OP(scan)] == EXACT) 8833 if (join_exact(pRExC_state,scan,&min,1,val,depth+1)) 8834 return EXACT; 8835 #endif 8836 if ( exact ) { 8837 switch (OP(scan)) { 8838 case EXACT: 8839 case EXACTF: 8840 case EXACTFL: 8841 if( exact == PSEUDO ) 8842 exact= OP(scan); 8843 else if ( exact != OP(scan) ) 8844 exact= 0; 8845 case NOTHING: 8846 break; 8847 default: 8848 exact= 0; 8849 } 8850 } 8851 DEBUG_PARSE_r({ 8852 SV * const mysv=sv_newmortal(); 8853 DEBUG_PARSE_MSG((scan==p ? "tsdy" : "")); 8854 regprop(RExC_rx, mysv, scan); 8855 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n", 8856 SvPV_nolen_const(mysv), 8857 REG_NODE_NUM(scan), 8858 PL_reg_name[exact]); 8859 }); 8860 if (temp == NULL) 8861 break; 8862 scan = temp; 8863 } 8864 DEBUG_PARSE_r({ 8865 SV * const mysv_val=sv_newmortal(); 8866 DEBUG_PARSE_MSG(""); 8867 regprop(RExC_rx, mysv_val, val); 8868 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n", 8869 SvPV_nolen_const(mysv_val), 8870 (IV)REG_NODE_NUM(val), 8871 (IV)(val - scan) 8872 ); 8873 }); 8874 if (reg_off_by_arg[OP(scan)]) { 8875 ARG_SET(scan, val - scan); 8876 } 8877 else { 8878 NEXT_OFF(scan) = val - scan; 8879 } 8880 8881 return exact; 8882 } 8883 #endif 8884 8885 /* 8886 - regcurly - a little FSA that accepts {\d+,?\d*} 8887 */ 8888 #ifndef PERL_IN_XSUB_RE 8889 I32 8890 Perl_regcurly(register const char *s) 8891 { 8892 PERL_ARGS_ASSERT_REGCURLY; 8893 8894 if (*s++ != '{') 8895 return FALSE; 8896 if (!isDIGIT(*s)) 8897 return FALSE; 8898 while (isDIGIT(*s)) 8899 s++; 8900 if (*s == ',') 8901 s++; 8902 while (isDIGIT(*s)) 8903 s++; 8904 if (*s != '}') 8905 return FALSE; 8906 return TRUE; 8907 } 8908 #endif 8909 8910 /* 8911 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form 8912 */ 8913 #ifdef DEBUGGING 8914 static void 8915 S_regdump_extflags(pTHX_ const char *lead, const U32 flags) 8916 { 8917 int bit; 8918 int set=0; 8919 8920 for (bit=0; bit<32; bit++) { 8921 if (flags & (1<<bit)) { 8922 if (!set++ && lead) 8923 PerlIO_printf(Perl_debug_log, "%s",lead); 8924 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]); 8925 } 8926 } 8927 if (lead) { 8928 if (set) 8929 PerlIO_printf(Perl_debug_log, "\n"); 8930 else 8931 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead); 8932 } 8933 } 8934 #endif 8935 8936 void 8937 Perl_regdump(pTHX_ const regexp *r) 8938 { 8939 #ifdef DEBUGGING 8940 dVAR; 8941 SV * const sv = sv_newmortal(); 8942 SV *dsv= sv_newmortal(); 8943 RXi_GET_DECL(r,ri); 8944 GET_RE_DEBUG_FLAGS_DECL; 8945 8946 PERL_ARGS_ASSERT_REGDUMP; 8947 8948 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0); 8949 8950 /* Header fields of interest. */ 8951 if (r->anchored_substr) { 8952 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr), 8953 RE_SV_DUMPLEN(r->anchored_substr), 30); 8954 PerlIO_printf(Perl_debug_log, 8955 "anchored %s%s at %"IVdf" ", 8956 s, RE_SV_TAIL(r->anchored_substr), 8957 (IV)r->anchored_offset); 8958 } else if (r->anchored_utf8) { 8959 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8), 8960 RE_SV_DUMPLEN(r->anchored_utf8), 30); 8961 PerlIO_printf(Perl_debug_log, 8962 "anchored utf8 %s%s at %"IVdf" ", 8963 s, RE_SV_TAIL(r->anchored_utf8), 8964 (IV)r->anchored_offset); 8965 } 8966 if (r->float_substr) { 8967 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr), 8968 RE_SV_DUMPLEN(r->float_substr), 30); 8969 PerlIO_printf(Perl_debug_log, 8970 "floating %s%s at %"IVdf"..%"UVuf" ", 8971 s, RE_SV_TAIL(r->float_substr), 8972 (IV)r->float_min_offset, (UV)r->float_max_offset); 8973 } else if (r->float_utf8) { 8974 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8), 8975 RE_SV_DUMPLEN(r->float_utf8), 30); 8976 PerlIO_printf(Perl_debug_log, 8977 "floating utf8 %s%s at %"IVdf"..%"UVuf" ", 8978 s, RE_SV_TAIL(r->float_utf8), 8979 (IV)r->float_min_offset, (UV)r->float_max_offset); 8980 } 8981 if (r->check_substr || r->check_utf8) 8982 PerlIO_printf(Perl_debug_log, 8983 (const char *) 8984 (r->check_substr == r->float_substr 8985 && r->check_utf8 == r->float_utf8 8986 ? "(checking floating" : "(checking anchored")); 8987 if (r->extflags & RXf_NOSCAN) 8988 PerlIO_printf(Perl_debug_log, " noscan"); 8989 if (r->extflags & RXf_CHECK_ALL) 8990 PerlIO_printf(Perl_debug_log, " isall"); 8991 if (r->check_substr || r->check_utf8) 8992 PerlIO_printf(Perl_debug_log, ") "); 8993 8994 if (ri->regstclass) { 8995 regprop(r, sv, ri->regstclass); 8996 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv)); 8997 } 8998 if (r->extflags & RXf_ANCH) { 8999 PerlIO_printf(Perl_debug_log, "anchored"); 9000 if (r->extflags & RXf_ANCH_BOL) 9001 PerlIO_printf(Perl_debug_log, "(BOL)"); 9002 if (r->extflags & RXf_ANCH_MBOL) 9003 PerlIO_printf(Perl_debug_log, "(MBOL)"); 9004 if (r->extflags & RXf_ANCH_SBOL) 9005 PerlIO_printf(Perl_debug_log, "(SBOL)"); 9006 if (r->extflags & RXf_ANCH_GPOS) 9007 PerlIO_printf(Perl_debug_log, "(GPOS)"); 9008 PerlIO_putc(Perl_debug_log, ' '); 9009 } 9010 if (r->extflags & RXf_GPOS_SEEN) 9011 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs); 9012 if (r->intflags & PREGf_SKIP) 9013 PerlIO_printf(Perl_debug_log, "plus "); 9014 if (r->intflags & PREGf_IMPLICIT) 9015 PerlIO_printf(Perl_debug_log, "implicit "); 9016 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen); 9017 if (r->extflags & RXf_EVAL_SEEN) 9018 PerlIO_printf(Perl_debug_log, "with eval "); 9019 PerlIO_printf(Perl_debug_log, "\n"); 9020 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags)); 9021 #else 9022 PERL_ARGS_ASSERT_REGDUMP; 9023 PERL_UNUSED_CONTEXT; 9024 PERL_UNUSED_ARG(r); 9025 #endif /* DEBUGGING */ 9026 } 9027 9028 /* 9029 - regprop - printable representation of opcode 9030 */ 9031 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \ 9032 STMT_START { \ 9033 if (do_sep) { \ 9034 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \ 9035 if (flags & ANYOF_INVERT) \ 9036 /*make sure the invert info is in each */ \ 9037 sv_catpvs(sv, "^"); \ 9038 do_sep = 0; \ 9039 } \ 9040 } STMT_END 9041 9042 void 9043 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) 9044 { 9045 #ifdef DEBUGGING 9046 dVAR; 9047 register int k; 9048 RXi_GET_DECL(prog,progi); 9049 GET_RE_DEBUG_FLAGS_DECL; 9050 9051 PERL_ARGS_ASSERT_REGPROP; 9052 9053 sv_setpvs(sv, ""); 9054 9055 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */ 9056 /* It would be nice to FAIL() here, but this may be called from 9057 regexec.c, and it would be hard to supply pRExC_state. */ 9058 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX); 9059 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */ 9060 9061 k = PL_regkind[OP(o)]; 9062 9063 if (k == EXACT) { 9064 sv_catpvs(sv, " "); 9065 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT) 9066 * is a crude hack but it may be the best for now since 9067 * we have no flag "this EXACTish node was UTF-8" 9068 * --jhi */ 9069 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1], 9070 PERL_PV_ESCAPE_UNI_DETECT | 9071 PERL_PV_PRETTY_ELLIPSES | 9072 PERL_PV_PRETTY_LTGT | 9073 PERL_PV_PRETTY_NOCLEAR 9074 ); 9075 } else if (k == TRIE) { 9076 /* print the details of the trie in dumpuntil instead, as 9077 * progi->data isn't available here */ 9078 const char op = OP(o); 9079 const U32 n = ARG(o); 9080 const reg_ac_data * const ac = IS_TRIE_AC(op) ? 9081 (reg_ac_data *)progi->data->data[n] : 9082 NULL; 9083 const reg_trie_data * const trie 9084 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie]; 9085 9086 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]); 9087 DEBUG_TRIE_COMPILE_r( 9088 Perl_sv_catpvf(aTHX_ sv, 9089 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">", 9090 (UV)trie->startstate, 9091 (IV)trie->statecount-1, /* -1 because of the unused 0 element */ 9092 (UV)trie->wordcount, 9093 (UV)trie->minlen, 9094 (UV)trie->maxlen, 9095 (UV)TRIE_CHARCOUNT(trie), 9096 (UV)trie->uniquecharcount 9097 ) 9098 ); 9099 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) { 9100 int i; 9101 int rangestart = -1; 9102 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie); 9103 sv_catpvs(sv, "["); 9104 for (i = 0; i <= 256; i++) { 9105 if (i < 256 && BITMAP_TEST(bitmap,i)) { 9106 if (rangestart == -1) 9107 rangestart = i; 9108 } else if (rangestart != -1) { 9109 if (i <= rangestart + 3) 9110 for (; rangestart < i; rangestart++) 9111 put_byte(sv, rangestart); 9112 else { 9113 put_byte(sv, rangestart); 9114 sv_catpvs(sv, "-"); 9115 put_byte(sv, i - 1); 9116 } 9117 rangestart = -1; 9118 } 9119 } 9120 sv_catpvs(sv, "]"); 9121 } 9122 9123 } else if (k == CURLY) { 9124 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX) 9125 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */ 9126 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o)); 9127 } 9128 else if (k == WHILEM && o->flags) /* Ordinal/of */ 9129 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4); 9130 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) { 9131 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */ 9132 if ( RXp_PAREN_NAMES(prog) ) { 9133 if ( k != REF || OP(o) < NREF) { 9134 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]); 9135 SV **name= av_fetch(list, ARG(o), 0 ); 9136 if (name) 9137 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name)); 9138 } 9139 else { 9140 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]); 9141 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]); 9142 I32 *nums=(I32*)SvPVX(sv_dat); 9143 SV **name= av_fetch(list, nums[0], 0 ); 9144 I32 n; 9145 if (name) { 9146 for ( n=0; n<SvIVX(sv_dat); n++ ) { 9147 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf, 9148 (n ? "," : ""), (IV)nums[n]); 9149 } 9150 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name)); 9151 } 9152 } 9153 } 9154 } else if (k == GOSUB) 9155 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */ 9156 else if (k == VERB) { 9157 if (!o->flags) 9158 Perl_sv_catpvf(aTHX_ sv, ":%"SVf, 9159 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ])))); 9160 } else if (k == LOGICAL) 9161 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */ 9162 else if (k == FOLDCHAR) 9163 Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) ); 9164 else if (k == ANYOF) { 9165 int i, rangestart = -1; 9166 const U8 flags = ANYOF_FLAGS(o); 9167 int do_sep = 0; 9168 9169 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */ 9170 static const char * const anyofs[] = { 9171 "\\w", 9172 "\\W", 9173 "\\s", 9174 "\\S", 9175 "\\d", 9176 "\\D", 9177 "[:alnum:]", 9178 "[:^alnum:]", 9179 "[:alpha:]", 9180 "[:^alpha:]", 9181 "[:ascii:]", 9182 "[:^ascii:]", 9183 "[:cntrl:]", 9184 "[:^cntrl:]", 9185 "[:graph:]", 9186 "[:^graph:]", 9187 "[:lower:]", 9188 "[:^lower:]", 9189 "[:print:]", 9190 "[:^print:]", 9191 "[:punct:]", 9192 "[:^punct:]", 9193 "[:upper:]", 9194 "[:^upper:]", 9195 "[:xdigit:]", 9196 "[:^xdigit:]", 9197 "[:space:]", 9198 "[:^space:]", 9199 "[:blank:]", 9200 "[:^blank:]" 9201 }; 9202 9203 if (flags & ANYOF_LOCALE) 9204 sv_catpvs(sv, "{loc}"); 9205 if (flags & ANYOF_FOLD) 9206 sv_catpvs(sv, "{i}"); 9207 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); 9208 if (flags & ANYOF_INVERT) 9209 sv_catpvs(sv, "^"); 9210 9211 /* output what the standard cp 0-255 bitmap matches */ 9212 for (i = 0; i <= 256; i++) { 9213 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) { 9214 if (rangestart == -1) 9215 rangestart = i; 9216 } else if (rangestart != -1) { 9217 if (i <= rangestart + 3) 9218 for (; rangestart < i; rangestart++) 9219 put_byte(sv, rangestart); 9220 else { 9221 put_byte(sv, rangestart); 9222 sv_catpvs(sv, "-"); 9223 put_byte(sv, i - 1); 9224 } 9225 do_sep = 1; 9226 rangestart = -1; 9227 } 9228 } 9229 9230 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags); 9231 /* output any special charclass tests (used mostly under use locale) */ 9232 if (o->flags & ANYOF_CLASS) 9233 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++) 9234 if (ANYOF_CLASS_TEST(o,i)) { 9235 sv_catpv(sv, anyofs[i]); 9236 do_sep = 1; 9237 } 9238 9239 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags); 9240 9241 /* output information about the unicode matching */ 9242 if (flags & ANYOF_UNICODE) 9243 sv_catpvs(sv, "{unicode}"); 9244 else if (flags & ANYOF_UNICODE_ALL) 9245 sv_catpvs(sv, "{unicode_all}"); 9246 9247 { 9248 SV *lv; 9249 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0); 9250 9251 if (lv) { 9252 if (sw) { 9253 U8 s[UTF8_MAXBYTES_CASE+1]; 9254 9255 for (i = 0; i <= 256; i++) { /* just the first 256 */ 9256 uvchr_to_utf8(s, i); 9257 9258 if (i < 256 && swash_fetch(sw, s, TRUE)) { 9259 if (rangestart == -1) 9260 rangestart = i; 9261 } else if (rangestart != -1) { 9262 if (i <= rangestart + 3) 9263 for (; rangestart < i; rangestart++) { 9264 const U8 * const e = uvchr_to_utf8(s,rangestart); 9265 U8 *p; 9266 for(p = s; p < e; p++) 9267 put_byte(sv, *p); 9268 } 9269 else { 9270 const U8 *e = uvchr_to_utf8(s,rangestart); 9271 U8 *p; 9272 for (p = s; p < e; p++) 9273 put_byte(sv, *p); 9274 sv_catpvs(sv, "-"); 9275 e = uvchr_to_utf8(s, i-1); 9276 for (p = s; p < e; p++) 9277 put_byte(sv, *p); 9278 } 9279 rangestart = -1; 9280 } 9281 } 9282 9283 sv_catpvs(sv, "..."); /* et cetera */ 9284 } 9285 9286 { 9287 char *s = savesvpv(lv); 9288 char * const origs = s; 9289 9290 while (*s && *s != '\n') 9291 s++; 9292 9293 if (*s == '\n') { 9294 const char * const t = ++s; 9295 9296 while (*s) { 9297 if (*s == '\n') 9298 *s = ' '; 9299 s++; 9300 } 9301 if (s[-1] == ' ') 9302 s[-1] = 0; 9303 9304 sv_catpv(sv, t); 9305 } 9306 9307 Safefree(origs); 9308 } 9309 } 9310 } 9311 9312 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]); 9313 } 9314 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) 9315 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags)); 9316 #else 9317 PERL_UNUSED_CONTEXT; 9318 PERL_UNUSED_ARG(sv); 9319 PERL_UNUSED_ARG(o); 9320 PERL_UNUSED_ARG(prog); 9321 #endif /* DEBUGGING */ 9322 } 9323 9324 SV * 9325 Perl_re_intuit_string(pTHX_ REGEXP * const r) 9326 { /* Assume that RE_INTUIT is set */ 9327 dVAR; 9328 struct regexp *const prog = (struct regexp *)SvANY(r); 9329 GET_RE_DEBUG_FLAGS_DECL; 9330 9331 PERL_ARGS_ASSERT_RE_INTUIT_STRING; 9332 PERL_UNUSED_CONTEXT; 9333 9334 DEBUG_COMPILE_r( 9335 { 9336 const char * const s = SvPV_nolen_const(prog->check_substr 9337 ? prog->check_substr : prog->check_utf8); 9338 9339 if (!PL_colorset) reginitcolors(); 9340 PerlIO_printf(Perl_debug_log, 9341 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n", 9342 PL_colors[4], 9343 prog->check_substr ? "" : "utf8 ", 9344 PL_colors[5],PL_colors[0], 9345 s, 9346 PL_colors[1], 9347 (strlen(s) > 60 ? "..." : "")); 9348 } ); 9349 9350 return prog->check_substr ? prog->check_substr : prog->check_utf8; 9351 } 9352 9353 /* 9354 pregfree() 9355 9356 handles refcounting and freeing the perl core regexp structure. When 9357 it is necessary to actually free the structure the first thing it 9358 does is call the 'free' method of the regexp_engine associated to to 9359 the regexp, allowing the handling of the void *pprivate; member 9360 first. (This routine is not overridable by extensions, which is why 9361 the extensions free is called first.) 9362 9363 See regdupe and regdupe_internal if you change anything here. 9364 */ 9365 #ifndef PERL_IN_XSUB_RE 9366 void 9367 Perl_pregfree(pTHX_ REGEXP *r) 9368 { 9369 SvREFCNT_dec(r); 9370 } 9371 9372 void 9373 Perl_pregfree2(pTHX_ REGEXP *rx) 9374 { 9375 dVAR; 9376 struct regexp *const r = (struct regexp *)SvANY(rx); 9377 GET_RE_DEBUG_FLAGS_DECL; 9378 9379 PERL_ARGS_ASSERT_PREGFREE2; 9380 9381 if (r->mother_re) { 9382 ReREFCNT_dec(r->mother_re); 9383 } else { 9384 CALLREGFREE_PVT(rx); /* free the private data */ 9385 SvREFCNT_dec(RXp_PAREN_NAMES(r)); 9386 } 9387 if (r->substrs) { 9388 SvREFCNT_dec(r->anchored_substr); 9389 SvREFCNT_dec(r->anchored_utf8); 9390 SvREFCNT_dec(r->float_substr); 9391 SvREFCNT_dec(r->float_utf8); 9392 Safefree(r->substrs); 9393 } 9394 RX_MATCH_COPY_FREE(rx); 9395 #ifdef PERL_OLD_COPY_ON_WRITE 9396 SvREFCNT_dec(r->saved_copy); 9397 #endif 9398 Safefree(r->offs); 9399 } 9400 9401 /* reg_temp_copy() 9402 9403 This is a hacky workaround to the structural issue of match results 9404 being stored in the regexp structure which is in turn stored in 9405 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern 9406 could be PL_curpm in multiple contexts, and could require multiple 9407 result sets being associated with the pattern simultaneously, such 9408 as when doing a recursive match with (??{$qr}) 9409 9410 The solution is to make a lightweight copy of the regexp structure 9411 when a qr// is returned from the code executed by (??{$qr}) this 9412 lightweight copy doesnt actually own any of its data except for 9413 the starp/end and the actual regexp structure itself. 9414 9415 */ 9416 9417 9418 REGEXP * 9419 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx) 9420 { 9421 struct regexp *ret; 9422 struct regexp *const r = (struct regexp *)SvANY(rx); 9423 register const I32 npar = r->nparens+1; 9424 9425 PERL_ARGS_ASSERT_REG_TEMP_COPY; 9426 9427 if (!ret_x) 9428 ret_x = (REGEXP*) newSV_type(SVt_REGEXP); 9429 ret = (struct regexp *)SvANY(ret_x); 9430 9431 (void)ReREFCNT_inc(rx); 9432 /* We can take advantage of the existing "copied buffer" mechanism in SVs 9433 by pointing directly at the buffer, but flagging that the allocated 9434 space in the copy is zero. As we've just done a struct copy, it's now 9435 a case of zero-ing that, rather than copying the current length. */ 9436 SvPV_set(ret_x, RX_WRAPPED(rx)); 9437 SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8); 9438 memcpy(&(ret->xpv_cur), &(r->xpv_cur), 9439 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur)); 9440 SvLEN_set(ret_x, 0); 9441 SvSTASH_set(ret_x, NULL); 9442 SvMAGIC_set(ret_x, NULL); 9443 Newx(ret->offs, npar, regexp_paren_pair); 9444 Copy(r->offs, ret->offs, npar, regexp_paren_pair); 9445 if (r->substrs) { 9446 Newx(ret->substrs, 1, struct reg_substr_data); 9447 StructCopy(r->substrs, ret->substrs, struct reg_substr_data); 9448 9449 SvREFCNT_inc_void(ret->anchored_substr); 9450 SvREFCNT_inc_void(ret->anchored_utf8); 9451 SvREFCNT_inc_void(ret->float_substr); 9452 SvREFCNT_inc_void(ret->float_utf8); 9453 9454 /* check_substr and check_utf8, if non-NULL, point to either their 9455 anchored or float namesakes, and don't hold a second reference. */ 9456 } 9457 RX_MATCH_COPIED_off(ret_x); 9458 #ifdef PERL_OLD_COPY_ON_WRITE 9459 ret->saved_copy = NULL; 9460 #endif 9461 ret->mother_re = rx; 9462 9463 return ret_x; 9464 } 9465 #endif 9466 9467 /* regfree_internal() 9468 9469 Free the private data in a regexp. This is overloadable by 9470 extensions. Perl takes care of the regexp structure in pregfree(), 9471 this covers the *pprivate pointer which technically perldoesnt 9472 know about, however of course we have to handle the 9473 regexp_internal structure when no extension is in use. 9474 9475 Note this is called before freeing anything in the regexp 9476 structure. 9477 */ 9478 9479 void 9480 Perl_regfree_internal(pTHX_ REGEXP * const rx) 9481 { 9482 dVAR; 9483 struct regexp *const r = (struct regexp *)SvANY(rx); 9484 RXi_GET_DECL(r,ri); 9485 GET_RE_DEBUG_FLAGS_DECL; 9486 9487 PERL_ARGS_ASSERT_REGFREE_INTERNAL; 9488 9489 DEBUG_COMPILE_r({ 9490 if (!PL_colorset) 9491 reginitcolors(); 9492 { 9493 SV *dsv= sv_newmortal(); 9494 RE_PV_QUOTED_DECL(s, RX_UTF8(rx), 9495 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60); 9496 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n", 9497 PL_colors[4],PL_colors[5],s); 9498 } 9499 }); 9500 #ifdef RE_TRACK_PATTERN_OFFSETS 9501 if (ri->u.offsets) 9502 Safefree(ri->u.offsets); /* 20010421 MJD */ 9503 #endif 9504 if (ri->data) { 9505 int n = ri->data->count; 9506 PAD* new_comppad = NULL; 9507 PAD* old_comppad; 9508 PADOFFSET refcnt; 9509 9510 while (--n >= 0) { 9511 /* If you add a ->what type here, update the comment in regcomp.h */ 9512 switch (ri->data->what[n]) { 9513 case 's': 9514 case 'S': 9515 case 'u': 9516 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n])); 9517 break; 9518 case 'f': 9519 Safefree(ri->data->data[n]); 9520 break; 9521 case 'p': 9522 new_comppad = MUTABLE_AV(ri->data->data[n]); 9523 break; 9524 case 'o': 9525 if (new_comppad == NULL) 9526 Perl_croak(aTHX_ "panic: pregfree comppad"); 9527 PAD_SAVE_LOCAL(old_comppad, 9528 /* Watch out for global destruction's random ordering. */ 9529 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL 9530 ); 9531 OP_REFCNT_LOCK; 9532 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]); 9533 OP_REFCNT_UNLOCK; 9534 if (!refcnt) 9535 op_free((OP_4tree*)ri->data->data[n]); 9536 9537 PAD_RESTORE_LOCAL(old_comppad); 9538 SvREFCNT_dec(MUTABLE_SV(new_comppad)); 9539 new_comppad = NULL; 9540 break; 9541 case 'n': 9542 break; 9543 case 'T': 9544 { /* Aho Corasick add-on structure for a trie node. 9545 Used in stclass optimization only */ 9546 U32 refcount; 9547 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n]; 9548 OP_REFCNT_LOCK; 9549 refcount = --aho->refcount; 9550 OP_REFCNT_UNLOCK; 9551 if ( !refcount ) { 9552 PerlMemShared_free(aho->states); 9553 PerlMemShared_free(aho->fail); 9554 /* do this last!!!! */ 9555 PerlMemShared_free(ri->data->data[n]); 9556 PerlMemShared_free(ri->regstclass); 9557 } 9558 } 9559 break; 9560 case 't': 9561 { 9562 /* trie structure. */ 9563 U32 refcount; 9564 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n]; 9565 OP_REFCNT_LOCK; 9566 refcount = --trie->refcount; 9567 OP_REFCNT_UNLOCK; 9568 if ( !refcount ) { 9569 PerlMemShared_free(trie->charmap); 9570 PerlMemShared_free(trie->states); 9571 PerlMemShared_free(trie->trans); 9572 if (trie->bitmap) 9573 PerlMemShared_free(trie->bitmap); 9574 if (trie->wordlen) 9575 PerlMemShared_free(trie->wordlen); 9576 if (trie->jump) 9577 PerlMemShared_free(trie->jump); 9578 if (trie->nextword) 9579 PerlMemShared_free(trie->nextword); 9580 /* do this last!!!! */ 9581 PerlMemShared_free(ri->data->data[n]); 9582 } 9583 } 9584 break; 9585 default: 9586 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]); 9587 } 9588 } 9589 Safefree(ri->data->what); 9590 Safefree(ri->data); 9591 } 9592 9593 Safefree(ri); 9594 } 9595 9596 #define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t)) 9597 #define av_dup_inc(s,t) MUTABLE_AV(SvREFCNT_inc(sv_dup((const SV *)s,t))) 9598 #define hv_dup_inc(s,t) MUTABLE_HV(SvREFCNT_inc(sv_dup((const SV *)s,t))) 9599 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL) 9600 9601 /* 9602 re_dup - duplicate a regexp. 9603 9604 This routine is expected to clone a given regexp structure. It is only 9605 compiled under USE_ITHREADS. 9606 9607 After all of the core data stored in struct regexp is duplicated 9608 the regexp_engine.dupe method is used to copy any private data 9609 stored in the *pprivate pointer. This allows extensions to handle 9610 any duplication it needs to do. 9611 9612 See pregfree() and regfree_internal() if you change anything here. 9613 */ 9614 #if defined(USE_ITHREADS) 9615 #ifndef PERL_IN_XSUB_RE 9616 void 9617 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param) 9618 { 9619 dVAR; 9620 I32 npar; 9621 const struct regexp *r = (const struct regexp *)SvANY(sstr); 9622 struct regexp *ret = (struct regexp *)SvANY(dstr); 9623 9624 PERL_ARGS_ASSERT_RE_DUP_GUTS; 9625 9626 npar = r->nparens+1; 9627 Newx(ret->offs, npar, regexp_paren_pair); 9628 Copy(r->offs, ret->offs, npar, regexp_paren_pair); 9629 if(ret->swap) { 9630 /* no need to copy these */ 9631 Newx(ret->swap, npar, regexp_paren_pair); 9632 } 9633 9634 if (ret->substrs) { 9635 /* Do it this way to avoid reading from *r after the StructCopy(). 9636 That way, if any of the sv_dup_inc()s dislodge *r from the L1 9637 cache, it doesn't matter. */ 9638 const bool anchored = r->check_substr 9639 ? r->check_substr == r->anchored_substr 9640 : r->check_utf8 == r->anchored_utf8; 9641 Newx(ret->substrs, 1, struct reg_substr_data); 9642 StructCopy(r->substrs, ret->substrs, struct reg_substr_data); 9643 9644 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param); 9645 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param); 9646 ret->float_substr = sv_dup_inc(ret->float_substr, param); 9647 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param); 9648 9649 /* check_substr and check_utf8, if non-NULL, point to either their 9650 anchored or float namesakes, and don't hold a second reference. */ 9651 9652 if (ret->check_substr) { 9653 if (anchored) { 9654 assert(r->check_utf8 == r->anchored_utf8); 9655 ret->check_substr = ret->anchored_substr; 9656 ret->check_utf8 = ret->anchored_utf8; 9657 } else { 9658 assert(r->check_substr == r->float_substr); 9659 assert(r->check_utf8 == r->float_utf8); 9660 ret->check_substr = ret->float_substr; 9661 ret->check_utf8 = ret->float_utf8; 9662 } 9663 } else if (ret->check_utf8) { 9664 if (anchored) { 9665 ret->check_utf8 = ret->anchored_utf8; 9666 } else { 9667 ret->check_utf8 = ret->float_utf8; 9668 } 9669 } 9670 } 9671 9672 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param); 9673 9674 if (ret->pprivate) 9675 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param)); 9676 9677 if (RX_MATCH_COPIED(dstr)) 9678 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen); 9679 else 9680 ret->subbeg = NULL; 9681 #ifdef PERL_OLD_COPY_ON_WRITE 9682 ret->saved_copy = NULL; 9683 #endif 9684 9685 if (ret->mother_re) { 9686 if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) { 9687 /* Our storage points directly to our mother regexp, but that's 9688 1: a buffer in a different thread 9689 2: something we no longer hold a reference on 9690 so we need to copy it locally. */ 9691 /* Note we need to sue SvCUR() on our mother_re, because it, in 9692 turn, may well be pointing to its own mother_re. */ 9693 SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re), 9694 SvCUR(ret->mother_re)+1)); 9695 SvLEN_set(dstr, SvCUR(ret->mother_re)+1); 9696 } 9697 ret->mother_re = NULL; 9698 } 9699 ret->gofs = 0; 9700 } 9701 #endif /* PERL_IN_XSUB_RE */ 9702 9703 /* 9704 regdupe_internal() 9705 9706 This is the internal complement to regdupe() which is used to copy 9707 the structure pointed to by the *pprivate pointer in the regexp. 9708 This is the core version of the extension overridable cloning hook. 9709 The regexp structure being duplicated will be copied by perl prior 9710 to this and will be provided as the regexp *r argument, however 9711 with the /old/ structures pprivate pointer value. Thus this routine 9712 may override any copying normally done by perl. 9713 9714 It returns a pointer to the new regexp_internal structure. 9715 */ 9716 9717 void * 9718 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) 9719 { 9720 dVAR; 9721 struct regexp *const r = (struct regexp *)SvANY(rx); 9722 regexp_internal *reti; 9723 int len, npar; 9724 RXi_GET_DECL(r,ri); 9725 9726 PERL_ARGS_ASSERT_REGDUPE_INTERNAL; 9727 9728 npar = r->nparens+1; 9729 len = ProgLen(ri); 9730 9731 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal); 9732 Copy(ri->program, reti->program, len+1, regnode); 9733 9734 9735 reti->regstclass = NULL; 9736 9737 if (ri->data) { 9738 struct reg_data *d; 9739 const int count = ri->data->count; 9740 int i; 9741 9742 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *), 9743 char, struct reg_data); 9744 Newx(d->what, count, U8); 9745 9746 d->count = count; 9747 for (i = 0; i < count; i++) { 9748 d->what[i] = ri->data->what[i]; 9749 switch (d->what[i]) { 9750 /* legal options are one of: sSfpontTu 9751 see also regcomp.h and pregfree() */ 9752 case 's': 9753 case 'S': 9754 case 'p': /* actually an AV, but the dup function is identical. */ 9755 case 'u': /* actually an HV, but the dup function is identical. */ 9756 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param); 9757 break; 9758 case 'f': 9759 /* This is cheating. */ 9760 Newx(d->data[i], 1, struct regnode_charclass_class); 9761 StructCopy(ri->data->data[i], d->data[i], 9762 struct regnode_charclass_class); 9763 reti->regstclass = (regnode*)d->data[i]; 9764 break; 9765 case 'o': 9766 /* Compiled op trees are readonly and in shared memory, 9767 and can thus be shared without duplication. */ 9768 OP_REFCNT_LOCK; 9769 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]); 9770 OP_REFCNT_UNLOCK; 9771 break; 9772 case 'T': 9773 /* Trie stclasses are readonly and can thus be shared 9774 * without duplication. We free the stclass in pregfree 9775 * when the corresponding reg_ac_data struct is freed. 9776 */ 9777 reti->regstclass= ri->regstclass; 9778 /* Fall through */ 9779 case 't': 9780 OP_REFCNT_LOCK; 9781 ((reg_trie_data*)ri->data->data[i])->refcount++; 9782 OP_REFCNT_UNLOCK; 9783 /* Fall through */ 9784 case 'n': 9785 d->data[i] = ri->data->data[i]; 9786 break; 9787 default: 9788 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]); 9789 } 9790 } 9791 9792 reti->data = d; 9793 } 9794 else 9795 reti->data = NULL; 9796 9797 reti->name_list_idx = ri->name_list_idx; 9798 9799 #ifdef RE_TRACK_PATTERN_OFFSETS 9800 if (ri->u.offsets) { 9801 Newx(reti->u.offsets, 2*len+1, U32); 9802 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32); 9803 } 9804 #else 9805 SetProgLen(reti,len); 9806 #endif 9807 9808 return (void*)reti; 9809 } 9810 9811 #endif /* USE_ITHREADS */ 9812 9813 #ifndef PERL_IN_XSUB_RE 9814 9815 /* 9816 - regnext - dig the "next" pointer out of a node 9817 */ 9818 regnode * 9819 Perl_regnext(pTHX_ register regnode *p) 9820 { 9821 dVAR; 9822 register I32 offset; 9823 9824 if (!p) 9825 return(NULL); 9826 9827 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p)); 9828 if (offset == 0) 9829 return(NULL); 9830 9831 return(p+offset); 9832 } 9833 #endif 9834 9835 STATIC void 9836 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...) 9837 { 9838 va_list args; 9839 STRLEN l1 = strlen(pat1); 9840 STRLEN l2 = strlen(pat2); 9841 char buf[512]; 9842 SV *msv; 9843 const char *message; 9844 9845 PERL_ARGS_ASSERT_RE_CROAK2; 9846 9847 if (l1 > 510) 9848 l1 = 510; 9849 if (l1 + l2 > 510) 9850 l2 = 510 - l1; 9851 Copy(pat1, buf, l1 , char); 9852 Copy(pat2, buf + l1, l2 , char); 9853 buf[l1 + l2] = '\n'; 9854 buf[l1 + l2 + 1] = '\0'; 9855 #ifdef I_STDARG 9856 /* ANSI variant takes additional second argument */ 9857 va_start(args, pat2); 9858 #else 9859 va_start(args); 9860 #endif 9861 msv = vmess(buf, &args); 9862 va_end(args); 9863 message = SvPV_const(msv,l1); 9864 if (l1 > 512) 9865 l1 = 512; 9866 Copy(message, buf, l1 , char); 9867 buf[l1-1] = '\0'; /* Overwrite \n */ 9868 Perl_croak(aTHX_ "%s", buf); 9869 } 9870 9871 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */ 9872 9873 #ifndef PERL_IN_XSUB_RE 9874 void 9875 Perl_save_re_context(pTHX) 9876 { 9877 dVAR; 9878 9879 struct re_save_state *state; 9880 9881 SAVEVPTR(PL_curcop); 9882 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1); 9883 9884 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix); 9885 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE; 9886 SSPUSHINT(SAVEt_RE_STATE); 9887 9888 Copy(&PL_reg_state, state, 1, struct re_save_state); 9889 9890 PL_reg_start_tmp = 0; 9891 PL_reg_start_tmpl = 0; 9892 PL_reg_oldsaved = NULL; 9893 PL_reg_oldsavedlen = 0; 9894 PL_reg_maxiter = 0; 9895 PL_reg_leftiter = 0; 9896 PL_reg_poscache = NULL; 9897 PL_reg_poscache_size = 0; 9898 #ifdef PERL_OLD_COPY_ON_WRITE 9899 PL_nrs = NULL; 9900 #endif 9901 9902 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */ 9903 if (PL_curpm) { 9904 const REGEXP * const rx = PM_GETRE(PL_curpm); 9905 if (rx) { 9906 U32 i; 9907 for (i = 1; i <= RX_NPARENS(rx); i++) { 9908 char digits[TYPE_CHARS(long)]; 9909 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i); 9910 GV *const *const gvp 9911 = (GV**)hv_fetch(PL_defstash, digits, len, 0); 9912 9913 if (gvp) { 9914 GV * const gv = *gvp; 9915 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv)) 9916 save_scalar(gv); 9917 } 9918 } 9919 } 9920 } 9921 } 9922 #endif 9923 9924 static void 9925 clear_re(pTHX_ void *r) 9926 { 9927 dVAR; 9928 ReREFCNT_dec((REGEXP *)r); 9929 } 9930 9931 #ifdef DEBUGGING 9932 9933 STATIC void 9934 S_put_byte(pTHX_ SV *sv, int c) 9935 { 9936 PERL_ARGS_ASSERT_PUT_BYTE; 9937 9938 /* Our definition of isPRINT() ignores locales, so only bytes that are 9939 not part of UTF-8 are considered printable. I assume that the same 9940 holds for UTF-EBCDIC. 9941 Also, code point 255 is not printable in either (it's E0 in EBCDIC, 9942 which Wikipedia says: 9943 9944 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all 9945 ones (binary 1111 1111, hexadecimal FF). It is similar, but not 9946 identical, to the ASCII delete (DEL) or rubout control character. 9947 ) So the old condition can be simplified to !isPRINT(c) */ 9948 if (!isPRINT(c)) 9949 Perl_sv_catpvf(aTHX_ sv, "\\%o", c); 9950 else { 9951 const char string = c; 9952 if (c == '-' || c == ']' || c == '\\' || c == '^') 9953 sv_catpvs(sv, "\\"); 9954 sv_catpvn(sv, &string, 1); 9955 } 9956 } 9957 9958 9959 #define CLEAR_OPTSTART \ 9960 if (optstart) STMT_START { \ 9961 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \ 9962 optstart=NULL; \ 9963 } STMT_END 9964 9965 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1); 9966 9967 STATIC const regnode * 9968 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, 9969 const regnode *last, const regnode *plast, 9970 SV* sv, I32 indent, U32 depth) 9971 { 9972 dVAR; 9973 register U8 op = PSEUDO; /* Arbitrary non-END op. */ 9974 register const regnode *next; 9975 const regnode *optstart= NULL; 9976 9977 RXi_GET_DECL(r,ri); 9978 GET_RE_DEBUG_FLAGS_DECL; 9979 9980 PERL_ARGS_ASSERT_DUMPUNTIL; 9981 9982 #ifdef DEBUG_DUMPUNTIL 9983 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start, 9984 last ? last-start : 0,plast ? plast-start : 0); 9985 #endif 9986 9987 if (plast && plast < last) 9988 last= plast; 9989 9990 while (PL_regkind[op] != END && (!last || node < last)) { 9991 /* While that wasn't END last time... */ 9992 NODE_ALIGN(node); 9993 op = OP(node); 9994 if (op == CLOSE || op == WHILEM) 9995 indent--; 9996 next = regnext((regnode *)node); 9997 9998 /* Where, what. */ 9999 if (OP(node) == OPTIMIZED) { 10000 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE)) 10001 optstart = node; 10002 else 10003 goto after_print; 10004 } else 10005 CLEAR_OPTSTART; 10006 10007 regprop(r, sv, node); 10008 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start), 10009 (int)(2*indent + 1), "", SvPVX_const(sv)); 10010 10011 if (OP(node) != OPTIMIZED) { 10012 if (next == NULL) /* Next ptr. */ 10013 PerlIO_printf(Perl_debug_log, " (0)"); 10014 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH ) 10015 PerlIO_printf(Perl_debug_log, " (FAIL)"); 10016 else 10017 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start)); 10018 (void)PerlIO_putc(Perl_debug_log, '\n'); 10019 } 10020 10021 after_print: 10022 if (PL_regkind[(U8)op] == BRANCHJ) { 10023 assert(next); 10024 { 10025 register const regnode *nnode = (OP(next) == LONGJMP 10026 ? regnext((regnode *)next) 10027 : next); 10028 if (last && nnode > last) 10029 nnode = last; 10030 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode); 10031 } 10032 } 10033 else if (PL_regkind[(U8)op] == BRANCH) { 10034 assert(next); 10035 DUMPUNTIL(NEXTOPER(node), next); 10036 } 10037 else if ( PL_regkind[(U8)op] == TRIE ) { 10038 const regnode *this_trie = node; 10039 const char op = OP(node); 10040 const U32 n = ARG(node); 10041 const reg_ac_data * const ac = op>=AHOCORASICK ? 10042 (reg_ac_data *)ri->data->data[n] : 10043 NULL; 10044 const reg_trie_data * const trie = 10045 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie]; 10046 #ifdef DEBUGGING 10047 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]); 10048 #endif 10049 const regnode *nextbranch= NULL; 10050 I32 word_idx; 10051 sv_setpvs(sv, ""); 10052 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) { 10053 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0); 10054 10055 PerlIO_printf(Perl_debug_log, "%*s%s ", 10056 (int)(2*(indent+3)), "", 10057 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60, 10058 PL_colors[0], PL_colors[1], 10059 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) | 10060 PERL_PV_PRETTY_ELLIPSES | 10061 PERL_PV_PRETTY_LTGT 10062 ) 10063 : "???" 10064 ); 10065 if (trie->jump) { 10066 U16 dist= trie->jump[word_idx+1]; 10067 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n", 10068 (UV)((dist ? this_trie + dist : next) - start)); 10069 if (dist) { 10070 if (!nextbranch) 10071 nextbranch= this_trie + trie->jump[0]; 10072 DUMPUNTIL(this_trie + dist, nextbranch); 10073 } 10074 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH) 10075 nextbranch= regnext((regnode *)nextbranch); 10076 } else { 10077 PerlIO_printf(Perl_debug_log, "\n"); 10078 } 10079 } 10080 if (last && next > last) 10081 node= last; 10082 else 10083 node= next; 10084 } 10085 else if ( op == CURLY ) { /* "next" might be very big: optimizer */ 10086 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, 10087 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1); 10088 } 10089 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) { 10090 assert(next); 10091 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next); 10092 } 10093 else if ( op == PLUS || op == STAR) { 10094 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1); 10095 } 10096 else if (op == ANYOF) { 10097 /* arglen 1 + class block */ 10098 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE) 10099 ? ANYOF_CLASS_SKIP : ANYOF_SKIP); 10100 node = NEXTOPER(node); 10101 } 10102 else if (PL_regkind[(U8)op] == EXACT) { 10103 /* Literal string, where present. */ 10104 node += NODE_SZ_STR(node) - 1; 10105 node = NEXTOPER(node); 10106 } 10107 else { 10108 node = NEXTOPER(node); 10109 node += regarglen[(U8)op]; 10110 } 10111 if (op == CURLYX || op == OPEN) 10112 indent++; 10113 } 10114 CLEAR_OPTSTART; 10115 #ifdef DEBUG_DUMPUNTIL 10116 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent); 10117 #endif 10118 return node; 10119 } 10120 10121 #endif /* DEBUGGING */ 10122 10123 /* 10124 * Local variables: 10125 * c-indentation-style: bsd 10126 * c-basic-offset: 4 10127 * indent-tabs-mode: t 10128 * End: 10129 * 10130 * ex: set ts=8 sts=4 sw=4 noet: 10131 */ 10132