1 /* regcomp.c 2 */ 3 4 /* 5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee 6 * 7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"] 8 */ 9 10 /* This file contains functions for compiling a regular expression. See 11 * also regexec.c which funnily enough, contains functions for executing 12 * a regular expression. 13 * 14 * This file is also copied at build time to ext/re/re_comp.c, where 15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT. 16 * This causes the main functions to be compiled under new names and with 17 * debugging support added, which makes "use re 'debug'" work. 18 */ 19 20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not 21 * confused with the original package (see point 3 below). Thanks, Henry! 22 */ 23 24 /* Additional note: this code is very heavily munged from Henry's version 25 * in places. In some spots I've traded clarity for efficiency, so don't 26 * blame Henry for some of the lack of readability. 27 */ 28 29 /* The names of the functions have been changed from regcomp and 30 * regexec to pregcomp and pregexec in order to avoid conflicts 31 * with the POSIX routines of the same names. 32 */ 33 34 #ifdef PERL_EXT_RE_BUILD 35 #include "re_top.h" 36 #endif 37 38 /* 39 * pregcomp and pregexec -- regsub and regerror are not used in perl 40 * 41 * Copyright (c) 1986 by University of Toronto. 42 * Written by Henry Spencer. Not derived from licensed software. 43 * 44 * Permission is granted to anyone to use this software for any 45 * purpose on any computer system, and to redistribute it freely, 46 * subject to the following restrictions: 47 * 48 * 1. The author is not responsible for the consequences of use of 49 * this software, no matter how awful, even if they arise 50 * from defects in it. 51 * 52 * 2. The origin of this software must not be misrepresented, either 53 * by explicit claim or by omission. 54 * 55 * 3. Altered versions must be plainly marked as such, and must not 56 * be misrepresented as being the original software. 57 * 58 * 59 **** Alterations to Henry's code are... 60 **** 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 63 **** by Larry Wall and others 64 **** 65 **** You may distribute under the terms of either the GNU General Public 66 **** License or the Artistic License, as specified in the README file. 67 68 * 69 * Beware that some of this code is subtly aware of the way operator 70 * precedence is structured in regular expressions. Serious changes in 71 * regular-expression syntax might require a total rethink. 72 */ 73 #include "EXTERN.h" 74 #define PERL_IN_REGCOMP_C 75 #include "perl.h" 76 77 #ifndef PERL_IN_XSUB_RE 78 # include "INTERN.h" 79 #endif 80 81 #define REG_COMP_C 82 #ifdef PERL_IN_XSUB_RE 83 # include "re_comp.h" 84 #else 85 # include "regcomp.h" 86 #endif 87 88 #ifdef op 89 #undef op 90 #endif /* op */ 91 92 #ifdef MSDOS 93 # if defined(BUGGY_MSC6) 94 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */ 95 # pragma optimize("a",off) 96 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/ 97 # pragma optimize("w",on ) 98 # endif /* BUGGY_MSC6 */ 99 #endif /* MSDOS */ 100 101 #ifndef STATIC 102 #define STATIC static 103 #endif 104 105 typedef struct RExC_state_t { 106 U32 flags; /* are we folding, multilining? */ 107 char *precomp; /* uncompiled string. */ 108 regexp *rx; /* perl core regexp structure */ 109 regexp_internal *rxi; /* internal data for regexp object pprivate field */ 110 char *start; /* Start of input for compile */ 111 char *end; /* End of input for compile */ 112 char *parse; /* Input-scan pointer. */ 113 I32 whilem_seen; /* number of WHILEM in this expr */ 114 regnode *emit_start; /* Start of emitted-code area */ 115 regnode *emit_bound; /* First regnode outside of the allocated space */ 116 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */ 117 I32 naughty; /* How bad is this pattern? */ 118 I32 sawback; /* Did we see \1, ...? */ 119 U32 seen; 120 I32 size; /* Code size. */ 121 I32 npar; /* Capture buffer count, (OPEN). */ 122 I32 cpar; /* Capture buffer count, (CLOSE). */ 123 I32 nestroot; /* root parens we are in - used by accept */ 124 I32 extralen; 125 I32 seen_zerolen; 126 I32 seen_evals; 127 regnode **open_parens; /* pointers to open parens */ 128 regnode **close_parens; /* pointers to close parens */ 129 regnode *opend; /* END node in program */ 130 I32 utf8; /* whether the pattern is utf8 or not */ 131 I32 orig_utf8; /* whether the pattern was originally in utf8 */ 132 /* XXX use this for future optimisation of case 133 * where pattern must be upgraded to utf8. */ 134 HV *charnames; /* cache of named sequences */ 135 HV *paren_names; /* Paren names */ 136 137 regnode **recurse; /* Recurse regops */ 138 I32 recurse_count; /* Number of recurse regops */ 139 #if ADD_TO_REGEXEC 140 char *starttry; /* -Dr: where regtry was called. */ 141 #define RExC_starttry (pRExC_state->starttry) 142 #endif 143 #ifdef DEBUGGING 144 const char *lastparse; 145 I32 lastnum; 146 AV *paren_name_list; /* idx -> name */ 147 #define RExC_lastparse (pRExC_state->lastparse) 148 #define RExC_lastnum (pRExC_state->lastnum) 149 #define RExC_paren_name_list (pRExC_state->paren_name_list) 150 #endif 151 } RExC_state_t; 152 153 #define RExC_flags (pRExC_state->flags) 154 #define RExC_precomp (pRExC_state->precomp) 155 #define RExC_rx (pRExC_state->rx) 156 #define RExC_rxi (pRExC_state->rxi) 157 #define RExC_start (pRExC_state->start) 158 #define RExC_end (pRExC_state->end) 159 #define RExC_parse (pRExC_state->parse) 160 #define RExC_whilem_seen (pRExC_state->whilem_seen) 161 #ifdef RE_TRACK_PATTERN_OFFSETS 162 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */ 163 #endif 164 #define RExC_emit (pRExC_state->emit) 165 #define RExC_emit_start (pRExC_state->emit_start) 166 #define RExC_emit_bound (pRExC_state->emit_bound) 167 #define RExC_naughty (pRExC_state->naughty) 168 #define RExC_sawback (pRExC_state->sawback) 169 #define RExC_seen (pRExC_state->seen) 170 #define RExC_size (pRExC_state->size) 171 #define RExC_npar (pRExC_state->npar) 172 #define RExC_nestroot (pRExC_state->nestroot) 173 #define RExC_extralen (pRExC_state->extralen) 174 #define RExC_seen_zerolen (pRExC_state->seen_zerolen) 175 #define RExC_seen_evals (pRExC_state->seen_evals) 176 #define RExC_utf8 (pRExC_state->utf8) 177 #define RExC_orig_utf8 (pRExC_state->orig_utf8) 178 #define RExC_charnames (pRExC_state->charnames) 179 #define RExC_open_parens (pRExC_state->open_parens) 180 #define RExC_close_parens (pRExC_state->close_parens) 181 #define RExC_opend (pRExC_state->opend) 182 #define RExC_paren_names (pRExC_state->paren_names) 183 #define RExC_recurse (pRExC_state->recurse) 184 #define RExC_recurse_count (pRExC_state->recurse_count) 185 186 187 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?') 188 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \ 189 ((*s) == '{' && regcurly(s))) 190 191 #ifdef SPSTART 192 #undef SPSTART /* dratted cpp namespace... */ 193 #endif 194 /* 195 * Flags to be passed up and down. 196 */ 197 #define WORST 0 /* Worst case. */ 198 #define HASWIDTH 0x01 /* Known to match non-null strings. */ 199 #define SIMPLE 0x02 /* Simple enough to be STAR/PLUS operand. */ 200 #define SPSTART 0x04 /* Starts with * or +. */ 201 #define TRYAGAIN 0x08 /* Weeded out a declaration. */ 202 #define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */ 203 204 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1) 205 206 /* whether trie related optimizations are enabled */ 207 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION 208 #define TRIE_STUDY_OPT 209 #define FULL_TRIE_STUDY 210 #define TRIE_STCLASS 211 #endif 212 213 214 215 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3] 216 #define PBITVAL(paren) (1 << ((paren) & 7)) 217 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren)) 218 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren) 219 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren)) 220 221 222 /* About scan_data_t. 223 224 During optimisation we recurse through the regexp program performing 225 various inplace (keyhole style) optimisations. In addition study_chunk 226 and scan_commit populate this data structure with information about 227 what strings MUST appear in the pattern. We look for the longest 228 string that must appear for at a fixed location, and we look for the 229 longest string that may appear at a floating location. So for instance 230 in the pattern: 231 232 /FOO[xX]A.*B[xX]BAR/ 233 234 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating 235 strings (because they follow a .* construct). study_chunk will identify 236 both FOO and BAR as being the longest fixed and floating strings respectively. 237 238 The strings can be composites, for instance 239 240 /(f)(o)(o)/ 241 242 will result in a composite fixed substring 'foo'. 243 244 For each string some basic information is maintained: 245 246 - offset or min_offset 247 This is the position the string must appear at, or not before. 248 It also implicitly (when combined with minlenp) tells us how many 249 character must match before the string we are searching. 250 Likewise when combined with minlenp and the length of the string 251 tells us how many characters must appear after the string we have 252 found. 253 254 - max_offset 255 Only used for floating strings. This is the rightmost point that 256 the string can appear at. Ifset to I32 max it indicates that the 257 string can occur infinitely far to the right. 258 259 - minlenp 260 A pointer to the minimum length of the pattern that the string 261 was found inside. This is important as in the case of positive 262 lookahead or positive lookbehind we can have multiple patterns 263 involved. Consider 264 265 /(?=FOO).*F/ 266 267 The minimum length of the pattern overall is 3, the minimum length 268 of the lookahead part is 3, but the minimum length of the part that 269 will actually match is 1. So 'FOO's minimum length is 3, but the 270 minimum length for the F is 1. This is important as the minimum length 271 is used to determine offsets in front of and behind the string being 272 looked for. Since strings can be composites this is the length of the 273 pattern at the time it was commited with a scan_commit. Note that 274 the length is calculated by study_chunk, so that the minimum lengths 275 are not known until the full pattern has been compiled, thus the 276 pointer to the value. 277 278 - lookbehind 279 280 In the case of lookbehind the string being searched for can be 281 offset past the start point of the final matching string. 282 If this value was just blithely removed from the min_offset it would 283 invalidate some of the calculations for how many chars must match 284 before or after (as they are derived from min_offset and minlen and 285 the length of the string being searched for). 286 When the final pattern is compiled and the data is moved from the 287 scan_data_t structure into the regexp structure the information 288 about lookbehind is factored in, with the information that would 289 have been lost precalculated in the end_shift field for the 290 associated string. 291 292 The fields pos_min and pos_delta are used to store the minimum offset 293 and the delta to the maximum offset at the current point in the pattern. 294 295 */ 296 297 typedef struct scan_data_t { 298 /*I32 len_min; unused */ 299 /*I32 len_delta; unused */ 300 I32 pos_min; 301 I32 pos_delta; 302 SV *last_found; 303 I32 last_end; /* min value, <0 unless valid. */ 304 I32 last_start_min; 305 I32 last_start_max; 306 SV **longest; /* Either &l_fixed, or &l_float. */ 307 SV *longest_fixed; /* longest fixed string found in pattern */ 308 I32 offset_fixed; /* offset where it starts */ 309 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */ 310 I32 lookbehind_fixed; /* is the position of the string modfied by LB */ 311 SV *longest_float; /* longest floating string found in pattern */ 312 I32 offset_float_min; /* earliest point in string it can appear */ 313 I32 offset_float_max; /* latest point in string it can appear */ 314 I32 *minlen_float; /* pointer to the minlen relevent to the string */ 315 I32 lookbehind_float; /* is the position of the string modified by LB */ 316 I32 flags; 317 I32 whilem_c; 318 I32 *last_closep; 319 struct regnode_charclass_class *start_class; 320 } scan_data_t; 321 322 /* 323 * Forward declarations for pregcomp()'s friends. 324 */ 325 326 static const scan_data_t zero_scan_data = 327 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0}; 328 329 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL) 330 #define SF_BEFORE_SEOL 0x0001 331 #define SF_BEFORE_MEOL 0x0002 332 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL) 333 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL) 334 335 #ifdef NO_UNARY_PLUS 336 # define SF_FIX_SHIFT_EOL (0+2) 337 # define SF_FL_SHIFT_EOL (0+4) 338 #else 339 # define SF_FIX_SHIFT_EOL (+2) 340 # define SF_FL_SHIFT_EOL (+4) 341 #endif 342 343 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL) 344 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL) 345 346 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL) 347 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */ 348 #define SF_IS_INF 0x0040 349 #define SF_HAS_PAR 0x0080 350 #define SF_IN_PAR 0x0100 351 #define SF_HAS_EVAL 0x0200 352 #define SCF_DO_SUBSTR 0x0400 353 #define SCF_DO_STCLASS_AND 0x0800 354 #define SCF_DO_STCLASS_OR 0x1000 355 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR) 356 #define SCF_WHILEM_VISITED_POS 0x2000 357 358 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */ 359 #define SCF_SEEN_ACCEPT 0x8000 360 361 #define UTF (RExC_utf8 != 0) 362 #define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0) 363 #define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0) 364 365 #define OOB_UNICODE 12345678 366 #define OOB_NAMEDCLASS -1 367 368 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv)) 369 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b) 370 371 372 /* length of regex to show in messages that don't mark a position within */ 373 #define RegexLengthToShowInErrorMessages 127 374 375 /* 376 * If MARKER[12] are adjusted, be sure to adjust the constants at the top 377 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in 378 * op/pragma/warn/regcomp. 379 */ 380 #define MARKER1 "<-- HERE" /* marker as it appears in the description */ 381 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */ 382 383 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/" 384 385 /* 386 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given 387 * arg. Show regex, up to a maximum length. If it's too long, chop and add 388 * "...". 389 */ 390 #define _FAIL(code) STMT_START { \ 391 const char *ellipses = ""; \ 392 IV len = RExC_end - RExC_precomp; \ 393 \ 394 if (!SIZE_ONLY) \ 395 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \ 396 if (len > RegexLengthToShowInErrorMessages) { \ 397 /* chop 10 shorter than the max, to ensure meaning of "..." */ \ 398 len = RegexLengthToShowInErrorMessages - 10; \ 399 ellipses = "..."; \ 400 } \ 401 code; \ 402 } STMT_END 403 404 #define FAIL(msg) _FAIL( \ 405 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \ 406 msg, (int)len, RExC_precomp, ellipses)) 407 408 #define FAIL2(msg,arg) _FAIL( \ 409 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \ 410 arg, (int)len, RExC_precomp, ellipses)) 411 412 /* 413 * Simple_vFAIL -- like FAIL, but marks the current location in the scan 414 */ 415 #define Simple_vFAIL(m) STMT_START { \ 416 const IV offset = RExC_parse - RExC_precomp; \ 417 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \ 418 m, (int)offset, RExC_precomp, RExC_precomp + offset); \ 419 } STMT_END 420 421 /* 422 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL() 423 */ 424 #define vFAIL(m) STMT_START { \ 425 if (!SIZE_ONLY) \ 426 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \ 427 Simple_vFAIL(m); \ 428 } STMT_END 429 430 /* 431 * Like Simple_vFAIL(), but accepts two arguments. 432 */ 433 #define Simple_vFAIL2(m,a1) STMT_START { \ 434 const IV offset = RExC_parse - RExC_precomp; \ 435 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \ 436 (int)offset, RExC_precomp, RExC_precomp + offset); \ 437 } STMT_END 438 439 /* 440 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2(). 441 */ 442 #define vFAIL2(m,a1) STMT_START { \ 443 if (!SIZE_ONLY) \ 444 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \ 445 Simple_vFAIL2(m, a1); \ 446 } STMT_END 447 448 449 /* 450 * Like Simple_vFAIL(), but accepts three arguments. 451 */ 452 #define Simple_vFAIL3(m, a1, a2) STMT_START { \ 453 const IV offset = RExC_parse - RExC_precomp; \ 454 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \ 455 (int)offset, RExC_precomp, RExC_precomp + offset); \ 456 } STMT_END 457 458 /* 459 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3(). 460 */ 461 #define vFAIL3(m,a1,a2) STMT_START { \ 462 if (!SIZE_ONLY) \ 463 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \ 464 Simple_vFAIL3(m, a1, a2); \ 465 } STMT_END 466 467 /* 468 * Like Simple_vFAIL(), but accepts four arguments. 469 */ 470 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \ 471 const IV offset = RExC_parse - RExC_precomp; \ 472 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \ 473 (int)offset, RExC_precomp, RExC_precomp + offset); \ 474 } STMT_END 475 476 #define vWARN(loc,m) STMT_START { \ 477 const IV offset = loc - RExC_precomp; \ 478 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \ 479 m, (int)offset, RExC_precomp, RExC_precomp + offset); \ 480 } STMT_END 481 482 #define vWARNdep(loc,m) STMT_START { \ 483 const IV offset = loc - RExC_precomp; \ 484 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \ 485 "%s" REPORT_LOCATION, \ 486 m, (int)offset, RExC_precomp, RExC_precomp + offset); \ 487 } STMT_END 488 489 490 #define vWARN2(loc, m, a1) STMT_START { \ 491 const IV offset = loc - RExC_precomp; \ 492 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ 493 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \ 494 } STMT_END 495 496 #define vWARN3(loc, m, a1, a2) STMT_START { \ 497 const IV offset = loc - RExC_precomp; \ 498 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ 499 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \ 500 } STMT_END 501 502 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \ 503 const IV offset = loc - RExC_precomp; \ 504 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ 505 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \ 506 } STMT_END 507 508 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \ 509 const IV offset = loc - RExC_precomp; \ 510 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ 511 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \ 512 } STMT_END 513 514 515 /* Allow for side effects in s */ 516 #define REGC(c,s) STMT_START { \ 517 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \ 518 } STMT_END 519 520 /* Macros for recording node offsets. 20001227 mjd@plover.com 521 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in 522 * element 2*n-1 of the array. Element #2n holds the byte length node #n. 523 * Element 0 holds the number n. 524 * Position is 1 indexed. 525 */ 526 #ifndef RE_TRACK_PATTERN_OFFSETS 527 #define Set_Node_Offset_To_R(node,byte) 528 #define Set_Node_Offset(node,byte) 529 #define Set_Cur_Node_Offset 530 #define Set_Node_Length_To_R(node,len) 531 #define Set_Node_Length(node,len) 532 #define Set_Node_Cur_Length(node) 533 #define Node_Offset(n) 534 #define Node_Length(n) 535 #define Set_Node_Offset_Length(node,offset,len) 536 #define ProgLen(ri) ri->u.proglen 537 #define SetProgLen(ri,x) ri->u.proglen = x 538 #else 539 #define ProgLen(ri) ri->u.offsets[0] 540 #define SetProgLen(ri,x) ri->u.offsets[0] = x 541 #define Set_Node_Offset_To_R(node,byte) STMT_START { \ 542 if (! SIZE_ONLY) { \ 543 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \ 544 __LINE__, (int)(node), (int)(byte))); \ 545 if((node) < 0) { \ 546 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \ 547 } else { \ 548 RExC_offsets[2*(node)-1] = (byte); \ 549 } \ 550 } \ 551 } STMT_END 552 553 #define Set_Node_Offset(node,byte) \ 554 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start) 555 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse) 556 557 #define Set_Node_Length_To_R(node,len) STMT_START { \ 558 if (! SIZE_ONLY) { \ 559 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \ 560 __LINE__, (int)(node), (int)(len))); \ 561 if((node) < 0) { \ 562 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \ 563 } else { \ 564 RExC_offsets[2*(node)] = (len); \ 565 } \ 566 } \ 567 } STMT_END 568 569 #define Set_Node_Length(node,len) \ 570 Set_Node_Length_To_R((node)-RExC_emit_start, len) 571 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len) 572 #define Set_Node_Cur_Length(node) \ 573 Set_Node_Length(node, RExC_parse - parse_start) 574 575 /* Get offsets and lengths */ 576 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1]) 577 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)]) 578 579 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \ 580 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \ 581 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \ 582 } STMT_END 583 #endif 584 585 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS 586 #define EXPERIMENTAL_INPLACESCAN 587 #endif /*RE_TRACK_PATTERN_OFFSETS*/ 588 589 #define DEBUG_STUDYDATA(str,data,depth) \ 590 DEBUG_OPTIMISE_MORE_r(if(data){ \ 591 PerlIO_printf(Perl_debug_log, \ 592 "%*s" str "Pos:%"IVdf"/%"IVdf \ 593 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \ 594 (int)(depth)*2, "", \ 595 (IV)((data)->pos_min), \ 596 (IV)((data)->pos_delta), \ 597 (UV)((data)->flags), \ 598 (IV)((data)->whilem_c), \ 599 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \ 600 is_inf ? "INF " : "" \ 601 ); \ 602 if ((data)->last_found) \ 603 PerlIO_printf(Perl_debug_log, \ 604 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \ 605 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \ 606 SvPVX_const((data)->last_found), \ 607 (IV)((data)->last_end), \ 608 (IV)((data)->last_start_min), \ 609 (IV)((data)->last_start_max), \ 610 ((data)->longest && \ 611 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \ 612 SvPVX_const((data)->longest_fixed), \ 613 (IV)((data)->offset_fixed), \ 614 ((data)->longest && \ 615 (data)->longest==&((data)->longest_float)) ? "*" : "", \ 616 SvPVX_const((data)->longest_float), \ 617 (IV)((data)->offset_float_min), \ 618 (IV)((data)->offset_float_max) \ 619 ); \ 620 PerlIO_printf(Perl_debug_log,"\n"); \ 621 }); 622 623 static void clear_re(pTHX_ void *r); 624 625 /* Mark that we cannot extend a found fixed substring at this point. 626 Update the longest found anchored substring and the longest found 627 floating substrings if needed. */ 628 629 STATIC void 630 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf) 631 { 632 const STRLEN l = CHR_SVLEN(data->last_found); 633 const STRLEN old_l = CHR_SVLEN(*data->longest); 634 GET_RE_DEBUG_FLAGS_DECL; 635 636 PERL_ARGS_ASSERT_SCAN_COMMIT; 637 638 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) { 639 SvSetMagicSV(*data->longest, data->last_found); 640 if (*data->longest == data->longest_fixed) { 641 data->offset_fixed = l ? data->last_start_min : data->pos_min; 642 if (data->flags & SF_BEFORE_EOL) 643 data->flags 644 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL); 645 else 646 data->flags &= ~SF_FIX_BEFORE_EOL; 647 data->minlen_fixed=minlenp; 648 data->lookbehind_fixed=0; 649 } 650 else { /* *data->longest == data->longest_float */ 651 data->offset_float_min = l ? data->last_start_min : data->pos_min; 652 data->offset_float_max = (l 653 ? data->last_start_max 654 : data->pos_min + data->pos_delta); 655 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX) 656 data->offset_float_max = I32_MAX; 657 if (data->flags & SF_BEFORE_EOL) 658 data->flags 659 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL); 660 else 661 data->flags &= ~SF_FL_BEFORE_EOL; 662 data->minlen_float=minlenp; 663 data->lookbehind_float=0; 664 } 665 } 666 SvCUR_set(data->last_found, 0); 667 { 668 SV * const sv = data->last_found; 669 if (SvUTF8(sv) && SvMAGICAL(sv)) { 670 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8); 671 if (mg) 672 mg->mg_len = 0; 673 } 674 } 675 data->last_end = -1; 676 data->flags &= ~SF_BEFORE_EOL; 677 DEBUG_STUDYDATA("commit: ",data,0); 678 } 679 680 /* Can match anything (initialization) */ 681 STATIC void 682 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) 683 { 684 PERL_ARGS_ASSERT_CL_ANYTHING; 685 686 ANYOF_CLASS_ZERO(cl); 687 ANYOF_BITMAP_SETALL(cl); 688 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL; 689 if (LOC) 690 cl->flags |= ANYOF_LOCALE; 691 } 692 693 /* Can match anything (initialization) */ 694 STATIC int 695 S_cl_is_anything(const struct regnode_charclass_class *cl) 696 { 697 int value; 698 699 PERL_ARGS_ASSERT_CL_IS_ANYTHING; 700 701 for (value = 0; value <= ANYOF_MAX; value += 2) 702 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1)) 703 return 1; 704 if (!(cl->flags & ANYOF_UNICODE_ALL)) 705 return 0; 706 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl)) 707 return 0; 708 return 1; 709 } 710 711 /* Can match anything (initialization) */ 712 STATIC void 713 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) 714 { 715 PERL_ARGS_ASSERT_CL_INIT; 716 717 Zero(cl, 1, struct regnode_charclass_class); 718 cl->type = ANYOF; 719 cl_anything(pRExC_state, cl); 720 } 721 722 STATIC void 723 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) 724 { 725 PERL_ARGS_ASSERT_CL_INIT_ZERO; 726 727 Zero(cl, 1, struct regnode_charclass_class); 728 cl->type = ANYOF; 729 cl_anything(pRExC_state, cl); 730 if (LOC) 731 cl->flags |= ANYOF_LOCALE; 732 } 733 734 /* 'And' a given class with another one. Can create false positives */ 735 /* We assume that cl is not inverted */ 736 STATIC void 737 S_cl_and(struct regnode_charclass_class *cl, 738 const struct regnode_charclass_class *and_with) 739 { 740 PERL_ARGS_ASSERT_CL_AND; 741 742 assert(and_with->type == ANYOF); 743 if (!(and_with->flags & ANYOF_CLASS) 744 && !(cl->flags & ANYOF_CLASS) 745 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) 746 && !(and_with->flags & ANYOF_FOLD) 747 && !(cl->flags & ANYOF_FOLD)) { 748 int i; 749 750 if (and_with->flags & ANYOF_INVERT) 751 for (i = 0; i < ANYOF_BITMAP_SIZE; i++) 752 cl->bitmap[i] &= ~and_with->bitmap[i]; 753 else 754 for (i = 0; i < ANYOF_BITMAP_SIZE; i++) 755 cl->bitmap[i] &= and_with->bitmap[i]; 756 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */ 757 if (!(and_with->flags & ANYOF_EOS)) 758 cl->flags &= ~ANYOF_EOS; 759 760 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE && 761 !(and_with->flags & ANYOF_INVERT)) { 762 cl->flags &= ~ANYOF_UNICODE_ALL; 763 cl->flags |= ANYOF_UNICODE; 764 ARG_SET(cl, ARG(and_with)); 765 } 766 if (!(and_with->flags & ANYOF_UNICODE_ALL) && 767 !(and_with->flags & ANYOF_INVERT)) 768 cl->flags &= ~ANYOF_UNICODE_ALL; 769 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) && 770 !(and_with->flags & ANYOF_INVERT)) 771 cl->flags &= ~ANYOF_UNICODE; 772 } 773 774 /* 'OR' a given class with another one. Can create false positives */ 775 /* We assume that cl is not inverted */ 776 STATIC void 777 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with) 778 { 779 PERL_ARGS_ASSERT_CL_OR; 780 781 if (or_with->flags & ANYOF_INVERT) { 782 /* We do not use 783 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2)) 784 * <= (B1 | !B2) | (CL1 | !CL2) 785 * which is wasteful if CL2 is small, but we ignore CL2: 786 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1 787 * XXXX Can we handle case-fold? Unclear: 788 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) = 789 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i')) 790 */ 791 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) 792 && !(or_with->flags & ANYOF_FOLD) 793 && !(cl->flags & ANYOF_FOLD) ) { 794 int i; 795 796 for (i = 0; i < ANYOF_BITMAP_SIZE; i++) 797 cl->bitmap[i] |= ~or_with->bitmap[i]; 798 } /* XXXX: logic is complicated otherwise */ 799 else { 800 cl_anything(pRExC_state, cl); 801 } 802 } else { 803 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */ 804 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) 805 && (!(or_with->flags & ANYOF_FOLD) 806 || (cl->flags & ANYOF_FOLD)) ) { 807 int i; 808 809 /* OR char bitmap and class bitmap separately */ 810 for (i = 0; i < ANYOF_BITMAP_SIZE; i++) 811 cl->bitmap[i] |= or_with->bitmap[i]; 812 if (or_with->flags & ANYOF_CLASS) { 813 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++) 814 cl->classflags[i] |= or_with->classflags[i]; 815 cl->flags |= ANYOF_CLASS; 816 } 817 } 818 else { /* XXXX: logic is complicated, leave it along for a moment. */ 819 cl_anything(pRExC_state, cl); 820 } 821 } 822 if (or_with->flags & ANYOF_EOS) 823 cl->flags |= ANYOF_EOS; 824 825 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE && 826 ARG(cl) != ARG(or_with)) { 827 cl->flags |= ANYOF_UNICODE_ALL; 828 cl->flags &= ~ANYOF_UNICODE; 829 } 830 if (or_with->flags & ANYOF_UNICODE_ALL) { 831 cl->flags |= ANYOF_UNICODE_ALL; 832 cl->flags &= ~ANYOF_UNICODE; 833 } 834 } 835 836 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ] 837 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid ) 838 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate ) 839 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 ) 840 841 842 #ifdef DEBUGGING 843 /* 844 dump_trie(trie,widecharmap,revcharmap) 845 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc) 846 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc) 847 848 These routines dump out a trie in a somewhat readable format. 849 The _interim_ variants are used for debugging the interim 850 tables that are used to generate the final compressed 851 representation which is what dump_trie expects. 852 853 Part of the reason for their existance is to provide a form 854 of documentation as to how the different representations function. 855 856 */ 857 858 /* 859 Dumps the final compressed table form of the trie to Perl_debug_log. 860 Used for debugging make_trie(). 861 */ 862 863 STATIC void 864 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, 865 AV *revcharmap, U32 depth) 866 { 867 U32 state; 868 SV *sv=sv_newmortal(); 869 int colwidth= widecharmap ? 6 : 4; 870 GET_RE_DEBUG_FLAGS_DECL; 871 872 PERL_ARGS_ASSERT_DUMP_TRIE; 873 874 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ", 875 (int)depth * 2 + 2,"", 876 "Match","Base","Ofs" ); 877 878 for( state = 0 ; state < trie->uniquecharcount ; state++ ) { 879 SV ** const tmp = av_fetch( revcharmap, state, 0); 880 if ( tmp ) { 881 PerlIO_printf( Perl_debug_log, "%*s", 882 colwidth, 883 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, 884 PL_colors[0], PL_colors[1], 885 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | 886 PERL_PV_ESCAPE_FIRSTCHAR 887 ) 888 ); 889 } 890 } 891 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------", 892 (int)depth * 2 + 2,""); 893 894 for( state = 0 ; state < trie->uniquecharcount ; state++ ) 895 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------"); 896 PerlIO_printf( Perl_debug_log, "\n"); 897 898 for( state = 1 ; state < trie->statecount ; state++ ) { 899 const U32 base = trie->states[ state ].trans.base; 900 901 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state); 902 903 if ( trie->states[ state ].wordnum ) { 904 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum ); 905 } else { 906 PerlIO_printf( Perl_debug_log, "%6s", "" ); 907 } 908 909 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base ); 910 911 if ( base ) { 912 U32 ofs = 0; 913 914 while( ( base + ofs < trie->uniquecharcount ) || 915 ( base + ofs - trie->uniquecharcount < trie->lasttrans 916 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state)) 917 ofs++; 918 919 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs); 920 921 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) { 922 if ( ( base + ofs >= trie->uniquecharcount ) && 923 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) && 924 trie->trans[ base + ofs - trie->uniquecharcount ].check == state ) 925 { 926 PerlIO_printf( Perl_debug_log, "%*"UVXf, 927 colwidth, 928 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next ); 929 } else { 930 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." ); 931 } 932 } 933 934 PerlIO_printf( Perl_debug_log, "]"); 935 936 } 937 PerlIO_printf( Perl_debug_log, "\n" ); 938 } 939 } 940 /* 941 Dumps a fully constructed but uncompressed trie in list form. 942 List tries normally only are used for construction when the number of 943 possible chars (trie->uniquecharcount) is very high. 944 Used for debugging make_trie(). 945 */ 946 STATIC void 947 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, 948 HV *widecharmap, AV *revcharmap, U32 next_alloc, 949 U32 depth) 950 { 951 U32 state; 952 SV *sv=sv_newmortal(); 953 int colwidth= widecharmap ? 6 : 4; 954 GET_RE_DEBUG_FLAGS_DECL; 955 956 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST; 957 958 /* print out the table precompression. */ 959 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s", 960 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"", 961 "------:-----+-----------------\n" ); 962 963 for( state=1 ; state < next_alloc ; state ++ ) { 964 U16 charid; 965 966 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :", 967 (int)depth * 2 + 2,"", (UV)state ); 968 if ( ! trie->states[ state ].wordnum ) { 969 PerlIO_printf( Perl_debug_log, "%5s| ",""); 970 } else { 971 PerlIO_printf( Perl_debug_log, "W%4x| ", 972 trie->states[ state ].wordnum 973 ); 974 } 975 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) { 976 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0); 977 if ( tmp ) { 978 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ", 979 colwidth, 980 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, 981 PL_colors[0], PL_colors[1], 982 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | 983 PERL_PV_ESCAPE_FIRSTCHAR 984 ) , 985 TRIE_LIST_ITEM(state,charid).forid, 986 (UV)TRIE_LIST_ITEM(state,charid).newstate 987 ); 988 if (!(charid % 10)) 989 PerlIO_printf(Perl_debug_log, "\n%*s| ", 990 (int)((depth * 2) + 14), ""); 991 } 992 } 993 PerlIO_printf( Perl_debug_log, "\n"); 994 } 995 } 996 997 /* 998 Dumps a fully constructed but uncompressed trie in table form. 999 This is the normal DFA style state transition table, with a few 1000 twists to facilitate compression later. 1001 Used for debugging make_trie(). 1002 */ 1003 STATIC void 1004 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, 1005 HV *widecharmap, AV *revcharmap, U32 next_alloc, 1006 U32 depth) 1007 { 1008 U32 state; 1009 U16 charid; 1010 SV *sv=sv_newmortal(); 1011 int colwidth= widecharmap ? 6 : 4; 1012 GET_RE_DEBUG_FLAGS_DECL; 1013 1014 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE; 1015 1016 /* 1017 print out the table precompression so that we can do a visual check 1018 that they are identical. 1019 */ 1020 1021 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" ); 1022 1023 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) { 1024 SV ** const tmp = av_fetch( revcharmap, charid, 0); 1025 if ( tmp ) { 1026 PerlIO_printf( Perl_debug_log, "%*s", 1027 colwidth, 1028 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, 1029 PL_colors[0], PL_colors[1], 1030 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | 1031 PERL_PV_ESCAPE_FIRSTCHAR 1032 ) 1033 ); 1034 } 1035 } 1036 1037 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" ); 1038 1039 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) { 1040 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------"); 1041 } 1042 1043 PerlIO_printf( Perl_debug_log, "\n" ); 1044 1045 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) { 1046 1047 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ", 1048 (int)depth * 2 + 2,"", 1049 (UV)TRIE_NODENUM( state ) ); 1050 1051 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) { 1052 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ); 1053 if (v) 1054 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v ); 1055 else 1056 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." ); 1057 } 1058 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) { 1059 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check ); 1060 } else { 1061 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check, 1062 trie->states[ TRIE_NODENUM( state ) ].wordnum ); 1063 } 1064 } 1065 } 1066 1067 #endif 1068 1069 /* make_trie(startbranch,first,last,tail,word_count,flags,depth) 1070 startbranch: the first branch in the whole branch sequence 1071 first : start branch of sequence of branch-exact nodes. 1072 May be the same as startbranch 1073 last : Thing following the last branch. 1074 May be the same as tail. 1075 tail : item following the branch sequence 1076 count : words in the sequence 1077 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/ 1078 depth : indent depth 1079 1080 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node. 1081 1082 A trie is an N'ary tree where the branches are determined by digital 1083 decomposition of the key. IE, at the root node you look up the 1st character and 1084 follow that branch repeat until you find the end of the branches. Nodes can be 1085 marked as "accepting" meaning they represent a complete word. Eg: 1086 1087 /he|she|his|hers/ 1088 1089 would convert into the following structure. Numbers represent states, letters 1090 following numbers represent valid transitions on the letter from that state, if 1091 the number is in square brackets it represents an accepting state, otherwise it 1092 will be in parenthesis. 1093 1094 +-h->+-e->[3]-+-r->(8)-+-s->[9] 1095 | | 1096 | (2) 1097 | | 1098 (1) +-i->(6)-+-s->[7] 1099 | 1100 +-s->(3)-+-h->(4)-+-e->[5] 1101 1102 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers) 1103 1104 This shows that when matching against the string 'hers' we will begin at state 1 1105 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting, 1106 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which 1107 is also accepting. Thus we know that we can match both 'he' and 'hers' with a 1108 single traverse. We store a mapping from accepting to state to which word was 1109 matched, and then when we have multiple possibilities we try to complete the 1110 rest of the regex in the order in which they occured in the alternation. 1111 1112 The only prior NFA like behaviour that would be changed by the TRIE support is 1113 the silent ignoring of duplicate alternations which are of the form: 1114 1115 / (DUPE|DUPE) X? (?{ ... }) Y /x 1116 1117 Thus EVAL blocks follwing a trie may be called a different number of times with 1118 and without the optimisation. With the optimisations dupes will be silently 1119 ignored. This inconsistant behaviour of EVAL type nodes is well established as 1120 the following demonstrates: 1121 1122 'words'=~/(word|word|word)(?{ print $1 })[xyz]/ 1123 1124 which prints out 'word' three times, but 1125 1126 'words'=~/(word|word|word)(?{ print $1 })S/ 1127 1128 which doesnt print it out at all. This is due to other optimisations kicking in. 1129 1130 Example of what happens on a structural level: 1131 1132 The regexp /(ac|ad|ab)+/ will produce the folowing debug output: 1133 1134 1: CURLYM[1] {1,32767}(18) 1135 5: BRANCH(8) 1136 6: EXACT <ac>(16) 1137 8: BRANCH(11) 1138 9: EXACT <ad>(16) 1139 11: BRANCH(14) 1140 12: EXACT <ab>(16) 1141 16: SUCCEED(0) 1142 17: NOTHING(18) 1143 18: END(0) 1144 1145 This would be optimizable with startbranch=5, first=5, last=16, tail=16 1146 and should turn into: 1147 1148 1: CURLYM[1] {1,32767}(18) 1149 5: TRIE(16) 1150 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1] 1151 <ac> 1152 <ad> 1153 <ab> 1154 16: SUCCEED(0) 1155 17: NOTHING(18) 1156 18: END(0) 1157 1158 Cases where tail != last would be like /(?foo|bar)baz/: 1159 1160 1: BRANCH(4) 1161 2: EXACT <foo>(8) 1162 4: BRANCH(7) 1163 5: EXACT <bar>(8) 1164 7: TAIL(8) 1165 8: EXACT <baz>(10) 1166 10: END(0) 1167 1168 which would be optimizable with startbranch=1, first=1, last=7, tail=8 1169 and would end up looking like: 1170 1171 1: TRIE(8) 1172 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1] 1173 <foo> 1174 <bar> 1175 7: TAIL(8) 1176 8: EXACT <baz>(10) 1177 10: END(0) 1178 1179 d = uvuni_to_utf8_flags(d, uv, 0); 1180 1181 is the recommended Unicode-aware way of saying 1182 1183 *(d++) = uv; 1184 */ 1185 1186 #define TRIE_STORE_REVCHAR \ 1187 STMT_START { \ 1188 if (UTF) { \ 1189 SV *zlopp = newSV(2); \ 1190 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \ 1191 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \ 1192 SvCUR_set(zlopp, kapow - flrbbbbb); \ 1193 SvPOK_on(zlopp); \ 1194 SvUTF8_on(zlopp); \ 1195 av_push(revcharmap, zlopp); \ 1196 } else { \ 1197 char ooooff = (char)uvc; \ 1198 av_push(revcharmap, newSVpvn(&ooooff, 1)); \ 1199 } \ 1200 } STMT_END 1201 1202 #define TRIE_READ_CHAR STMT_START { \ 1203 wordlen++; \ 1204 if ( UTF ) { \ 1205 if ( folder ) { \ 1206 if ( foldlen > 0 ) { \ 1207 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \ 1208 foldlen -= len; \ 1209 scan += len; \ 1210 len = 0; \ 1211 } else { \ 1212 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\ 1213 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \ 1214 foldlen -= UNISKIP( uvc ); \ 1215 scan = foldbuf + UNISKIP( uvc ); \ 1216 } \ 1217 } else { \ 1218 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\ 1219 } \ 1220 } else { \ 1221 uvc = (U32)*uc; \ 1222 len = 1; \ 1223 } \ 1224 } STMT_END 1225 1226 1227 1228 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \ 1229 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \ 1230 U32 ging = TRIE_LIST_LEN( state ) *= 2; \ 1231 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \ 1232 } \ 1233 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \ 1234 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \ 1235 TRIE_LIST_CUR( state )++; \ 1236 } STMT_END 1237 1238 #define TRIE_LIST_NEW(state) STMT_START { \ 1239 Newxz( trie->states[ state ].trans.list, \ 1240 4, reg_trie_trans_le ); \ 1241 TRIE_LIST_CUR( state ) = 1; \ 1242 TRIE_LIST_LEN( state ) = 4; \ 1243 } STMT_END 1244 1245 #define TRIE_HANDLE_WORD(state) STMT_START { \ 1246 U16 dupe= trie->states[ state ].wordnum; \ 1247 regnode * const noper_next = regnext( noper ); \ 1248 \ 1249 if (trie->wordlen) \ 1250 trie->wordlen[ curword ] = wordlen; \ 1251 DEBUG_r({ \ 1252 /* store the word for dumping */ \ 1253 SV* tmp; \ 1254 if (OP(noper) != NOTHING) \ 1255 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \ 1256 else \ 1257 tmp = newSVpvn_utf8( "", 0, UTF ); \ 1258 av_push( trie_words, tmp ); \ 1259 }); \ 1260 \ 1261 curword++; \ 1262 \ 1263 if ( noper_next < tail ) { \ 1264 if (!trie->jump) \ 1265 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \ 1266 trie->jump[curword] = (U16)(noper_next - convert); \ 1267 if (!jumper) \ 1268 jumper = noper_next; \ 1269 if (!nextbranch) \ 1270 nextbranch= regnext(cur); \ 1271 } \ 1272 \ 1273 if ( dupe ) { \ 1274 /* So it's a dupe. This means we need to maintain a */\ 1275 /* linked-list from the first to the next. */\ 1276 /* we only allocate the nextword buffer when there */\ 1277 /* a dupe, so first time we have to do the allocation */\ 1278 if (!trie->nextword) \ 1279 trie->nextword = (U16 *) \ 1280 PerlMemShared_calloc( word_count + 1, sizeof(U16)); \ 1281 while ( trie->nextword[dupe] ) \ 1282 dupe= trie->nextword[dupe]; \ 1283 trie->nextword[dupe]= curword; \ 1284 } else { \ 1285 /* we haven't inserted this word yet. */ \ 1286 trie->states[ state ].wordnum = curword; \ 1287 } \ 1288 } STMT_END 1289 1290 1291 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \ 1292 ( ( base + charid >= ucharcount \ 1293 && base + charid < ubound \ 1294 && state == trie->trans[ base - ucharcount + charid ].check \ 1295 && trie->trans[ base - ucharcount + charid ].next ) \ 1296 ? trie->trans[ base - ucharcount + charid ].next \ 1297 : ( state==1 ? special : 0 ) \ 1298 ) 1299 1300 #define MADE_TRIE 1 1301 #define MADE_JUMP_TRIE 2 1302 #define MADE_EXACT_TRIE 4 1303 1304 STATIC I32 1305 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth) 1306 { 1307 dVAR; 1308 /* first pass, loop through and scan words */ 1309 reg_trie_data *trie; 1310 HV *widecharmap = NULL; 1311 AV *revcharmap = newAV(); 1312 regnode *cur; 1313 const U32 uniflags = UTF8_ALLOW_DEFAULT; 1314 STRLEN len = 0; 1315 UV uvc = 0; 1316 U16 curword = 0; 1317 U32 next_alloc = 0; 1318 regnode *jumper = NULL; 1319 regnode *nextbranch = NULL; 1320 regnode *convert = NULL; 1321 /* we just use folder as a flag in utf8 */ 1322 const U8 * const folder = ( flags == EXACTF 1323 ? PL_fold 1324 : ( flags == EXACTFL 1325 ? PL_fold_locale 1326 : NULL 1327 ) 1328 ); 1329 1330 #ifdef DEBUGGING 1331 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" ); 1332 AV *trie_words = NULL; 1333 /* along with revcharmap, this only used during construction but both are 1334 * useful during debugging so we store them in the struct when debugging. 1335 */ 1336 #else 1337 const U32 data_slot = add_data( pRExC_state, 2, "tu" ); 1338 STRLEN trie_charcount=0; 1339 #endif 1340 SV *re_trie_maxbuff; 1341 GET_RE_DEBUG_FLAGS_DECL; 1342 1343 PERL_ARGS_ASSERT_MAKE_TRIE; 1344 #ifndef DEBUGGING 1345 PERL_UNUSED_ARG(depth); 1346 #endif 1347 1348 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) ); 1349 trie->refcount = 1; 1350 trie->startstate = 1; 1351 trie->wordcount = word_count; 1352 RExC_rxi->data->data[ data_slot ] = (void*)trie; 1353 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) ); 1354 if (!(UTF && folder)) 1355 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 ); 1356 DEBUG_r({ 1357 trie_words = newAV(); 1358 }); 1359 1360 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1); 1361 if (!SvIOK(re_trie_maxbuff)) { 1362 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT); 1363 } 1364 DEBUG_OPTIMISE_r({ 1365 PerlIO_printf( Perl_debug_log, 1366 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n", 1367 (int)depth * 2 + 2, "", 1368 REG_NODE_NUM(startbranch),REG_NODE_NUM(first), 1369 REG_NODE_NUM(last), REG_NODE_NUM(tail), 1370 (int)depth); 1371 }); 1372 1373 /* Find the node we are going to overwrite */ 1374 if ( first == startbranch && OP( last ) != BRANCH ) { 1375 /* whole branch chain */ 1376 convert = first; 1377 } else { 1378 /* branch sub-chain */ 1379 convert = NEXTOPER( first ); 1380 } 1381 1382 /* -- First loop and Setup -- 1383 1384 We first traverse the branches and scan each word to determine if it 1385 contains widechars, and how many unique chars there are, this is 1386 important as we have to build a table with at least as many columns as we 1387 have unique chars. 1388 1389 We use an array of integers to represent the character codes 0..255 1390 (trie->charmap) and we use a an HV* to store Unicode characters. We use the 1391 native representation of the character value as the key and IV's for the 1392 coded index. 1393 1394 *TODO* If we keep track of how many times each character is used we can 1395 remap the columns so that the table compression later on is more 1396 efficient in terms of memory by ensuring most common value is in the 1397 middle and the least common are on the outside. IMO this would be better 1398 than a most to least common mapping as theres a decent chance the most 1399 common letter will share a node with the least common, meaning the node 1400 will not be compressable. With a middle is most common approach the worst 1401 case is when we have the least common nodes twice. 1402 1403 */ 1404 1405 for ( cur = first ; cur < last ; cur = regnext( cur ) ) { 1406 regnode * const noper = NEXTOPER( cur ); 1407 const U8 *uc = (U8*)STRING( noper ); 1408 const U8 * const e = uc + STR_LEN( noper ); 1409 STRLEN foldlen = 0; 1410 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; 1411 const U8 *scan = (U8*)NULL; 1412 U32 wordlen = 0; /* required init */ 1413 STRLEN chars = 0; 1414 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/ 1415 1416 if (OP(noper) == NOTHING) { 1417 trie->minlen= 0; 1418 continue; 1419 } 1420 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */ 1421 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte 1422 regardless of encoding */ 1423 1424 for ( ; uc < e ; uc += len ) { 1425 TRIE_CHARCOUNT(trie)++; 1426 TRIE_READ_CHAR; 1427 chars++; 1428 if ( uvc < 256 ) { 1429 if ( !trie->charmap[ uvc ] ) { 1430 trie->charmap[ uvc ]=( ++trie->uniquecharcount ); 1431 if ( folder ) 1432 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ]; 1433 TRIE_STORE_REVCHAR; 1434 } 1435 if ( set_bit ) { 1436 /* store the codepoint in the bitmap, and if its ascii 1437 also store its folded equivelent. */ 1438 TRIE_BITMAP_SET(trie,uvc); 1439 1440 /* store the folded codepoint */ 1441 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]); 1442 1443 if ( !UTF ) { 1444 /* store first byte of utf8 representation of 1445 codepoints in the 127 < uvc < 256 range */ 1446 if (127 < uvc && uvc < 192) { 1447 TRIE_BITMAP_SET(trie,194); 1448 } else if (191 < uvc ) { 1449 TRIE_BITMAP_SET(trie,195); 1450 /* && uvc < 256 -- we know uvc is < 256 already */ 1451 } 1452 } 1453 set_bit = 0; /* We've done our bit :-) */ 1454 } 1455 } else { 1456 SV** svpp; 1457 if ( !widecharmap ) 1458 widecharmap = newHV(); 1459 1460 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 ); 1461 1462 if ( !svpp ) 1463 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc ); 1464 1465 if ( !SvTRUE( *svpp ) ) { 1466 sv_setiv( *svpp, ++trie->uniquecharcount ); 1467 TRIE_STORE_REVCHAR; 1468 } 1469 } 1470 } 1471 if( cur == first ) { 1472 trie->minlen=chars; 1473 trie->maxlen=chars; 1474 } else if (chars < trie->minlen) { 1475 trie->minlen=chars; 1476 } else if (chars > trie->maxlen) { 1477 trie->maxlen=chars; 1478 } 1479 1480 } /* end first pass */ 1481 DEBUG_TRIE_COMPILE_r( 1482 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n", 1483 (int)depth * 2 + 2,"", 1484 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count, 1485 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount, 1486 (int)trie->minlen, (int)trie->maxlen ) 1487 ); 1488 trie->wordlen = (U32 *) PerlMemShared_calloc( word_count, sizeof(U32) ); 1489 1490 /* 1491 We now know what we are dealing with in terms of unique chars and 1492 string sizes so we can calculate how much memory a naive 1493 representation using a flat table will take. If it's over a reasonable 1494 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory 1495 conservative but potentially much slower representation using an array 1496 of lists. 1497 1498 At the end we convert both representations into the same compressed 1499 form that will be used in regexec.c for matching with. The latter 1500 is a form that cannot be used to construct with but has memory 1501 properties similar to the list form and access properties similar 1502 to the table form making it both suitable for fast searches and 1503 small enough that its feasable to store for the duration of a program. 1504 1505 See the comment in the code where the compressed table is produced 1506 inplace from the flat tabe representation for an explanation of how 1507 the compression works. 1508 1509 */ 1510 1511 1512 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) { 1513 /* 1514 Second Pass -- Array Of Lists Representation 1515 1516 Each state will be represented by a list of charid:state records 1517 (reg_trie_trans_le) the first such element holds the CUR and LEN 1518 points of the allocated array. (See defines above). 1519 1520 We build the initial structure using the lists, and then convert 1521 it into the compressed table form which allows faster lookups 1522 (but cant be modified once converted). 1523 */ 1524 1525 STRLEN transcount = 1; 1526 1527 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log, 1528 "%*sCompiling trie using list compiler\n", 1529 (int)depth * 2 + 2, "")); 1530 1531 trie->states = (reg_trie_state *) 1532 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2, 1533 sizeof(reg_trie_state) ); 1534 TRIE_LIST_NEW(1); 1535 next_alloc = 2; 1536 1537 for ( cur = first ; cur < last ; cur = regnext( cur ) ) { 1538 1539 regnode * const noper = NEXTOPER( cur ); 1540 U8 *uc = (U8*)STRING( noper ); 1541 const U8 * const e = uc + STR_LEN( noper ); 1542 U32 state = 1; /* required init */ 1543 U16 charid = 0; /* sanity init */ 1544 U8 *scan = (U8*)NULL; /* sanity init */ 1545 STRLEN foldlen = 0; /* required init */ 1546 U32 wordlen = 0; /* required init */ 1547 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; 1548 1549 if (OP(noper) != NOTHING) { 1550 for ( ; uc < e ; uc += len ) { 1551 1552 TRIE_READ_CHAR; 1553 1554 if ( uvc < 256 ) { 1555 charid = trie->charmap[ uvc ]; 1556 } else { 1557 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0); 1558 if ( !svpp ) { 1559 charid = 0; 1560 } else { 1561 charid=(U16)SvIV( *svpp ); 1562 } 1563 } 1564 /* charid is now 0 if we dont know the char read, or nonzero if we do */ 1565 if ( charid ) { 1566 1567 U16 check; 1568 U32 newstate = 0; 1569 1570 charid--; 1571 if ( !trie->states[ state ].trans.list ) { 1572 TRIE_LIST_NEW( state ); 1573 } 1574 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) { 1575 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) { 1576 newstate = TRIE_LIST_ITEM( state, check ).newstate; 1577 break; 1578 } 1579 } 1580 if ( ! newstate ) { 1581 newstate = next_alloc++; 1582 TRIE_LIST_PUSH( state, charid, newstate ); 1583 transcount++; 1584 } 1585 state = newstate; 1586 } else { 1587 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc ); 1588 } 1589 } 1590 } 1591 TRIE_HANDLE_WORD(state); 1592 1593 } /* end second pass */ 1594 1595 /* next alloc is the NEXT state to be allocated */ 1596 trie->statecount = next_alloc; 1597 trie->states = (reg_trie_state *) 1598 PerlMemShared_realloc( trie->states, 1599 next_alloc 1600 * sizeof(reg_trie_state) ); 1601 1602 /* and now dump it out before we compress it */ 1603 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap, 1604 revcharmap, next_alloc, 1605 depth+1) 1606 ); 1607 1608 trie->trans = (reg_trie_trans *) 1609 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) ); 1610 { 1611 U32 state; 1612 U32 tp = 0; 1613 U32 zp = 0; 1614 1615 1616 for( state=1 ; state < next_alloc ; state ++ ) { 1617 U32 base=0; 1618 1619 /* 1620 DEBUG_TRIE_COMPILE_MORE_r( 1621 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp) 1622 ); 1623 */ 1624 1625 if (trie->states[state].trans.list) { 1626 U16 minid=TRIE_LIST_ITEM( state, 1).forid; 1627 U16 maxid=minid; 1628 U16 idx; 1629 1630 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) { 1631 const U16 forid = TRIE_LIST_ITEM( state, idx).forid; 1632 if ( forid < minid ) { 1633 minid=forid; 1634 } else if ( forid > maxid ) { 1635 maxid=forid; 1636 } 1637 } 1638 if ( transcount < tp + maxid - minid + 1) { 1639 transcount *= 2; 1640 trie->trans = (reg_trie_trans *) 1641 PerlMemShared_realloc( trie->trans, 1642 transcount 1643 * sizeof(reg_trie_trans) ); 1644 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans ); 1645 } 1646 base = trie->uniquecharcount + tp - minid; 1647 if ( maxid == minid ) { 1648 U32 set = 0; 1649 for ( ; zp < tp ; zp++ ) { 1650 if ( ! trie->trans[ zp ].next ) { 1651 base = trie->uniquecharcount + zp - minid; 1652 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate; 1653 trie->trans[ zp ].check = state; 1654 set = 1; 1655 break; 1656 } 1657 } 1658 if ( !set ) { 1659 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate; 1660 trie->trans[ tp ].check = state; 1661 tp++; 1662 zp = tp; 1663 } 1664 } else { 1665 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) { 1666 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid; 1667 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate; 1668 trie->trans[ tid ].check = state; 1669 } 1670 tp += ( maxid - minid + 1 ); 1671 } 1672 Safefree(trie->states[ state ].trans.list); 1673 } 1674 /* 1675 DEBUG_TRIE_COMPILE_MORE_r( 1676 PerlIO_printf( Perl_debug_log, " base: %d\n",base); 1677 ); 1678 */ 1679 trie->states[ state ].trans.base=base; 1680 } 1681 trie->lasttrans = tp + 1; 1682 } 1683 } else { 1684 /* 1685 Second Pass -- Flat Table Representation. 1686 1687 we dont use the 0 slot of either trans[] or states[] so we add 1 to each. 1688 We know that we will need Charcount+1 trans at most to store the data 1689 (one row per char at worst case) So we preallocate both structures 1690 assuming worst case. 1691 1692 We then construct the trie using only the .next slots of the entry 1693 structs. 1694 1695 We use the .check field of the first entry of the node temporarily to 1696 make compression both faster and easier by keeping track of how many non 1697 zero fields are in the node. 1698 1699 Since trans are numbered from 1 any 0 pointer in the table is a FAIL 1700 transition. 1701 1702 There are two terms at use here: state as a TRIE_NODEIDX() which is a 1703 number representing the first entry of the node, and state as a 1704 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and 1705 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there 1706 are 2 entrys per node. eg: 1707 1708 A B A B 1709 1. 2 4 1. 3 7 1710 2. 0 3 3. 0 5 1711 3. 0 0 5. 0 0 1712 4. 0 0 7. 0 0 1713 1714 The table is internally in the right hand, idx form. However as we also 1715 have to deal with the states array which is indexed by nodenum we have to 1716 use TRIE_NODENUM() to convert. 1717 1718 */ 1719 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log, 1720 "%*sCompiling trie using table compiler\n", 1721 (int)depth * 2 + 2, "")); 1722 1723 trie->trans = (reg_trie_trans *) 1724 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 ) 1725 * trie->uniquecharcount + 1, 1726 sizeof(reg_trie_trans) ); 1727 trie->states = (reg_trie_state *) 1728 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2, 1729 sizeof(reg_trie_state) ); 1730 next_alloc = trie->uniquecharcount + 1; 1731 1732 1733 for ( cur = first ; cur < last ; cur = regnext( cur ) ) { 1734 1735 regnode * const noper = NEXTOPER( cur ); 1736 const U8 *uc = (U8*)STRING( noper ); 1737 const U8 * const e = uc + STR_LEN( noper ); 1738 1739 U32 state = 1; /* required init */ 1740 1741 U16 charid = 0; /* sanity init */ 1742 U32 accept_state = 0; /* sanity init */ 1743 U8 *scan = (U8*)NULL; /* sanity init */ 1744 1745 STRLEN foldlen = 0; /* required init */ 1746 U32 wordlen = 0; /* required init */ 1747 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; 1748 1749 if ( OP(noper) != NOTHING ) { 1750 for ( ; uc < e ; uc += len ) { 1751 1752 TRIE_READ_CHAR; 1753 1754 if ( uvc < 256 ) { 1755 charid = trie->charmap[ uvc ]; 1756 } else { 1757 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0); 1758 charid = svpp ? (U16)SvIV(*svpp) : 0; 1759 } 1760 if ( charid ) { 1761 charid--; 1762 if ( !trie->trans[ state + charid ].next ) { 1763 trie->trans[ state + charid ].next = next_alloc; 1764 trie->trans[ state ].check++; 1765 next_alloc += trie->uniquecharcount; 1766 } 1767 state = trie->trans[ state + charid ].next; 1768 } else { 1769 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc ); 1770 } 1771 /* charid is now 0 if we dont know the char read, or nonzero if we do */ 1772 } 1773 } 1774 accept_state = TRIE_NODENUM( state ); 1775 TRIE_HANDLE_WORD(accept_state); 1776 1777 } /* end second pass */ 1778 1779 /* and now dump it out before we compress it */ 1780 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap, 1781 revcharmap, 1782 next_alloc, depth+1)); 1783 1784 { 1785 /* 1786 * Inplace compress the table.* 1787 1788 For sparse data sets the table constructed by the trie algorithm will 1789 be mostly 0/FAIL transitions or to put it another way mostly empty. 1790 (Note that leaf nodes will not contain any transitions.) 1791 1792 This algorithm compresses the tables by eliminating most such 1793 transitions, at the cost of a modest bit of extra work during lookup: 1794 1795 - Each states[] entry contains a .base field which indicates the 1796 index in the state[] array wheres its transition data is stored. 1797 1798 - If .base is 0 there are no valid transitions from that node. 1799 1800 - If .base is nonzero then charid is added to it to find an entry in 1801 the trans array. 1802 1803 -If trans[states[state].base+charid].check!=state then the 1804 transition is taken to be a 0/Fail transition. Thus if there are fail 1805 transitions at the front of the node then the .base offset will point 1806 somewhere inside the previous nodes data (or maybe even into a node 1807 even earlier), but the .check field determines if the transition is 1808 valid. 1809 1810 XXX - wrong maybe? 1811 The following process inplace converts the table to the compressed 1812 table: We first do not compress the root node 1,and mark its all its 1813 .check pointers as 1 and set its .base pointer as 1 as well. This 1814 allows to do a DFA construction from the compressed table later, and 1815 ensures that any .base pointers we calculate later are greater than 1816 0. 1817 1818 - We set 'pos' to indicate the first entry of the second node. 1819 1820 - We then iterate over the columns of the node, finding the first and 1821 last used entry at l and m. We then copy l..m into pos..(pos+m-l), 1822 and set the .check pointers accordingly, and advance pos 1823 appropriately and repreat for the next node. Note that when we copy 1824 the next pointers we have to convert them from the original 1825 NODEIDX form to NODENUM form as the former is not valid post 1826 compression. 1827 1828 - If a node has no transitions used we mark its base as 0 and do not 1829 advance the pos pointer. 1830 1831 - If a node only has one transition we use a second pointer into the 1832 structure to fill in allocated fail transitions from other states. 1833 This pointer is independent of the main pointer and scans forward 1834 looking for null transitions that are allocated to a state. When it 1835 finds one it writes the single transition into the "hole". If the 1836 pointer doesnt find one the single transition is appended as normal. 1837 1838 - Once compressed we can Renew/realloc the structures to release the 1839 excess space. 1840 1841 See "Table-Compression Methods" in sec 3.9 of the Red Dragon, 1842 specifically Fig 3.47 and the associated pseudocode. 1843 1844 demq 1845 */ 1846 const U32 laststate = TRIE_NODENUM( next_alloc ); 1847 U32 state, charid; 1848 U32 pos = 0, zp=0; 1849 trie->statecount = laststate; 1850 1851 for ( state = 1 ; state < laststate ; state++ ) { 1852 U8 flag = 0; 1853 const U32 stateidx = TRIE_NODEIDX( state ); 1854 const U32 o_used = trie->trans[ stateidx ].check; 1855 U32 used = trie->trans[ stateidx ].check; 1856 trie->trans[ stateidx ].check = 0; 1857 1858 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) { 1859 if ( flag || trie->trans[ stateidx + charid ].next ) { 1860 if ( trie->trans[ stateidx + charid ].next ) { 1861 if (o_used == 1) { 1862 for ( ; zp < pos ; zp++ ) { 1863 if ( ! trie->trans[ zp ].next ) { 1864 break; 1865 } 1866 } 1867 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ; 1868 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next ); 1869 trie->trans[ zp ].check = state; 1870 if ( ++zp > pos ) pos = zp; 1871 break; 1872 } 1873 used--; 1874 } 1875 if ( !flag ) { 1876 flag = 1; 1877 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ; 1878 } 1879 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next ); 1880 trie->trans[ pos ].check = state; 1881 pos++; 1882 } 1883 } 1884 } 1885 trie->lasttrans = pos + 1; 1886 trie->states = (reg_trie_state *) 1887 PerlMemShared_realloc( trie->states, laststate 1888 * sizeof(reg_trie_state) ); 1889 DEBUG_TRIE_COMPILE_MORE_r( 1890 PerlIO_printf( Perl_debug_log, 1891 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n", 1892 (int)depth * 2 + 2,"", 1893 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ), 1894 (IV)next_alloc, 1895 (IV)pos, 1896 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc ); 1897 ); 1898 1899 } /* end table compress */ 1900 } 1901 DEBUG_TRIE_COMPILE_MORE_r( 1902 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n", 1903 (int)depth * 2 + 2, "", 1904 (UV)trie->statecount, 1905 (UV)trie->lasttrans) 1906 ); 1907 /* resize the trans array to remove unused space */ 1908 trie->trans = (reg_trie_trans *) 1909 PerlMemShared_realloc( trie->trans, trie->lasttrans 1910 * sizeof(reg_trie_trans) ); 1911 1912 /* and now dump out the compressed format */ 1913 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1)); 1914 1915 { /* Modify the program and insert the new TRIE node*/ 1916 U8 nodetype =(U8)(flags & 0xFF); 1917 char *str=NULL; 1918 1919 #ifdef DEBUGGING 1920 regnode *optimize = NULL; 1921 #ifdef RE_TRACK_PATTERN_OFFSETS 1922 1923 U32 mjd_offset = 0; 1924 U32 mjd_nodelen = 0; 1925 #endif /* RE_TRACK_PATTERN_OFFSETS */ 1926 #endif /* DEBUGGING */ 1927 /* 1928 This means we convert either the first branch or the first Exact, 1929 depending on whether the thing following (in 'last') is a branch 1930 or not and whther first is the startbranch (ie is it a sub part of 1931 the alternation or is it the whole thing.) 1932 Assuming its a sub part we conver the EXACT otherwise we convert 1933 the whole branch sequence, including the first. 1934 */ 1935 /* Find the node we are going to overwrite */ 1936 if ( first != startbranch || OP( last ) == BRANCH ) { 1937 /* branch sub-chain */ 1938 NEXT_OFF( first ) = (U16)(last - first); 1939 #ifdef RE_TRACK_PATTERN_OFFSETS 1940 DEBUG_r({ 1941 mjd_offset= Node_Offset((convert)); 1942 mjd_nodelen= Node_Length((convert)); 1943 }); 1944 #endif 1945 /* whole branch chain */ 1946 } 1947 #ifdef RE_TRACK_PATTERN_OFFSETS 1948 else { 1949 DEBUG_r({ 1950 const regnode *nop = NEXTOPER( convert ); 1951 mjd_offset= Node_Offset((nop)); 1952 mjd_nodelen= Node_Length((nop)); 1953 }); 1954 } 1955 DEBUG_OPTIMISE_r( 1956 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n", 1957 (int)depth * 2 + 2, "", 1958 (UV)mjd_offset, (UV)mjd_nodelen) 1959 ); 1960 #endif 1961 /* But first we check to see if there is a common prefix we can 1962 split out as an EXACT and put in front of the TRIE node. */ 1963 trie->startstate= 1; 1964 if ( trie->bitmap && !widecharmap && !trie->jump ) { 1965 U32 state; 1966 for ( state = 1 ; state < trie->statecount-1 ; state++ ) { 1967 U32 ofs = 0; 1968 I32 idx = -1; 1969 U32 count = 0; 1970 const U32 base = trie->states[ state ].trans.base; 1971 1972 if ( trie->states[state].wordnum ) 1973 count = 1; 1974 1975 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) { 1976 if ( ( base + ofs >= trie->uniquecharcount ) && 1977 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) && 1978 trie->trans[ base + ofs - trie->uniquecharcount ].check == state ) 1979 { 1980 if ( ++count > 1 ) { 1981 SV **tmp = av_fetch( revcharmap, ofs, 0); 1982 const U8 *ch = (U8*)SvPV_nolen_const( *tmp ); 1983 if ( state == 1 ) break; 1984 if ( count == 2 ) { 1985 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char); 1986 DEBUG_OPTIMISE_r( 1987 PerlIO_printf(Perl_debug_log, 1988 "%*sNew Start State=%"UVuf" Class: [", 1989 (int)depth * 2 + 2, "", 1990 (UV)state)); 1991 if (idx >= 0) { 1992 SV ** const tmp = av_fetch( revcharmap, idx, 0); 1993 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp ); 1994 1995 TRIE_BITMAP_SET(trie,*ch); 1996 if ( folder ) 1997 TRIE_BITMAP_SET(trie, folder[ *ch ]); 1998 DEBUG_OPTIMISE_r( 1999 PerlIO_printf(Perl_debug_log, "%s", (char*)ch) 2000 ); 2001 } 2002 } 2003 TRIE_BITMAP_SET(trie,*ch); 2004 if ( folder ) 2005 TRIE_BITMAP_SET(trie,folder[ *ch ]); 2006 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch)); 2007 } 2008 idx = ofs; 2009 } 2010 } 2011 if ( count == 1 ) { 2012 SV **tmp = av_fetch( revcharmap, idx, 0); 2013 STRLEN len; 2014 char *ch = SvPV( *tmp, len ); 2015 DEBUG_OPTIMISE_r({ 2016 SV *sv=sv_newmortal(); 2017 PerlIO_printf( Perl_debug_log, 2018 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n", 2019 (int)depth * 2 + 2, "", 2020 (UV)state, (UV)idx, 2021 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6, 2022 PL_colors[0], PL_colors[1], 2023 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | 2024 PERL_PV_ESCAPE_FIRSTCHAR 2025 ) 2026 ); 2027 }); 2028 if ( state==1 ) { 2029 OP( convert ) = nodetype; 2030 str=STRING(convert); 2031 STR_LEN(convert)=0; 2032 } 2033 STR_LEN(convert) += len; 2034 while (len--) 2035 *str++ = *ch++; 2036 } else { 2037 #ifdef DEBUGGING 2038 if (state>1) 2039 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n")); 2040 #endif 2041 break; 2042 } 2043 } 2044 if (str) { 2045 regnode *n = convert+NODE_SZ_STR(convert); 2046 NEXT_OFF(convert) = NODE_SZ_STR(convert); 2047 trie->startstate = state; 2048 trie->minlen -= (state - 1); 2049 trie->maxlen -= (state - 1); 2050 #ifdef DEBUGGING 2051 /* At least the UNICOS C compiler choked on this 2052 * being argument to DEBUG_r(), so let's just have 2053 * it right here. */ 2054 if ( 2055 #ifdef PERL_EXT_RE_BUILD 2056 1 2057 #else 2058 DEBUG_r_TEST 2059 #endif 2060 ) { 2061 regnode *fix = convert; 2062 U32 word = trie->wordcount; 2063 mjd_nodelen++; 2064 Set_Node_Offset_Length(convert, mjd_offset, state - 1); 2065 while( ++fix < n ) { 2066 Set_Node_Offset_Length(fix, 0, 0); 2067 } 2068 while (word--) { 2069 SV ** const tmp = av_fetch( trie_words, word, 0 ); 2070 if (tmp) { 2071 if ( STR_LEN(convert) <= SvCUR(*tmp) ) 2072 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert)); 2073 else 2074 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp)); 2075 } 2076 } 2077 } 2078 #endif 2079 if (trie->maxlen) { 2080 convert = n; 2081 } else { 2082 NEXT_OFF(convert) = (U16)(tail - convert); 2083 DEBUG_r(optimize= n); 2084 } 2085 } 2086 } 2087 if (!jumper) 2088 jumper = last; 2089 if ( trie->maxlen ) { 2090 NEXT_OFF( convert ) = (U16)(tail - convert); 2091 ARG_SET( convert, data_slot ); 2092 /* Store the offset to the first unabsorbed branch in 2093 jump[0], which is otherwise unused by the jump logic. 2094 We use this when dumping a trie and during optimisation. */ 2095 if (trie->jump) 2096 trie->jump[0] = (U16)(nextbranch - convert); 2097 2098 /* XXXX */ 2099 if ( !trie->states[trie->startstate].wordnum && trie->bitmap && 2100 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) ) 2101 { 2102 OP( convert ) = TRIEC; 2103 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char); 2104 PerlMemShared_free(trie->bitmap); 2105 trie->bitmap= NULL; 2106 } else 2107 OP( convert ) = TRIE; 2108 2109 /* store the type in the flags */ 2110 convert->flags = nodetype; 2111 DEBUG_r({ 2112 optimize = convert 2113 + NODE_STEP_REGNODE 2114 + regarglen[ OP( convert ) ]; 2115 }); 2116 /* XXX We really should free up the resource in trie now, 2117 as we won't use them - (which resources?) dmq */ 2118 } 2119 /* needed for dumping*/ 2120 DEBUG_r(if (optimize) { 2121 regnode *opt = convert; 2122 2123 while ( ++opt < optimize) { 2124 Set_Node_Offset_Length(opt,0,0); 2125 } 2126 /* 2127 Try to clean up some of the debris left after the 2128 optimisation. 2129 */ 2130 while( optimize < jumper ) { 2131 mjd_nodelen += Node_Length((optimize)); 2132 OP( optimize ) = OPTIMIZED; 2133 Set_Node_Offset_Length(optimize,0,0); 2134 optimize++; 2135 } 2136 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen); 2137 }); 2138 } /* end node insert */ 2139 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap; 2140 #ifdef DEBUGGING 2141 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words; 2142 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap; 2143 #else 2144 SvREFCNT_dec(revcharmap); 2145 #endif 2146 return trie->jump 2147 ? MADE_JUMP_TRIE 2148 : trie->startstate>1 2149 ? MADE_EXACT_TRIE 2150 : MADE_TRIE; 2151 } 2152 2153 STATIC void 2154 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth) 2155 { 2156 /* The Trie is constructed and compressed now so we can build a fail array now if its needed 2157 2158 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the 2159 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88 2160 ISBN 0-201-10088-6 2161 2162 We find the fail state for each state in the trie, this state is the longest proper 2163 suffix of the current states 'word' that is also a proper prefix of another word in our 2164 trie. State 1 represents the word '' and is the thus the default fail state. This allows 2165 the DFA not to have to restart after its tried and failed a word at a given point, it 2166 simply continues as though it had been matching the other word in the first place. 2167 Consider 2168 'abcdgu'=~/abcdefg|cdgu/ 2169 When we get to 'd' we are still matching the first word, we would encounter 'g' which would 2170 fail, which would bring use to the state representing 'd' in the second word where we would 2171 try 'g' and succeed, prodceding to match 'cdgu'. 2172 */ 2173 /* add a fail transition */ 2174 const U32 trie_offset = ARG(source); 2175 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset]; 2176 U32 *q; 2177 const U32 ucharcount = trie->uniquecharcount; 2178 const U32 numstates = trie->statecount; 2179 const U32 ubound = trie->lasttrans + ucharcount; 2180 U32 q_read = 0; 2181 U32 q_write = 0; 2182 U32 charid; 2183 U32 base = trie->states[ 1 ].trans.base; 2184 U32 *fail; 2185 reg_ac_data *aho; 2186 const U32 data_slot = add_data( pRExC_state, 1, "T" ); 2187 GET_RE_DEBUG_FLAGS_DECL; 2188 2189 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE; 2190 #ifndef DEBUGGING 2191 PERL_UNUSED_ARG(depth); 2192 #endif 2193 2194 2195 ARG_SET( stclass, data_slot ); 2196 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) ); 2197 RExC_rxi->data->data[ data_slot ] = (void*)aho; 2198 aho->trie=trie_offset; 2199 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) ); 2200 Copy( trie->states, aho->states, numstates, reg_trie_state ); 2201 Newxz( q, numstates, U32); 2202 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) ); 2203 aho->refcount = 1; 2204 fail = aho->fail; 2205 /* initialize fail[0..1] to be 1 so that we always have 2206 a valid final fail state */ 2207 fail[ 0 ] = fail[ 1 ] = 1; 2208 2209 for ( charid = 0; charid < ucharcount ; charid++ ) { 2210 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 ); 2211 if ( newstate ) { 2212 q[ q_write ] = newstate; 2213 /* set to point at the root */ 2214 fail[ q[ q_write++ ] ]=1; 2215 } 2216 } 2217 while ( q_read < q_write) { 2218 const U32 cur = q[ q_read++ % numstates ]; 2219 base = trie->states[ cur ].trans.base; 2220 2221 for ( charid = 0 ; charid < ucharcount ; charid++ ) { 2222 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 ); 2223 if (ch_state) { 2224 U32 fail_state = cur; 2225 U32 fail_base; 2226 do { 2227 fail_state = fail[ fail_state ]; 2228 fail_base = aho->states[ fail_state ].trans.base; 2229 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) ); 2230 2231 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ); 2232 fail[ ch_state ] = fail_state; 2233 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum ) 2234 { 2235 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum; 2236 } 2237 q[ q_write++ % numstates] = ch_state; 2238 } 2239 } 2240 } 2241 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop 2242 when we fail in state 1, this allows us to use the 2243 charclass scan to find a valid start char. This is based on the principle 2244 that theres a good chance the string being searched contains lots of stuff 2245 that cant be a start char. 2246 */ 2247 fail[ 0 ] = fail[ 1 ] = 0; 2248 DEBUG_TRIE_COMPILE_r({ 2249 PerlIO_printf(Perl_debug_log, 2250 "%*sStclass Failtable (%"UVuf" states): 0", 2251 (int)(depth * 2), "", (UV)numstates 2252 ); 2253 for( q_read=1; q_read<numstates; q_read++ ) { 2254 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]); 2255 } 2256 PerlIO_printf(Perl_debug_log, "\n"); 2257 }); 2258 Safefree(q); 2259 /*RExC_seen |= REG_SEEN_TRIEDFA;*/ 2260 } 2261 2262 2263 /* 2264 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2. 2265 * These need to be revisited when a newer toolchain becomes available. 2266 */ 2267 #if defined(__sparc64__) && defined(__GNUC__) 2268 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96) 2269 # undef SPARC64_GCC_WORKAROUND 2270 # define SPARC64_GCC_WORKAROUND 1 2271 # endif 2272 #endif 2273 2274 #define DEBUG_PEEP(str,scan,depth) \ 2275 DEBUG_OPTIMISE_r({if (scan){ \ 2276 SV * const mysv=sv_newmortal(); \ 2277 regnode *Next = regnext(scan); \ 2278 regprop(RExC_rx, mysv, scan); \ 2279 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \ 2280 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\ 2281 Next ? (REG_NODE_NUM(Next)) : 0 ); \ 2282 }}); 2283 2284 2285 2286 2287 2288 #define JOIN_EXACT(scan,min,flags) \ 2289 if (PL_regkind[OP(scan)] == EXACT) \ 2290 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1) 2291 2292 STATIC U32 2293 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) { 2294 /* Merge several consecutive EXACTish nodes into one. */ 2295 regnode *n = regnext(scan); 2296 U32 stringok = 1; 2297 regnode *next = scan + NODE_SZ_STR(scan); 2298 U32 merged = 0; 2299 U32 stopnow = 0; 2300 #ifdef DEBUGGING 2301 regnode *stop = scan; 2302 GET_RE_DEBUG_FLAGS_DECL; 2303 #else 2304 PERL_UNUSED_ARG(depth); 2305 #endif 2306 2307 PERL_ARGS_ASSERT_JOIN_EXACT; 2308 #ifndef EXPERIMENTAL_INPLACESCAN 2309 PERL_UNUSED_ARG(flags); 2310 PERL_UNUSED_ARG(val); 2311 #endif 2312 DEBUG_PEEP("join",scan,depth); 2313 2314 /* Skip NOTHING, merge EXACT*. */ 2315 while (n && 2316 ( PL_regkind[OP(n)] == NOTHING || 2317 (stringok && (OP(n) == OP(scan)))) 2318 && NEXT_OFF(n) 2319 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) { 2320 2321 if (OP(n) == TAIL || n > next) 2322 stringok = 0; 2323 if (PL_regkind[OP(n)] == NOTHING) { 2324 DEBUG_PEEP("skip:",n,depth); 2325 NEXT_OFF(scan) += NEXT_OFF(n); 2326 next = n + NODE_STEP_REGNODE; 2327 #ifdef DEBUGGING 2328 if (stringok) 2329 stop = n; 2330 #endif 2331 n = regnext(n); 2332 } 2333 else if (stringok) { 2334 const unsigned int oldl = STR_LEN(scan); 2335 regnode * const nnext = regnext(n); 2336 2337 DEBUG_PEEP("merg",n,depth); 2338 2339 merged++; 2340 if (oldl + STR_LEN(n) > U8_MAX) 2341 break; 2342 NEXT_OFF(scan) += NEXT_OFF(n); 2343 STR_LEN(scan) += STR_LEN(n); 2344 next = n + NODE_SZ_STR(n); 2345 /* Now we can overwrite *n : */ 2346 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char); 2347 #ifdef DEBUGGING 2348 stop = next - 1; 2349 #endif 2350 n = nnext; 2351 if (stopnow) break; 2352 } 2353 2354 #ifdef EXPERIMENTAL_INPLACESCAN 2355 if (flags && !NEXT_OFF(n)) { 2356 DEBUG_PEEP("atch", val, depth); 2357 if (reg_off_by_arg[OP(n)]) { 2358 ARG_SET(n, val - n); 2359 } 2360 else { 2361 NEXT_OFF(n) = val - n; 2362 } 2363 stopnow = 1; 2364 } 2365 #endif 2366 } 2367 2368 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) { 2369 /* 2370 Two problematic code points in Unicode casefolding of EXACT nodes: 2371 2372 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS 2373 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS 2374 2375 which casefold to 2376 2377 Unicode UTF-8 2378 2379 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81 2380 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81 2381 2382 This means that in case-insensitive matching (or "loose matching", 2383 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte 2384 length of the above casefolded versions) can match a target string 2385 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0). 2386 This would rather mess up the minimum length computation. 2387 2388 What we'll do is to look for the tail four bytes, and then peek 2389 at the preceding two bytes to see whether we need to decrease 2390 the minimum length by four (six minus two). 2391 2392 Thanks to the design of UTF-8, there cannot be false matches: 2393 A sequence of valid UTF-8 bytes cannot be a subsequence of 2394 another valid sequence of UTF-8 bytes. 2395 2396 */ 2397 char * const s0 = STRING(scan), *s, *t; 2398 char * const s1 = s0 + STR_LEN(scan) - 1; 2399 char * const s2 = s1 - 4; 2400 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */ 2401 const char t0[] = "\xaf\x49\xaf\x42"; 2402 #else 2403 const char t0[] = "\xcc\x88\xcc\x81"; 2404 #endif 2405 const char * const t1 = t0 + 3; 2406 2407 for (s = s0 + 2; 2408 s < s2 && (t = ninstr(s, s1, t0, t1)); 2409 s = t + 4) { 2410 #ifdef EBCDIC 2411 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) || 2412 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5)) 2413 #else 2414 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) || 2415 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF)) 2416 #endif 2417 *min -= 4; 2418 } 2419 } 2420 2421 #ifdef DEBUGGING 2422 /* Allow dumping */ 2423 n = scan + NODE_SZ_STR(scan); 2424 while (n <= stop) { 2425 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) { 2426 OP(n) = OPTIMIZED; 2427 NEXT_OFF(n) = 0; 2428 } 2429 n++; 2430 } 2431 #endif 2432 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)}); 2433 return stopnow; 2434 } 2435 2436 /* REx optimizer. Converts nodes into quickier variants "in place". 2437 Finds fixed substrings. */ 2438 2439 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set 2440 to the position after last scanned or to NULL. */ 2441 2442 #define INIT_AND_WITHP \ 2443 assert(!and_withp); \ 2444 Newx(and_withp,1,struct regnode_charclass_class); \ 2445 SAVEFREEPV(and_withp) 2446 2447 /* this is a chain of data about sub patterns we are processing that 2448 need to be handled seperately/specially in study_chunk. Its so 2449 we can simulate recursion without losing state. */ 2450 struct scan_frame; 2451 typedef struct scan_frame { 2452 regnode *last; /* last node to process in this frame */ 2453 regnode *next; /* next node to process when last is reached */ 2454 struct scan_frame *prev; /*previous frame*/ 2455 I32 stop; /* what stopparen do we use */ 2456 } scan_frame; 2457 2458 2459 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf) 2460 2461 #define CASE_SYNST_FNC(nAmE) \ 2462 case nAmE: \ 2463 if (flags & SCF_DO_STCLASS_AND) { \ 2464 for (value = 0; value < 256; value++) \ 2465 if (!is_ ## nAmE ## _cp(value)) \ 2466 ANYOF_BITMAP_CLEAR(data->start_class, value); \ 2467 } \ 2468 else { \ 2469 for (value = 0; value < 256; value++) \ 2470 if (is_ ## nAmE ## _cp(value)) \ 2471 ANYOF_BITMAP_SET(data->start_class, value); \ 2472 } \ 2473 break; \ 2474 case N ## nAmE: \ 2475 if (flags & SCF_DO_STCLASS_AND) { \ 2476 for (value = 0; value < 256; value++) \ 2477 if (is_ ## nAmE ## _cp(value)) \ 2478 ANYOF_BITMAP_CLEAR(data->start_class, value); \ 2479 } \ 2480 else { \ 2481 for (value = 0; value < 256; value++) \ 2482 if (!is_ ## nAmE ## _cp(value)) \ 2483 ANYOF_BITMAP_SET(data->start_class, value); \ 2484 } \ 2485 break 2486 2487 2488 2489 STATIC I32 2490 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, 2491 I32 *minlenp, I32 *deltap, 2492 regnode *last, 2493 scan_data_t *data, 2494 I32 stopparen, 2495 U8* recursed, 2496 struct regnode_charclass_class *and_withp, 2497 U32 flags, U32 depth) 2498 /* scanp: Start here (read-write). */ 2499 /* deltap: Write maxlen-minlen here. */ 2500 /* last: Stop before this one. */ 2501 /* data: string data about the pattern */ 2502 /* stopparen: treat close N as END */ 2503 /* recursed: which subroutines have we recursed into */ 2504 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */ 2505 { 2506 dVAR; 2507 I32 min = 0, pars = 0, code; 2508 regnode *scan = *scanp, *next; 2509 I32 delta = 0; 2510 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF); 2511 int is_inf_internal = 0; /* The studied chunk is infinite */ 2512 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0; 2513 scan_data_t data_fake; 2514 SV *re_trie_maxbuff = NULL; 2515 regnode *first_non_open = scan; 2516 I32 stopmin = I32_MAX; 2517 scan_frame *frame = NULL; 2518 GET_RE_DEBUG_FLAGS_DECL; 2519 2520 PERL_ARGS_ASSERT_STUDY_CHUNK; 2521 2522 #ifdef DEBUGGING 2523 StructCopy(&zero_scan_data, &data_fake, scan_data_t); 2524 #endif 2525 2526 if ( depth == 0 ) { 2527 while (first_non_open && OP(first_non_open) == OPEN) 2528 first_non_open=regnext(first_non_open); 2529 } 2530 2531 2532 fake_study_recurse: 2533 while ( scan && OP(scan) != END && scan < last ){ 2534 /* Peephole optimizer: */ 2535 DEBUG_STUDYDATA("Peep:", data,depth); 2536 DEBUG_PEEP("Peep",scan,depth); 2537 JOIN_EXACT(scan,&min,0); 2538 2539 /* Follow the next-chain of the current node and optimize 2540 away all the NOTHINGs from it. */ 2541 if (OP(scan) != CURLYX) { 2542 const int max = (reg_off_by_arg[OP(scan)] 2543 ? I32_MAX 2544 /* I32 may be smaller than U16 on CRAYs! */ 2545 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX)); 2546 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan)); 2547 int noff; 2548 regnode *n = scan; 2549 2550 /* Skip NOTHING and LONGJMP. */ 2551 while ((n = regnext(n)) 2552 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n))) 2553 || ((OP(n) == LONGJMP) && (noff = ARG(n)))) 2554 && off + noff < max) 2555 off += noff; 2556 if (reg_off_by_arg[OP(scan)]) 2557 ARG(scan) = off; 2558 else 2559 NEXT_OFF(scan) = off; 2560 } 2561 2562 2563 2564 /* The principal pseudo-switch. Cannot be a switch, since we 2565 look into several different things. */ 2566 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ 2567 || OP(scan) == IFTHEN) { 2568 next = regnext(scan); 2569 code = OP(scan); 2570 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */ 2571 2572 if (OP(next) == code || code == IFTHEN) { 2573 /* NOTE - There is similar code to this block below for handling 2574 TRIE nodes on a re-study. If you change stuff here check there 2575 too. */ 2576 I32 max1 = 0, min1 = I32_MAX, num = 0; 2577 struct regnode_charclass_class accum; 2578 regnode * const startbranch=scan; 2579 2580 if (flags & SCF_DO_SUBSTR) 2581 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */ 2582 if (flags & SCF_DO_STCLASS) 2583 cl_init_zero(pRExC_state, &accum); 2584 2585 while (OP(scan) == code) { 2586 I32 deltanext, minnext, f = 0, fake; 2587 struct regnode_charclass_class this_class; 2588 2589 num++; 2590 data_fake.flags = 0; 2591 if (data) { 2592 data_fake.whilem_c = data->whilem_c; 2593 data_fake.last_closep = data->last_closep; 2594 } 2595 else 2596 data_fake.last_closep = &fake; 2597 2598 data_fake.pos_delta = delta; 2599 next = regnext(scan); 2600 scan = NEXTOPER(scan); 2601 if (code != BRANCH) 2602 scan = NEXTOPER(scan); 2603 if (flags & SCF_DO_STCLASS) { 2604 cl_init(pRExC_state, &this_class); 2605 data_fake.start_class = &this_class; 2606 f = SCF_DO_STCLASS_AND; 2607 } 2608 if (flags & SCF_WHILEM_VISITED_POS) 2609 f |= SCF_WHILEM_VISITED_POS; 2610 2611 /* we suppose the run is continuous, last=next...*/ 2612 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, 2613 next, &data_fake, 2614 stopparen, recursed, NULL, f,depth+1); 2615 if (min1 > minnext) 2616 min1 = minnext; 2617 if (max1 < minnext + deltanext) 2618 max1 = minnext + deltanext; 2619 if (deltanext == I32_MAX) 2620 is_inf = is_inf_internal = 1; 2621 scan = next; 2622 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) 2623 pars++; 2624 if (data_fake.flags & SCF_SEEN_ACCEPT) { 2625 if ( stopmin > minnext) 2626 stopmin = min + min1; 2627 flags &= ~SCF_DO_SUBSTR; 2628 if (data) 2629 data->flags |= SCF_SEEN_ACCEPT; 2630 } 2631 if (data) { 2632 if (data_fake.flags & SF_HAS_EVAL) 2633 data->flags |= SF_HAS_EVAL; 2634 data->whilem_c = data_fake.whilem_c; 2635 } 2636 if (flags & SCF_DO_STCLASS) 2637 cl_or(pRExC_state, &accum, &this_class); 2638 } 2639 if (code == IFTHEN && num < 2) /* Empty ELSE branch */ 2640 min1 = 0; 2641 if (flags & SCF_DO_SUBSTR) { 2642 data->pos_min += min1; 2643 data->pos_delta += max1 - min1; 2644 if (max1 != min1 || is_inf) 2645 data->longest = &(data->longest_float); 2646 } 2647 min += min1; 2648 delta += max1 - min1; 2649 if (flags & SCF_DO_STCLASS_OR) { 2650 cl_or(pRExC_state, data->start_class, &accum); 2651 if (min1) { 2652 cl_and(data->start_class, and_withp); 2653 flags &= ~SCF_DO_STCLASS; 2654 } 2655 } 2656 else if (flags & SCF_DO_STCLASS_AND) { 2657 if (min1) { 2658 cl_and(data->start_class, &accum); 2659 flags &= ~SCF_DO_STCLASS; 2660 } 2661 else { 2662 /* Switch to OR mode: cache the old value of 2663 * data->start_class */ 2664 INIT_AND_WITHP; 2665 StructCopy(data->start_class, and_withp, 2666 struct regnode_charclass_class); 2667 flags &= ~SCF_DO_STCLASS_AND; 2668 StructCopy(&accum, data->start_class, 2669 struct regnode_charclass_class); 2670 flags |= SCF_DO_STCLASS_OR; 2671 data->start_class->flags |= ANYOF_EOS; 2672 } 2673 } 2674 2675 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) { 2676 /* demq. 2677 2678 Assuming this was/is a branch we are dealing with: 'scan' now 2679 points at the item that follows the branch sequence, whatever 2680 it is. We now start at the beginning of the sequence and look 2681 for subsequences of 2682 2683 BRANCH->EXACT=>x1 2684 BRANCH->EXACT=>x2 2685 tail 2686 2687 which would be constructed from a pattern like /A|LIST|OF|WORDS/ 2688 2689 If we can find such a subseqence we need to turn the first 2690 element into a trie and then add the subsequent branch exact 2691 strings to the trie. 2692 2693 We have two cases 2694 2695 1. patterns where the whole set of branch can be converted. 2696 2697 2. patterns where only a subset can be converted. 2698 2699 In case 1 we can replace the whole set with a single regop 2700 for the trie. In case 2 we need to keep the start and end 2701 branchs so 2702 2703 'BRANCH EXACT; BRANCH EXACT; BRANCH X' 2704 becomes BRANCH TRIE; BRANCH X; 2705 2706 There is an additional case, that being where there is a 2707 common prefix, which gets split out into an EXACT like node 2708 preceding the TRIE node. 2709 2710 If x(1..n)==tail then we can do a simple trie, if not we make 2711 a "jump" trie, such that when we match the appropriate word 2712 we "jump" to the appopriate tail node. Essentailly we turn 2713 a nested if into a case structure of sorts. 2714 2715 */ 2716 2717 int made=0; 2718 if (!re_trie_maxbuff) { 2719 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1); 2720 if (!SvIOK(re_trie_maxbuff)) 2721 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT); 2722 } 2723 if ( SvIV(re_trie_maxbuff)>=0 ) { 2724 regnode *cur; 2725 regnode *first = (regnode *)NULL; 2726 regnode *last = (regnode *)NULL; 2727 regnode *tail = scan; 2728 U8 optype = 0; 2729 U32 count=0; 2730 2731 #ifdef DEBUGGING 2732 SV * const mysv = sv_newmortal(); /* for dumping */ 2733 #endif 2734 /* var tail is used because there may be a TAIL 2735 regop in the way. Ie, the exacts will point to the 2736 thing following the TAIL, but the last branch will 2737 point at the TAIL. So we advance tail. If we 2738 have nested (?:) we may have to move through several 2739 tails. 2740 */ 2741 2742 while ( OP( tail ) == TAIL ) { 2743 /* this is the TAIL generated by (?:) */ 2744 tail = regnext( tail ); 2745 } 2746 2747 2748 DEBUG_OPTIMISE_r({ 2749 regprop(RExC_rx, mysv, tail ); 2750 PerlIO_printf( Perl_debug_log, "%*s%s%s\n", 2751 (int)depth * 2 + 2, "", 2752 "Looking for TRIE'able sequences. Tail node is: ", 2753 SvPV_nolen_const( mysv ) 2754 ); 2755 }); 2756 2757 /* 2758 2759 step through the branches, cur represents each 2760 branch, noper is the first thing to be matched 2761 as part of that branch and noper_next is the 2762 regnext() of that node. if noper is an EXACT 2763 and noper_next is the same as scan (our current 2764 position in the regex) then the EXACT branch is 2765 a possible optimization target. Once we have 2766 two or more consequetive such branches we can 2767 create a trie of the EXACT's contents and stich 2768 it in place. If the sequence represents all of 2769 the branches we eliminate the whole thing and 2770 replace it with a single TRIE. If it is a 2771 subsequence then we need to stitch it in. This 2772 means the first branch has to remain, and needs 2773 to be repointed at the item on the branch chain 2774 following the last branch optimized. This could 2775 be either a BRANCH, in which case the 2776 subsequence is internal, or it could be the 2777 item following the branch sequence in which 2778 case the subsequence is at the end. 2779 2780 */ 2781 2782 /* dont use tail as the end marker for this traverse */ 2783 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) { 2784 regnode * const noper = NEXTOPER( cur ); 2785 #if defined(DEBUGGING) || defined(NOJUMPTRIE) 2786 regnode * const noper_next = regnext( noper ); 2787 #endif 2788 2789 DEBUG_OPTIMISE_r({ 2790 regprop(RExC_rx, mysv, cur); 2791 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)", 2792 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) ); 2793 2794 regprop(RExC_rx, mysv, noper); 2795 PerlIO_printf( Perl_debug_log, " -> %s", 2796 SvPV_nolen_const(mysv)); 2797 2798 if ( noper_next ) { 2799 regprop(RExC_rx, mysv, noper_next ); 2800 PerlIO_printf( Perl_debug_log,"\t=> %s\t", 2801 SvPV_nolen_const(mysv)); 2802 } 2803 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n", 2804 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) ); 2805 }); 2806 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype 2807 : PL_regkind[ OP( noper ) ] == EXACT ) 2808 || OP(noper) == NOTHING ) 2809 #ifdef NOJUMPTRIE 2810 && noper_next == tail 2811 #endif 2812 && count < U16_MAX) 2813 { 2814 count++; 2815 if ( !first || optype == NOTHING ) { 2816 if (!first) first = cur; 2817 optype = OP( noper ); 2818 } else { 2819 last = cur; 2820 } 2821 } else { 2822 /* 2823 Currently we do not believe that the trie logic can 2824 handle case insensitive matching properly when the 2825 pattern is not unicode (thus forcing unicode semantics). 2826 2827 If/when this is fixed the following define can be swapped 2828 in below to fully enable trie logic. 2829 2830 #define TRIE_TYPE_IS_SAFE 1 2831 2832 */ 2833 #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT) 2834 2835 if ( last && TRIE_TYPE_IS_SAFE ) { 2836 make_trie( pRExC_state, 2837 startbranch, first, cur, tail, count, 2838 optype, depth+1 ); 2839 } 2840 if ( PL_regkind[ OP( noper ) ] == EXACT 2841 #ifdef NOJUMPTRIE 2842 && noper_next == tail 2843 #endif 2844 ){ 2845 count = 1; 2846 first = cur; 2847 optype = OP( noper ); 2848 } else { 2849 count = 0; 2850 first = NULL; 2851 optype = 0; 2852 } 2853 last = NULL; 2854 } 2855 } 2856 DEBUG_OPTIMISE_r({ 2857 regprop(RExC_rx, mysv, cur); 2858 PerlIO_printf( Perl_debug_log, 2859 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2, 2860 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur)); 2861 2862 }); 2863 2864 if ( last && TRIE_TYPE_IS_SAFE ) { 2865 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 ); 2866 #ifdef TRIE_STUDY_OPT 2867 if ( ((made == MADE_EXACT_TRIE && 2868 startbranch == first) 2869 || ( first_non_open == first )) && 2870 depth==0 ) { 2871 flags |= SCF_TRIE_RESTUDY; 2872 if ( startbranch == first 2873 && scan == tail ) 2874 { 2875 RExC_seen &=~REG_TOP_LEVEL_BRANCHES; 2876 } 2877 } 2878 #endif 2879 } 2880 } 2881 2882 } /* do trie */ 2883 2884 } 2885 else if ( code == BRANCHJ ) { /* single branch is optimized. */ 2886 scan = NEXTOPER(NEXTOPER(scan)); 2887 } else /* single branch is optimized. */ 2888 scan = NEXTOPER(scan); 2889 continue; 2890 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) { 2891 scan_frame *newframe = NULL; 2892 I32 paren; 2893 regnode *start; 2894 regnode *end; 2895 2896 if (OP(scan) != SUSPEND) { 2897 /* set the pointer */ 2898 if (OP(scan) == GOSUB) { 2899 paren = ARG(scan); 2900 RExC_recurse[ARG2L(scan)] = scan; 2901 start = RExC_open_parens[paren-1]; 2902 end = RExC_close_parens[paren-1]; 2903 } else { 2904 paren = 0; 2905 start = RExC_rxi->program + 1; 2906 end = RExC_opend; 2907 } 2908 if (!recursed) { 2909 Newxz(recursed, (((RExC_npar)>>3) +1), U8); 2910 SAVEFREEPV(recursed); 2911 } 2912 if (!PAREN_TEST(recursed,paren+1)) { 2913 PAREN_SET(recursed,paren+1); 2914 Newx(newframe,1,scan_frame); 2915 } else { 2916 if (flags & SCF_DO_SUBSTR) { 2917 SCAN_COMMIT(pRExC_state,data,minlenp); 2918 data->longest = &(data->longest_float); 2919 } 2920 is_inf = is_inf_internal = 1; 2921 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ 2922 cl_anything(pRExC_state, data->start_class); 2923 flags &= ~SCF_DO_STCLASS; 2924 } 2925 } else { 2926 Newx(newframe,1,scan_frame); 2927 paren = stopparen; 2928 start = scan+2; 2929 end = regnext(scan); 2930 } 2931 if (newframe) { 2932 assert(start); 2933 assert(end); 2934 SAVEFREEPV(newframe); 2935 newframe->next = regnext(scan); 2936 newframe->last = last; 2937 newframe->stop = stopparen; 2938 newframe->prev = frame; 2939 2940 frame = newframe; 2941 scan = start; 2942 stopparen = paren; 2943 last = end; 2944 2945 continue; 2946 } 2947 } 2948 else if (OP(scan) == EXACT) { 2949 I32 l = STR_LEN(scan); 2950 UV uc; 2951 if (UTF) { 2952 const U8 * const s = (U8*)STRING(scan); 2953 l = utf8_length(s, s + l); 2954 uc = utf8_to_uvchr(s, NULL); 2955 } else { 2956 uc = *((U8*)STRING(scan)); 2957 } 2958 min += l; 2959 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */ 2960 /* The code below prefers earlier match for fixed 2961 offset, later match for variable offset. */ 2962 if (data->last_end == -1) { /* Update the start info. */ 2963 data->last_start_min = data->pos_min; 2964 data->last_start_max = is_inf 2965 ? I32_MAX : data->pos_min + data->pos_delta; 2966 } 2967 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan)); 2968 if (UTF) 2969 SvUTF8_on(data->last_found); 2970 { 2971 SV * const sv = data->last_found; 2972 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ? 2973 mg_find(sv, PERL_MAGIC_utf8) : NULL; 2974 if (mg && mg->mg_len >= 0) 2975 mg->mg_len += utf8_length((U8*)STRING(scan), 2976 (U8*)STRING(scan)+STR_LEN(scan)); 2977 } 2978 data->last_end = data->pos_min + l; 2979 data->pos_min += l; /* As in the first entry. */ 2980 data->flags &= ~SF_BEFORE_EOL; 2981 } 2982 if (flags & SCF_DO_STCLASS_AND) { 2983 /* Check whether it is compatible with what we know already! */ 2984 int compat = 1; 2985 2986 if (uc >= 0x100 || 2987 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE)) 2988 && !ANYOF_BITMAP_TEST(data->start_class, uc) 2989 && (!(data->start_class->flags & ANYOF_FOLD) 2990 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc]))) 2991 ) 2992 compat = 0; 2993 ANYOF_CLASS_ZERO(data->start_class); 2994 ANYOF_BITMAP_ZERO(data->start_class); 2995 if (compat) 2996 ANYOF_BITMAP_SET(data->start_class, uc); 2997 data->start_class->flags &= ~ANYOF_EOS; 2998 if (uc < 0x100) 2999 data->start_class->flags &= ~ANYOF_UNICODE_ALL; 3000 } 3001 else if (flags & SCF_DO_STCLASS_OR) { 3002 /* false positive possible if the class is case-folded */ 3003 if (uc < 0x100) 3004 ANYOF_BITMAP_SET(data->start_class, uc); 3005 else 3006 data->start_class->flags |= ANYOF_UNICODE_ALL; 3007 data->start_class->flags &= ~ANYOF_EOS; 3008 cl_and(data->start_class, and_withp); 3009 } 3010 flags &= ~SCF_DO_STCLASS; 3011 } 3012 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */ 3013 I32 l = STR_LEN(scan); 3014 UV uc = *((U8*)STRING(scan)); 3015 3016 /* Search for fixed substrings supports EXACT only. */ 3017 if (flags & SCF_DO_SUBSTR) { 3018 assert(data); 3019 SCAN_COMMIT(pRExC_state, data, minlenp); 3020 } 3021 if (UTF) { 3022 const U8 * const s = (U8 *)STRING(scan); 3023 l = utf8_length(s, s + l); 3024 uc = utf8_to_uvchr(s, NULL); 3025 } 3026 min += l; 3027 if (flags & SCF_DO_SUBSTR) 3028 data->pos_min += l; 3029 if (flags & SCF_DO_STCLASS_AND) { 3030 /* Check whether it is compatible with what we know already! */ 3031 int compat = 1; 3032 3033 if (uc >= 0x100 || 3034 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE)) 3035 && !ANYOF_BITMAP_TEST(data->start_class, uc) 3036 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc]))) 3037 compat = 0; 3038 ANYOF_CLASS_ZERO(data->start_class); 3039 ANYOF_BITMAP_ZERO(data->start_class); 3040 if (compat) { 3041 ANYOF_BITMAP_SET(data->start_class, uc); 3042 data->start_class->flags &= ~ANYOF_EOS; 3043 data->start_class->flags |= ANYOF_FOLD; 3044 if (OP(scan) == EXACTFL) 3045 data->start_class->flags |= ANYOF_LOCALE; 3046 } 3047 } 3048 else if (flags & SCF_DO_STCLASS_OR) { 3049 if (data->start_class->flags & ANYOF_FOLD) { 3050 /* false positive possible if the class is case-folded. 3051 Assume that the locale settings are the same... */ 3052 if (uc < 0x100) 3053 ANYOF_BITMAP_SET(data->start_class, uc); 3054 data->start_class->flags &= ~ANYOF_EOS; 3055 } 3056 cl_and(data->start_class, and_withp); 3057 } 3058 flags &= ~SCF_DO_STCLASS; 3059 } 3060 else if (strchr((const char*)PL_varies,OP(scan))) { 3061 I32 mincount, maxcount, minnext, deltanext, fl = 0; 3062 I32 f = flags, pos_before = 0; 3063 regnode * const oscan = scan; 3064 struct regnode_charclass_class this_class; 3065 struct regnode_charclass_class *oclass = NULL; 3066 I32 next_is_eval = 0; 3067 3068 switch (PL_regkind[OP(scan)]) { 3069 case WHILEM: /* End of (?:...)* . */ 3070 scan = NEXTOPER(scan); 3071 goto finish; 3072 case PLUS: 3073 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) { 3074 next = NEXTOPER(scan); 3075 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) { 3076 mincount = 1; 3077 maxcount = REG_INFTY; 3078 next = regnext(scan); 3079 scan = NEXTOPER(scan); 3080 goto do_curly; 3081 } 3082 } 3083 if (flags & SCF_DO_SUBSTR) 3084 data->pos_min++; 3085 min++; 3086 /* Fall through. */ 3087 case STAR: 3088 if (flags & SCF_DO_STCLASS) { 3089 mincount = 0; 3090 maxcount = REG_INFTY; 3091 next = regnext(scan); 3092 scan = NEXTOPER(scan); 3093 goto do_curly; 3094 } 3095 is_inf = is_inf_internal = 1; 3096 scan = regnext(scan); 3097 if (flags & SCF_DO_SUBSTR) { 3098 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */ 3099 data->longest = &(data->longest_float); 3100 } 3101 goto optimize_curly_tail; 3102 case CURLY: 3103 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM) 3104 && (scan->flags == stopparen)) 3105 { 3106 mincount = 1; 3107 maxcount = 1; 3108 } else { 3109 mincount = ARG1(scan); 3110 maxcount = ARG2(scan); 3111 } 3112 next = regnext(scan); 3113 if (OP(scan) == CURLYX) { 3114 I32 lp = (data ? *(data->last_closep) : 0); 3115 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX); 3116 } 3117 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS; 3118 next_is_eval = (OP(scan) == EVAL); 3119 do_curly: 3120 if (flags & SCF_DO_SUBSTR) { 3121 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */ 3122 pos_before = data->pos_min; 3123 } 3124 if (data) { 3125 fl = data->flags; 3126 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL); 3127 if (is_inf) 3128 data->flags |= SF_IS_INF; 3129 } 3130 if (flags & SCF_DO_STCLASS) { 3131 cl_init(pRExC_state, &this_class); 3132 oclass = data->start_class; 3133 data->start_class = &this_class; 3134 f |= SCF_DO_STCLASS_AND; 3135 f &= ~SCF_DO_STCLASS_OR; 3136 } 3137 /* These are the cases when once a subexpression 3138 fails at a particular position, it cannot succeed 3139 even after backtracking at the enclosing scope. 3140 3141 XXXX what if minimal match and we are at the 3142 initial run of {n,m}? */ 3143 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY)) 3144 f &= ~SCF_WHILEM_VISITED_POS; 3145 3146 /* This will finish on WHILEM, setting scan, or on NULL: */ 3147 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, 3148 last, data, stopparen, recursed, NULL, 3149 (mincount == 0 3150 ? (f & ~SCF_DO_SUBSTR) : f),depth+1); 3151 3152 if (flags & SCF_DO_STCLASS) 3153 data->start_class = oclass; 3154 if (mincount == 0 || minnext == 0) { 3155 if (flags & SCF_DO_STCLASS_OR) { 3156 cl_or(pRExC_state, data->start_class, &this_class); 3157 } 3158 else if (flags & SCF_DO_STCLASS_AND) { 3159 /* Switch to OR mode: cache the old value of 3160 * data->start_class */ 3161 INIT_AND_WITHP; 3162 StructCopy(data->start_class, and_withp, 3163 struct regnode_charclass_class); 3164 flags &= ~SCF_DO_STCLASS_AND; 3165 StructCopy(&this_class, data->start_class, 3166 struct regnode_charclass_class); 3167 flags |= SCF_DO_STCLASS_OR; 3168 data->start_class->flags |= ANYOF_EOS; 3169 } 3170 } else { /* Non-zero len */ 3171 if (flags & SCF_DO_STCLASS_OR) { 3172 cl_or(pRExC_state, data->start_class, &this_class); 3173 cl_and(data->start_class, and_withp); 3174 } 3175 else if (flags & SCF_DO_STCLASS_AND) 3176 cl_and(data->start_class, &this_class); 3177 flags &= ~SCF_DO_STCLASS; 3178 } 3179 if (!scan) /* It was not CURLYX, but CURLY. */ 3180 scan = next; 3181 if ( /* ? quantifier ok, except for (?{ ... }) */ 3182 (next_is_eval || !(mincount == 0 && maxcount == 1)) 3183 && (minnext == 0) && (deltanext == 0) 3184 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR)) 3185 && maxcount <= REG_INFTY/3 /* Complement check for big count */ 3186 && ckWARN(WARN_REGEXP)) 3187 { 3188 vWARN(RExC_parse, 3189 "Quantifier unexpected on zero-length expression"); 3190 } 3191 3192 min += minnext * mincount; 3193 is_inf_internal |= ((maxcount == REG_INFTY 3194 && (minnext + deltanext) > 0) 3195 || deltanext == I32_MAX); 3196 is_inf |= is_inf_internal; 3197 delta += (minnext + deltanext) * maxcount - minnext * mincount; 3198 3199 /* Try powerful optimization CURLYX => CURLYN. */ 3200 if ( OP(oscan) == CURLYX && data 3201 && data->flags & SF_IN_PAR 3202 && !(data->flags & SF_HAS_EVAL) 3203 && !deltanext && minnext == 1 ) { 3204 /* Try to optimize to CURLYN. */ 3205 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; 3206 regnode * const nxt1 = nxt; 3207 #ifdef DEBUGGING 3208 regnode *nxt2; 3209 #endif 3210 3211 /* Skip open. */ 3212 nxt = regnext(nxt); 3213 if (!strchr((const char*)PL_simple,OP(nxt)) 3214 && !(PL_regkind[OP(nxt)] == EXACT 3215 && STR_LEN(nxt) == 1)) 3216 goto nogo; 3217 #ifdef DEBUGGING 3218 nxt2 = nxt; 3219 #endif 3220 nxt = regnext(nxt); 3221 if (OP(nxt) != CLOSE) 3222 goto nogo; 3223 if (RExC_open_parens) { 3224 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/ 3225 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/ 3226 } 3227 /* Now we know that nxt2 is the only contents: */ 3228 oscan->flags = (U8)ARG(nxt); 3229 OP(oscan) = CURLYN; 3230 OP(nxt1) = NOTHING; /* was OPEN. */ 3231 3232 #ifdef DEBUGGING 3233 OP(nxt1 + 1) = OPTIMIZED; /* was count. */ 3234 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */ 3235 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */ 3236 OP(nxt) = OPTIMIZED; /* was CLOSE. */ 3237 OP(nxt + 1) = OPTIMIZED; /* was count. */ 3238 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */ 3239 #endif 3240 } 3241 nogo: 3242 3243 /* Try optimization CURLYX => CURLYM. */ 3244 if ( OP(oscan) == CURLYX && data 3245 && !(data->flags & SF_HAS_PAR) 3246 && !(data->flags & SF_HAS_EVAL) 3247 && !deltanext /* atom is fixed width */ 3248 && minnext != 0 /* CURLYM can't handle zero width */ 3249 ) { 3250 /* XXXX How to optimize if data == 0? */ 3251 /* Optimize to a simpler form. */ 3252 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */ 3253 regnode *nxt2; 3254 3255 OP(oscan) = CURLYM; 3256 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/ 3257 && (OP(nxt2) != WHILEM)) 3258 nxt = nxt2; 3259 OP(nxt2) = SUCCEED; /* Whas WHILEM */ 3260 /* Need to optimize away parenths. */ 3261 if (data->flags & SF_IN_PAR) { 3262 /* Set the parenth number. */ 3263 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/ 3264 3265 if (OP(nxt) != CLOSE) 3266 FAIL("Panic opt close"); 3267 oscan->flags = (U8)ARG(nxt); 3268 if (RExC_open_parens) { 3269 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/ 3270 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/ 3271 } 3272 OP(nxt1) = OPTIMIZED; /* was OPEN. */ 3273 OP(nxt) = OPTIMIZED; /* was CLOSE. */ 3274 3275 #ifdef DEBUGGING 3276 OP(nxt1 + 1) = OPTIMIZED; /* was count. */ 3277 OP(nxt + 1) = OPTIMIZED; /* was count. */ 3278 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */ 3279 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */ 3280 #endif 3281 #if 0 3282 while ( nxt1 && (OP(nxt1) != WHILEM)) { 3283 regnode *nnxt = regnext(nxt1); 3284 3285 if (nnxt == nxt) { 3286 if (reg_off_by_arg[OP(nxt1)]) 3287 ARG_SET(nxt1, nxt2 - nxt1); 3288 else if (nxt2 - nxt1 < U16_MAX) 3289 NEXT_OFF(nxt1) = nxt2 - nxt1; 3290 else 3291 OP(nxt) = NOTHING; /* Cannot beautify */ 3292 } 3293 nxt1 = nnxt; 3294 } 3295 #endif 3296 /* Optimize again: */ 3297 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt, 3298 NULL, stopparen, recursed, NULL, 0,depth+1); 3299 } 3300 else 3301 oscan->flags = 0; 3302 } 3303 else if ((OP(oscan) == CURLYX) 3304 && (flags & SCF_WHILEM_VISITED_POS) 3305 /* See the comment on a similar expression above. 3306 However, this time it not a subexpression 3307 we care about, but the expression itself. */ 3308 && (maxcount == REG_INFTY) 3309 && data && ++data->whilem_c < 16) { 3310 /* This stays as CURLYX, we can put the count/of pair. */ 3311 /* Find WHILEM (as in regexec.c) */ 3312 regnode *nxt = oscan + NEXT_OFF(oscan); 3313 3314 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */ 3315 nxt += ARG(nxt); 3316 PREVOPER(nxt)->flags = (U8)(data->whilem_c 3317 | (RExC_whilem_seen << 4)); /* On WHILEM */ 3318 } 3319 if (data && fl & (SF_HAS_PAR|SF_IN_PAR)) 3320 pars++; 3321 if (flags & SCF_DO_SUBSTR) { 3322 SV *last_str = NULL; 3323 int counted = mincount != 0; 3324 3325 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */ 3326 #if defined(SPARC64_GCC_WORKAROUND) 3327 I32 b = 0; 3328 STRLEN l = 0; 3329 const char *s = NULL; 3330 I32 old = 0; 3331 3332 if (pos_before >= data->last_start_min) 3333 b = pos_before; 3334 else 3335 b = data->last_start_min; 3336 3337 l = 0; 3338 s = SvPV_const(data->last_found, l); 3339 old = b - data->last_start_min; 3340 3341 #else 3342 I32 b = pos_before >= data->last_start_min 3343 ? pos_before : data->last_start_min; 3344 STRLEN l; 3345 const char * const s = SvPV_const(data->last_found, l); 3346 I32 old = b - data->last_start_min; 3347 #endif 3348 3349 if (UTF) 3350 old = utf8_hop((U8*)s, old) - (U8*)s; 3351 3352 l -= old; 3353 /* Get the added string: */ 3354 last_str = newSVpvn_utf8(s + old, l, UTF); 3355 if (deltanext == 0 && pos_before == b) { 3356 /* What was added is a constant string */ 3357 if (mincount > 1) { 3358 SvGROW(last_str, (mincount * l) + 1); 3359 repeatcpy(SvPVX(last_str) + l, 3360 SvPVX_const(last_str), l, mincount - 1); 3361 SvCUR_set(last_str, SvCUR(last_str) * mincount); 3362 /* Add additional parts. */ 3363 SvCUR_set(data->last_found, 3364 SvCUR(data->last_found) - l); 3365 sv_catsv(data->last_found, last_str); 3366 { 3367 SV * sv = data->last_found; 3368 MAGIC *mg = 3369 SvUTF8(sv) && SvMAGICAL(sv) ? 3370 mg_find(sv, PERL_MAGIC_utf8) : NULL; 3371 if (mg && mg->mg_len >= 0) 3372 mg->mg_len += CHR_SVLEN(last_str) - l; 3373 } 3374 data->last_end += l * (mincount - 1); 3375 } 3376 } else { 3377 /* start offset must point into the last copy */ 3378 data->last_start_min += minnext * (mincount - 1); 3379 data->last_start_max += is_inf ? I32_MAX 3380 : (maxcount - 1) * (minnext + data->pos_delta); 3381 } 3382 } 3383 /* It is counted once already... */ 3384 data->pos_min += minnext * (mincount - counted); 3385 data->pos_delta += - counted * deltanext + 3386 (minnext + deltanext) * maxcount - minnext * mincount; 3387 if (mincount != maxcount) { 3388 /* Cannot extend fixed substrings found inside 3389 the group. */ 3390 SCAN_COMMIT(pRExC_state,data,minlenp); 3391 if (mincount && last_str) { 3392 SV * const sv = data->last_found; 3393 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ? 3394 mg_find(sv, PERL_MAGIC_utf8) : NULL; 3395 3396 if (mg) 3397 mg->mg_len = -1; 3398 sv_setsv(sv, last_str); 3399 data->last_end = data->pos_min; 3400 data->last_start_min = 3401 data->pos_min - CHR_SVLEN(last_str); 3402 data->last_start_max = is_inf 3403 ? I32_MAX 3404 : data->pos_min + data->pos_delta 3405 - CHR_SVLEN(last_str); 3406 } 3407 data->longest = &(data->longest_float); 3408 } 3409 SvREFCNT_dec(last_str); 3410 } 3411 if (data && (fl & SF_HAS_EVAL)) 3412 data->flags |= SF_HAS_EVAL; 3413 optimize_curly_tail: 3414 if (OP(oscan) != CURLYX) { 3415 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING 3416 && NEXT_OFF(next)) 3417 NEXT_OFF(oscan) += NEXT_OFF(next); 3418 } 3419 continue; 3420 default: /* REF and CLUMP only? */ 3421 if (flags & SCF_DO_SUBSTR) { 3422 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */ 3423 data->longest = &(data->longest_float); 3424 } 3425 is_inf = is_inf_internal = 1; 3426 if (flags & SCF_DO_STCLASS_OR) 3427 cl_anything(pRExC_state, data->start_class); 3428 flags &= ~SCF_DO_STCLASS; 3429 break; 3430 } 3431 } 3432 else if (OP(scan) == LNBREAK) { 3433 if (flags & SCF_DO_STCLASS) { 3434 int value = 0; 3435 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */ 3436 if (flags & SCF_DO_STCLASS_AND) { 3437 for (value = 0; value < 256; value++) 3438 if (!is_VERTWS_cp(value)) 3439 ANYOF_BITMAP_CLEAR(data->start_class, value); 3440 } 3441 else { 3442 for (value = 0; value < 256; value++) 3443 if (is_VERTWS_cp(value)) 3444 ANYOF_BITMAP_SET(data->start_class, value); 3445 } 3446 if (flags & SCF_DO_STCLASS_OR) 3447 cl_and(data->start_class, and_withp); 3448 flags &= ~SCF_DO_STCLASS; 3449 } 3450 min += 1; 3451 delta += 1; 3452 if (flags & SCF_DO_SUBSTR) { 3453 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */ 3454 data->pos_min += 1; 3455 data->pos_delta += 1; 3456 data->longest = &(data->longest_float); 3457 } 3458 3459 } 3460 else if (OP(scan) == FOLDCHAR) { 3461 int d = ARG(scan)==0xDF ? 1 : 2; 3462 flags &= ~SCF_DO_STCLASS; 3463 min += 1; 3464 delta += d; 3465 if (flags & SCF_DO_SUBSTR) { 3466 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */ 3467 data->pos_min += 1; 3468 data->pos_delta += d; 3469 data->longest = &(data->longest_float); 3470 } 3471 } 3472 else if (strchr((const char*)PL_simple,OP(scan))) { 3473 int value = 0; 3474 3475 if (flags & SCF_DO_SUBSTR) { 3476 SCAN_COMMIT(pRExC_state,data,minlenp); 3477 data->pos_min++; 3478 } 3479 min++; 3480 if (flags & SCF_DO_STCLASS) { 3481 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */ 3482 3483 /* Some of the logic below assumes that switching 3484 locale on will only add false positives. */ 3485 switch (PL_regkind[OP(scan)]) { 3486 case SANY: 3487 default: 3488 do_default: 3489 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */ 3490 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ 3491 cl_anything(pRExC_state, data->start_class); 3492 break; 3493 case REG_ANY: 3494 if (OP(scan) == SANY) 3495 goto do_default; 3496 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */ 3497 value = (ANYOF_BITMAP_TEST(data->start_class,'\n') 3498 || (data->start_class->flags & ANYOF_CLASS)); 3499 cl_anything(pRExC_state, data->start_class); 3500 } 3501 if (flags & SCF_DO_STCLASS_AND || !value) 3502 ANYOF_BITMAP_CLEAR(data->start_class,'\n'); 3503 break; 3504 case ANYOF: 3505 if (flags & SCF_DO_STCLASS_AND) 3506 cl_and(data->start_class, 3507 (struct regnode_charclass_class*)scan); 3508 else 3509 cl_or(pRExC_state, data->start_class, 3510 (struct regnode_charclass_class*)scan); 3511 break; 3512 case ALNUM: 3513 if (flags & SCF_DO_STCLASS_AND) { 3514 if (!(data->start_class->flags & ANYOF_LOCALE)) { 3515 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM); 3516 for (value = 0; value < 256; value++) 3517 if (!isALNUM(value)) 3518 ANYOF_BITMAP_CLEAR(data->start_class, value); 3519 } 3520 } 3521 else { 3522 if (data->start_class->flags & ANYOF_LOCALE) 3523 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM); 3524 else { 3525 for (value = 0; value < 256; value++) 3526 if (isALNUM(value)) 3527 ANYOF_BITMAP_SET(data->start_class, value); 3528 } 3529 } 3530 break; 3531 case ALNUML: 3532 if (flags & SCF_DO_STCLASS_AND) { 3533 if (data->start_class->flags & ANYOF_LOCALE) 3534 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM); 3535 } 3536 else { 3537 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM); 3538 data->start_class->flags |= ANYOF_LOCALE; 3539 } 3540 break; 3541 case NALNUM: 3542 if (flags & SCF_DO_STCLASS_AND) { 3543 if (!(data->start_class->flags & ANYOF_LOCALE)) { 3544 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM); 3545 for (value = 0; value < 256; value++) 3546 if (isALNUM(value)) 3547 ANYOF_BITMAP_CLEAR(data->start_class, value); 3548 } 3549 } 3550 else { 3551 if (data->start_class->flags & ANYOF_LOCALE) 3552 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM); 3553 else { 3554 for (value = 0; value < 256; value++) 3555 if (!isALNUM(value)) 3556 ANYOF_BITMAP_SET(data->start_class, value); 3557 } 3558 } 3559 break; 3560 case NALNUML: 3561 if (flags & SCF_DO_STCLASS_AND) { 3562 if (data->start_class->flags & ANYOF_LOCALE) 3563 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM); 3564 } 3565 else { 3566 data->start_class->flags |= ANYOF_LOCALE; 3567 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM); 3568 } 3569 break; 3570 case SPACE: 3571 if (flags & SCF_DO_STCLASS_AND) { 3572 if (!(data->start_class->flags & ANYOF_LOCALE)) { 3573 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE); 3574 for (value = 0; value < 256; value++) 3575 if (!isSPACE(value)) 3576 ANYOF_BITMAP_CLEAR(data->start_class, value); 3577 } 3578 } 3579 else { 3580 if (data->start_class->flags & ANYOF_LOCALE) 3581 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE); 3582 else { 3583 for (value = 0; value < 256; value++) 3584 if (isSPACE(value)) 3585 ANYOF_BITMAP_SET(data->start_class, value); 3586 } 3587 } 3588 break; 3589 case SPACEL: 3590 if (flags & SCF_DO_STCLASS_AND) { 3591 if (data->start_class->flags & ANYOF_LOCALE) 3592 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE); 3593 } 3594 else { 3595 data->start_class->flags |= ANYOF_LOCALE; 3596 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE); 3597 } 3598 break; 3599 case NSPACE: 3600 if (flags & SCF_DO_STCLASS_AND) { 3601 if (!(data->start_class->flags & ANYOF_LOCALE)) { 3602 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE); 3603 for (value = 0; value < 256; value++) 3604 if (isSPACE(value)) 3605 ANYOF_BITMAP_CLEAR(data->start_class, value); 3606 } 3607 } 3608 else { 3609 if (data->start_class->flags & ANYOF_LOCALE) 3610 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE); 3611 else { 3612 for (value = 0; value < 256; value++) 3613 if (!isSPACE(value)) 3614 ANYOF_BITMAP_SET(data->start_class, value); 3615 } 3616 } 3617 break; 3618 case NSPACEL: 3619 if (flags & SCF_DO_STCLASS_AND) { 3620 if (data->start_class->flags & ANYOF_LOCALE) { 3621 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE); 3622 for (value = 0; value < 256; value++) 3623 if (!isSPACE(value)) 3624 ANYOF_BITMAP_CLEAR(data->start_class, value); 3625 } 3626 } 3627 else { 3628 data->start_class->flags |= ANYOF_LOCALE; 3629 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE); 3630 } 3631 break; 3632 case DIGIT: 3633 if (flags & SCF_DO_STCLASS_AND) { 3634 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT); 3635 for (value = 0; value < 256; value++) 3636 if (!isDIGIT(value)) 3637 ANYOF_BITMAP_CLEAR(data->start_class, value); 3638 } 3639 else { 3640 if (data->start_class->flags & ANYOF_LOCALE) 3641 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT); 3642 else { 3643 for (value = 0; value < 256; value++) 3644 if (isDIGIT(value)) 3645 ANYOF_BITMAP_SET(data->start_class, value); 3646 } 3647 } 3648 break; 3649 case NDIGIT: 3650 if (flags & SCF_DO_STCLASS_AND) { 3651 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT); 3652 for (value = 0; value < 256; value++) 3653 if (isDIGIT(value)) 3654 ANYOF_BITMAP_CLEAR(data->start_class, value); 3655 } 3656 else { 3657 if (data->start_class->flags & ANYOF_LOCALE) 3658 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT); 3659 else { 3660 for (value = 0; value < 256; value++) 3661 if (!isDIGIT(value)) 3662 ANYOF_BITMAP_SET(data->start_class, value); 3663 } 3664 } 3665 break; 3666 CASE_SYNST_FNC(VERTWS); 3667 CASE_SYNST_FNC(HORIZWS); 3668 3669 } 3670 if (flags & SCF_DO_STCLASS_OR) 3671 cl_and(data->start_class, and_withp); 3672 flags &= ~SCF_DO_STCLASS; 3673 } 3674 } 3675 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) { 3676 data->flags |= (OP(scan) == MEOL 3677 ? SF_BEFORE_MEOL 3678 : SF_BEFORE_SEOL); 3679 } 3680 else if ( PL_regkind[OP(scan)] == BRANCHJ 3681 /* Lookbehind, or need to calculate parens/evals/stclass: */ 3682 && (scan->flags || data || (flags & SCF_DO_STCLASS)) 3683 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) { 3684 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY 3685 || OP(scan) == UNLESSM ) 3686 { 3687 /* Negative Lookahead/lookbehind 3688 In this case we can't do fixed string optimisation. 3689 */ 3690 3691 I32 deltanext, minnext, fake = 0; 3692 regnode *nscan; 3693 struct regnode_charclass_class intrnl; 3694 int f = 0; 3695 3696 data_fake.flags = 0; 3697 if (data) { 3698 data_fake.whilem_c = data->whilem_c; 3699 data_fake.last_closep = data->last_closep; 3700 } 3701 else 3702 data_fake.last_closep = &fake; 3703 data_fake.pos_delta = delta; 3704 if ( flags & SCF_DO_STCLASS && !scan->flags 3705 && OP(scan) == IFMATCH ) { /* Lookahead */ 3706 cl_init(pRExC_state, &intrnl); 3707 data_fake.start_class = &intrnl; 3708 f |= SCF_DO_STCLASS_AND; 3709 } 3710 if (flags & SCF_WHILEM_VISITED_POS) 3711 f |= SCF_WHILEM_VISITED_POS; 3712 next = regnext(scan); 3713 nscan = NEXTOPER(NEXTOPER(scan)); 3714 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext, 3715 last, &data_fake, stopparen, recursed, NULL, f, depth+1); 3716 if (scan->flags) { 3717 if (deltanext) { 3718 FAIL("Variable length lookbehind not implemented"); 3719 } 3720 else if (minnext > (I32)U8_MAX) { 3721 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX); 3722 } 3723 scan->flags = (U8)minnext; 3724 } 3725 if (data) { 3726 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) 3727 pars++; 3728 if (data_fake.flags & SF_HAS_EVAL) 3729 data->flags |= SF_HAS_EVAL; 3730 data->whilem_c = data_fake.whilem_c; 3731 } 3732 if (f & SCF_DO_STCLASS_AND) { 3733 if (flags & SCF_DO_STCLASS_OR) { 3734 /* OR before, AND after: ideally we would recurse with 3735 * data_fake to get the AND applied by study of the 3736 * remainder of the pattern, and then derecurse; 3737 * *** HACK *** for now just treat as "no information". 3738 * See [perl #56690]. 3739 */ 3740 cl_init(pRExC_state, data->start_class); 3741 } else { 3742 /* AND before and after: combine and continue */ 3743 const int was = (data->start_class->flags & ANYOF_EOS); 3744 3745 cl_and(data->start_class, &intrnl); 3746 if (was) 3747 data->start_class->flags |= ANYOF_EOS; 3748 } 3749 } 3750 } 3751 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY 3752 else { 3753 /* Positive Lookahead/lookbehind 3754 In this case we can do fixed string optimisation, 3755 but we must be careful about it. Note in the case of 3756 lookbehind the positions will be offset by the minimum 3757 length of the pattern, something we won't know about 3758 until after the recurse. 3759 */ 3760 I32 deltanext, fake = 0; 3761 regnode *nscan; 3762 struct regnode_charclass_class intrnl; 3763 int f = 0; 3764 /* We use SAVEFREEPV so that when the full compile 3765 is finished perl will clean up the allocated 3766 minlens when its all done. This was we don't 3767 have to worry about freeing them when we know 3768 they wont be used, which would be a pain. 3769 */ 3770 I32 *minnextp; 3771 Newx( minnextp, 1, I32 ); 3772 SAVEFREEPV(minnextp); 3773 3774 if (data) { 3775 StructCopy(data, &data_fake, scan_data_t); 3776 if ((flags & SCF_DO_SUBSTR) && data->last_found) { 3777 f |= SCF_DO_SUBSTR; 3778 if (scan->flags) 3779 SCAN_COMMIT(pRExC_state, &data_fake,minlenp); 3780 data_fake.last_found=newSVsv(data->last_found); 3781 } 3782 } 3783 else 3784 data_fake.last_closep = &fake; 3785 data_fake.flags = 0; 3786 data_fake.pos_delta = delta; 3787 if (is_inf) 3788 data_fake.flags |= SF_IS_INF; 3789 if ( flags & SCF_DO_STCLASS && !scan->flags 3790 && OP(scan) == IFMATCH ) { /* Lookahead */ 3791 cl_init(pRExC_state, &intrnl); 3792 data_fake.start_class = &intrnl; 3793 f |= SCF_DO_STCLASS_AND; 3794 } 3795 if (flags & SCF_WHILEM_VISITED_POS) 3796 f |= SCF_WHILEM_VISITED_POS; 3797 next = regnext(scan); 3798 nscan = NEXTOPER(NEXTOPER(scan)); 3799 3800 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext, 3801 last, &data_fake, stopparen, recursed, NULL, f,depth+1); 3802 if (scan->flags) { 3803 if (deltanext) { 3804 FAIL("Variable length lookbehind not implemented"); 3805 } 3806 else if (*minnextp > (I32)U8_MAX) { 3807 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX); 3808 } 3809 scan->flags = (U8)*minnextp; 3810 } 3811 3812 *minnextp += min; 3813 3814 if (f & SCF_DO_STCLASS_AND) { 3815 const int was = (data->start_class->flags & ANYOF_EOS); 3816 3817 cl_and(data->start_class, &intrnl); 3818 if (was) 3819 data->start_class->flags |= ANYOF_EOS; 3820 } 3821 if (data) { 3822 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) 3823 pars++; 3824 if (data_fake.flags & SF_HAS_EVAL) 3825 data->flags |= SF_HAS_EVAL; 3826 data->whilem_c = data_fake.whilem_c; 3827 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) { 3828 if (RExC_rx->minlen<*minnextp) 3829 RExC_rx->minlen=*minnextp; 3830 SCAN_COMMIT(pRExC_state, &data_fake, minnextp); 3831 SvREFCNT_dec(data_fake.last_found); 3832 3833 if ( data_fake.minlen_fixed != minlenp ) 3834 { 3835 data->offset_fixed= data_fake.offset_fixed; 3836 data->minlen_fixed= data_fake.minlen_fixed; 3837 data->lookbehind_fixed+= scan->flags; 3838 } 3839 if ( data_fake.minlen_float != minlenp ) 3840 { 3841 data->minlen_float= data_fake.minlen_float; 3842 data->offset_float_min=data_fake.offset_float_min; 3843 data->offset_float_max=data_fake.offset_float_max; 3844 data->lookbehind_float+= scan->flags; 3845 } 3846 } 3847 } 3848 3849 3850 } 3851 #endif 3852 } 3853 else if (OP(scan) == OPEN) { 3854 if (stopparen != (I32)ARG(scan)) 3855 pars++; 3856 } 3857 else if (OP(scan) == CLOSE) { 3858 if (stopparen == (I32)ARG(scan)) { 3859 break; 3860 } 3861 if ((I32)ARG(scan) == is_par) { 3862 next = regnext(scan); 3863 3864 if ( next && (OP(next) != WHILEM) && next < last) 3865 is_par = 0; /* Disable optimization */ 3866 } 3867 if (data) 3868 *(data->last_closep) = ARG(scan); 3869 } 3870 else if (OP(scan) == EVAL) { 3871 if (data) 3872 data->flags |= SF_HAS_EVAL; 3873 } 3874 else if ( PL_regkind[OP(scan)] == ENDLIKE ) { 3875 if (flags & SCF_DO_SUBSTR) { 3876 SCAN_COMMIT(pRExC_state,data,minlenp); 3877 flags &= ~SCF_DO_SUBSTR; 3878 } 3879 if (data && OP(scan)==ACCEPT) { 3880 data->flags |= SCF_SEEN_ACCEPT; 3881 if (stopmin > min) 3882 stopmin = min; 3883 } 3884 } 3885 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */ 3886 { 3887 if (flags & SCF_DO_SUBSTR) { 3888 SCAN_COMMIT(pRExC_state,data,minlenp); 3889 data->longest = &(data->longest_float); 3890 } 3891 is_inf = is_inf_internal = 1; 3892 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ 3893 cl_anything(pRExC_state, data->start_class); 3894 flags &= ~SCF_DO_STCLASS; 3895 } 3896 else if (OP(scan) == GPOS) { 3897 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) && 3898 !(delta || is_inf || (data && data->pos_delta))) 3899 { 3900 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR)) 3901 RExC_rx->extflags |= RXf_ANCH_GPOS; 3902 if (RExC_rx->gofs < (U32)min) 3903 RExC_rx->gofs = min; 3904 } else { 3905 RExC_rx->extflags |= RXf_GPOS_FLOAT; 3906 RExC_rx->gofs = 0; 3907 } 3908 } 3909 #ifdef TRIE_STUDY_OPT 3910 #ifdef FULL_TRIE_STUDY 3911 else if (PL_regkind[OP(scan)] == TRIE) { 3912 /* NOTE - There is similar code to this block above for handling 3913 BRANCH nodes on the initial study. If you change stuff here 3914 check there too. */ 3915 regnode *trie_node= scan; 3916 regnode *tail= regnext(scan); 3917 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ]; 3918 I32 max1 = 0, min1 = I32_MAX; 3919 struct regnode_charclass_class accum; 3920 3921 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */ 3922 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */ 3923 if (flags & SCF_DO_STCLASS) 3924 cl_init_zero(pRExC_state, &accum); 3925 3926 if (!trie->jump) { 3927 min1= trie->minlen; 3928 max1= trie->maxlen; 3929 } else { 3930 const regnode *nextbranch= NULL; 3931 U32 word; 3932 3933 for ( word=1 ; word <= trie->wordcount ; word++) 3934 { 3935 I32 deltanext=0, minnext=0, f = 0, fake; 3936 struct regnode_charclass_class this_class; 3937 3938 data_fake.flags = 0; 3939 if (data) { 3940 data_fake.whilem_c = data->whilem_c; 3941 data_fake.last_closep = data->last_closep; 3942 } 3943 else 3944 data_fake.last_closep = &fake; 3945 data_fake.pos_delta = delta; 3946 if (flags & SCF_DO_STCLASS) { 3947 cl_init(pRExC_state, &this_class); 3948 data_fake.start_class = &this_class; 3949 f = SCF_DO_STCLASS_AND; 3950 } 3951 if (flags & SCF_WHILEM_VISITED_POS) 3952 f |= SCF_WHILEM_VISITED_POS; 3953 3954 if (trie->jump[word]) { 3955 if (!nextbranch) 3956 nextbranch = trie_node + trie->jump[0]; 3957 scan= trie_node + trie->jump[word]; 3958 /* We go from the jump point to the branch that follows 3959 it. Note this means we need the vestigal unused branches 3960 even though they arent otherwise used. 3961 */ 3962 minnext = study_chunk(pRExC_state, &scan, minlenp, 3963 &deltanext, (regnode *)nextbranch, &data_fake, 3964 stopparen, recursed, NULL, f,depth+1); 3965 } 3966 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH) 3967 nextbranch= regnext((regnode*)nextbranch); 3968 3969 if (min1 > (I32)(minnext + trie->minlen)) 3970 min1 = minnext + trie->minlen; 3971 if (max1 < (I32)(minnext + deltanext + trie->maxlen)) 3972 max1 = minnext + deltanext + trie->maxlen; 3973 if (deltanext == I32_MAX) 3974 is_inf = is_inf_internal = 1; 3975 3976 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) 3977 pars++; 3978 if (data_fake.flags & SCF_SEEN_ACCEPT) { 3979 if ( stopmin > min + min1) 3980 stopmin = min + min1; 3981 flags &= ~SCF_DO_SUBSTR; 3982 if (data) 3983 data->flags |= SCF_SEEN_ACCEPT; 3984 } 3985 if (data) { 3986 if (data_fake.flags & SF_HAS_EVAL) 3987 data->flags |= SF_HAS_EVAL; 3988 data->whilem_c = data_fake.whilem_c; 3989 } 3990 if (flags & SCF_DO_STCLASS) 3991 cl_or(pRExC_state, &accum, &this_class); 3992 } 3993 } 3994 if (flags & SCF_DO_SUBSTR) { 3995 data->pos_min += min1; 3996 data->pos_delta += max1 - min1; 3997 if (max1 != min1 || is_inf) 3998 data->longest = &(data->longest_float); 3999 } 4000 min += min1; 4001 delta += max1 - min1; 4002 if (flags & SCF_DO_STCLASS_OR) { 4003 cl_or(pRExC_state, data->start_class, &accum); 4004 if (min1) { 4005 cl_and(data->start_class, and_withp); 4006 flags &= ~SCF_DO_STCLASS; 4007 } 4008 } 4009 else if (flags & SCF_DO_STCLASS_AND) { 4010 if (min1) { 4011 cl_and(data->start_class, &accum); 4012 flags &= ~SCF_DO_STCLASS; 4013 } 4014 else { 4015 /* Switch to OR mode: cache the old value of 4016 * data->start_class */ 4017 INIT_AND_WITHP; 4018 StructCopy(data->start_class, and_withp, 4019 struct regnode_charclass_class); 4020 flags &= ~SCF_DO_STCLASS_AND; 4021 StructCopy(&accum, data->start_class, 4022 struct regnode_charclass_class); 4023 flags |= SCF_DO_STCLASS_OR; 4024 data->start_class->flags |= ANYOF_EOS; 4025 } 4026 } 4027 scan= tail; 4028 continue; 4029 } 4030 #else 4031 else if (PL_regkind[OP(scan)] == TRIE) { 4032 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ]; 4033 U8*bang=NULL; 4034 4035 min += trie->minlen; 4036 delta += (trie->maxlen - trie->minlen); 4037 flags &= ~SCF_DO_STCLASS; /* xxx */ 4038 if (flags & SCF_DO_SUBSTR) { 4039 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */ 4040 data->pos_min += trie->minlen; 4041 data->pos_delta += (trie->maxlen - trie->minlen); 4042 if (trie->maxlen != trie->minlen) 4043 data->longest = &(data->longest_float); 4044 } 4045 if (trie->jump) /* no more substrings -- for now /grr*/ 4046 flags &= ~SCF_DO_SUBSTR; 4047 } 4048 #endif /* old or new */ 4049 #endif /* TRIE_STUDY_OPT */ 4050 4051 /* Else: zero-length, ignore. */ 4052 scan = regnext(scan); 4053 } 4054 if (frame) { 4055 last = frame->last; 4056 scan = frame->next; 4057 stopparen = frame->stop; 4058 frame = frame->prev; 4059 goto fake_study_recurse; 4060 } 4061 4062 finish: 4063 assert(!frame); 4064 DEBUG_STUDYDATA("pre-fin:",data,depth); 4065 4066 *scanp = scan; 4067 *deltap = is_inf_internal ? I32_MAX : delta; 4068 if (flags & SCF_DO_SUBSTR && is_inf) 4069 data->pos_delta = I32_MAX - data->pos_min; 4070 if (is_par > (I32)U8_MAX) 4071 is_par = 0; 4072 if (is_par && pars==1 && data) { 4073 data->flags |= SF_IN_PAR; 4074 data->flags &= ~SF_HAS_PAR; 4075 } 4076 else if (pars && data) { 4077 data->flags |= SF_HAS_PAR; 4078 data->flags &= ~SF_IN_PAR; 4079 } 4080 if (flags & SCF_DO_STCLASS_OR) 4081 cl_and(data->start_class, and_withp); 4082 if (flags & SCF_TRIE_RESTUDY) 4083 data->flags |= SCF_TRIE_RESTUDY; 4084 4085 DEBUG_STUDYDATA("post-fin:",data,depth); 4086 4087 return min < stopmin ? min : stopmin; 4088 } 4089 4090 STATIC U32 4091 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s) 4092 { 4093 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0; 4094 4095 PERL_ARGS_ASSERT_ADD_DATA; 4096 4097 Renewc(RExC_rxi->data, 4098 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1), 4099 char, struct reg_data); 4100 if(count) 4101 Renew(RExC_rxi->data->what, count + n, U8); 4102 else 4103 Newx(RExC_rxi->data->what, n, U8); 4104 RExC_rxi->data->count = count + n; 4105 Copy(s, RExC_rxi->data->what + count, n, U8); 4106 return count; 4107 } 4108 4109 /*XXX: todo make this not included in a non debugging perl */ 4110 #ifndef PERL_IN_XSUB_RE 4111 void 4112 Perl_reginitcolors(pTHX) 4113 { 4114 dVAR; 4115 const char * const s = PerlEnv_getenv("PERL_RE_COLORS"); 4116 if (s) { 4117 char *t = savepv(s); 4118 int i = 0; 4119 PL_colors[0] = t; 4120 while (++i < 6) { 4121 t = strchr(t, '\t'); 4122 if (t) { 4123 *t = '\0'; 4124 PL_colors[i] = ++t; 4125 } 4126 else 4127 PL_colors[i] = t = (char *)""; 4128 } 4129 } else { 4130 int i = 0; 4131 while (i < 6) 4132 PL_colors[i++] = (char *)""; 4133 } 4134 PL_colorset = 1; 4135 } 4136 #endif 4137 4138 4139 #ifdef TRIE_STUDY_OPT 4140 #define CHECK_RESTUDY_GOTO \ 4141 if ( \ 4142 (data.flags & SCF_TRIE_RESTUDY) \ 4143 && ! restudied++ \ 4144 ) goto reStudy 4145 #else 4146 #define CHECK_RESTUDY_GOTO 4147 #endif 4148 4149 /* 4150 - pregcomp - compile a regular expression into internal code 4151 * 4152 * We can't allocate space until we know how big the compiled form will be, 4153 * but we can't compile it (and thus know how big it is) until we've got a 4154 * place to put the code. So we cheat: we compile it twice, once with code 4155 * generation turned off and size counting turned on, and once "for real". 4156 * This also means that we don't allocate space until we are sure that the 4157 * thing really will compile successfully, and we never have to move the 4158 * code and thus invalidate pointers into it. (Note that it has to be in 4159 * one piece because free() must be able to free it all.) [NB: not true in perl] 4160 * 4161 * Beware that the optimization-preparation code in here knows about some 4162 * of the structure of the compiled regexp. [I'll say.] 4163 */ 4164 4165 4166 4167 #ifndef PERL_IN_XSUB_RE 4168 #define RE_ENGINE_PTR &PL_core_reg_engine 4169 #else 4170 extern const struct regexp_engine my_reg_engine; 4171 #define RE_ENGINE_PTR &my_reg_engine 4172 #endif 4173 4174 #ifndef PERL_IN_XSUB_RE 4175 REGEXP * 4176 Perl_pregcomp(pTHX_ const SV * const pattern, const U32 flags) 4177 { 4178 dVAR; 4179 HV * const table = GvHV(PL_hintgv); 4180 4181 PERL_ARGS_ASSERT_PREGCOMP; 4182 4183 /* Dispatch a request to compile a regexp to correct 4184 regexp engine. */ 4185 if (table) { 4186 SV **ptr= hv_fetchs(table, "regcomp", FALSE); 4187 GET_RE_DEBUG_FLAGS_DECL; 4188 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) { 4189 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr)); 4190 DEBUG_COMPILE_r({ 4191 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n", 4192 SvIV(*ptr)); 4193 }); 4194 return CALLREGCOMP_ENG(eng, pattern, flags); 4195 } 4196 } 4197 return Perl_re_compile(aTHX_ pattern, flags); 4198 } 4199 #endif 4200 4201 REGEXP * 4202 Perl_re_compile(pTHX_ const SV * const pattern, const U32 pm_flags) 4203 { 4204 dVAR; 4205 register REGEXP *r; 4206 register regexp_internal *ri; 4207 STRLEN plen; 4208 char* exp = SvPV((SV*)pattern, plen); 4209 char* xend = exp + plen; 4210 regnode *scan; 4211 I32 flags; 4212 I32 minlen = 0; 4213 I32 sawplus = 0; 4214 I32 sawopen = 0; 4215 scan_data_t data; 4216 RExC_state_t RExC_state; 4217 RExC_state_t * const pRExC_state = &RExC_state; 4218 #ifdef TRIE_STUDY_OPT 4219 int restudied= 0; 4220 RExC_state_t copyRExC_state; 4221 #endif 4222 GET_RE_DEBUG_FLAGS_DECL; 4223 4224 PERL_ARGS_ASSERT_RE_COMPILE; 4225 4226 DEBUG_r(if (!PL_colorset) reginitcolors()); 4227 4228 RExC_utf8 = RExC_orig_utf8 = pm_flags & RXf_UTF8; 4229 4230 DEBUG_COMPILE_r({ 4231 SV *dsv= sv_newmortal(); 4232 RE_PV_QUOTED_DECL(s, RExC_utf8, 4233 dsv, exp, plen, 60); 4234 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n", 4235 PL_colors[4],PL_colors[5],s); 4236 }); 4237 4238 redo_first_pass: 4239 RExC_precomp = exp; 4240 RExC_flags = pm_flags; 4241 RExC_sawback = 0; 4242 4243 RExC_seen = 0; 4244 RExC_seen_zerolen = *exp == '^' ? -1 : 0; 4245 RExC_seen_evals = 0; 4246 RExC_extralen = 0; 4247 4248 /* First pass: determine size, legality. */ 4249 RExC_parse = exp; 4250 RExC_start = exp; 4251 RExC_end = xend; 4252 RExC_naughty = 0; 4253 RExC_npar = 1; 4254 RExC_nestroot = 0; 4255 RExC_size = 0L; 4256 RExC_emit = &PL_regdummy; 4257 RExC_whilem_seen = 0; 4258 RExC_charnames = NULL; 4259 RExC_open_parens = NULL; 4260 RExC_close_parens = NULL; 4261 RExC_opend = NULL; 4262 RExC_paren_names = NULL; 4263 #ifdef DEBUGGING 4264 RExC_paren_name_list = NULL; 4265 #endif 4266 RExC_recurse = NULL; 4267 RExC_recurse_count = 0; 4268 4269 #if 0 /* REGC() is (currently) a NOP at the first pass. 4270 * Clever compilers notice this and complain. --jhi */ 4271 REGC((U8)REG_MAGIC, (char*)RExC_emit); 4272 #endif 4273 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n")); 4274 if (reg(pRExC_state, 0, &flags,1) == NULL) { 4275 RExC_precomp = NULL; 4276 return(NULL); 4277 } 4278 if (RExC_utf8 && !RExC_orig_utf8) { 4279 /* It's possible to write a regexp in ascii that represents Unicode 4280 codepoints outside of the byte range, such as via \x{100}. If we 4281 detect such a sequence we have to convert the entire pattern to utf8 4282 and then recompile, as our sizing calculation will have been based 4283 on 1 byte == 1 character, but we will need to use utf8 to encode 4284 at least some part of the pattern, and therefore must convert the whole 4285 thing. 4286 XXX: somehow figure out how to make this less expensive... 4287 -- dmq */ 4288 STRLEN len = plen; 4289 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, 4290 "UTF8 mismatch! Converting to utf8 for resizing and compile\n")); 4291 exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len); 4292 xend = exp + len; 4293 RExC_orig_utf8 = RExC_utf8; 4294 SAVEFREEPV(exp); 4295 goto redo_first_pass; 4296 } 4297 DEBUG_PARSE_r({ 4298 PerlIO_printf(Perl_debug_log, 4299 "Required size %"IVdf" nodes\n" 4300 "Starting second pass (creation)\n", 4301 (IV)RExC_size); 4302 RExC_lastnum=0; 4303 RExC_lastparse=NULL; 4304 }); 4305 /* Small enough for pointer-storage convention? 4306 If extralen==0, this means that we will not need long jumps. */ 4307 if (RExC_size >= 0x10000L && RExC_extralen) 4308 RExC_size += RExC_extralen; 4309 else 4310 RExC_extralen = 0; 4311 if (RExC_whilem_seen > 15) 4312 RExC_whilem_seen = 15; 4313 4314 /* Allocate space and zero-initialize. Note, the two step process 4315 of zeroing when in debug mode, thus anything assigned has to 4316 happen after that */ 4317 Newxz(r, 1, regexp); 4318 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), 4319 char, regexp_internal); 4320 if ( r == NULL || ri == NULL ) 4321 FAIL("Regexp out of space"); 4322 #ifdef DEBUGGING 4323 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */ 4324 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char); 4325 #else 4326 /* bulk initialize base fields with 0. */ 4327 Zero(ri, sizeof(regexp_internal), char); 4328 #endif 4329 4330 /* non-zero initialization begins here */ 4331 RXi_SET( r, ri ); 4332 r->engine= RE_ENGINE_PTR; 4333 r->refcnt = 1; 4334 RX_PRELEN(r) = plen; 4335 r->extflags = pm_flags; 4336 { 4337 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY); 4338 bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD); 4339 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT); 4340 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD) 4341 >> RXf_PMf_STD_PMMOD_SHIFT); 4342 const char *fptr = STD_PAT_MODS; /*"msix"*/ 4343 char *p; 4344 RX_WRAPLEN(r) = plen + has_minus + has_p + has_runon 4345 + (sizeof(STD_PAT_MODS) - 1) 4346 + (sizeof("(?:)") - 1); 4347 4348 Newx(RX_WRAPPED(r), RX_WRAPLEN(r) + 1, char ); 4349 p = RX_WRAPPED(r); 4350 *p++='('; *p++='?'; 4351 if (has_p) 4352 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/ 4353 { 4354 char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1; 4355 char *colon = r + 1; 4356 char ch; 4357 4358 while((ch = *fptr++)) { 4359 if(reganch & 1) 4360 *p++ = ch; 4361 else 4362 *r-- = ch; 4363 reganch >>= 1; 4364 } 4365 if(has_minus) { 4366 *r = '-'; 4367 p = colon; 4368 } 4369 } 4370 4371 *p++ = ':'; 4372 Copy(RExC_precomp, p, plen, char); 4373 RX_PRECOMP(r) = p; 4374 p += plen; 4375 if (has_runon) 4376 *p++ = '\n'; 4377 *p++ = ')'; 4378 *p = 0; 4379 } 4380 4381 r->intflags = 0; 4382 r->nparens = RExC_npar - 1; /* set early to validate backrefs */ 4383 4384 if (RExC_seen & REG_SEEN_RECURSE) { 4385 Newxz(RExC_open_parens, RExC_npar,regnode *); 4386 SAVEFREEPV(RExC_open_parens); 4387 Newxz(RExC_close_parens,RExC_npar,regnode *); 4388 SAVEFREEPV(RExC_close_parens); 4389 } 4390 4391 /* Useful during FAIL. */ 4392 #ifdef RE_TRACK_PATTERN_OFFSETS 4393 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */ 4394 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log, 4395 "%s %"UVuf" bytes for offset annotations.\n", 4396 ri->u.offsets ? "Got" : "Couldn't get", 4397 (UV)((2*RExC_size+1) * sizeof(U32)))); 4398 #endif 4399 SetProgLen(ri,RExC_size); 4400 RExC_rx = r; 4401 RExC_rxi = ri; 4402 4403 /* Second pass: emit code. */ 4404 RExC_flags = pm_flags; /* don't let top level (?i) bleed */ 4405 RExC_parse = exp; 4406 RExC_end = xend; 4407 RExC_naughty = 0; 4408 RExC_npar = 1; 4409 RExC_emit_start = ri->program; 4410 RExC_emit = ri->program; 4411 RExC_emit_bound = ri->program + RExC_size + 1; 4412 4413 /* Store the count of eval-groups for security checks: */ 4414 RExC_rx->seen_evals = RExC_seen_evals; 4415 REGC((U8)REG_MAGIC, (char*) RExC_emit++); 4416 if (reg(pRExC_state, 0, &flags,1) == NULL) { 4417 ReREFCNT_dec(r); 4418 return(NULL); 4419 } 4420 /* XXXX To minimize changes to RE engine we always allocate 4421 3-units-long substrs field. */ 4422 Newx(r->substrs, 1, struct reg_substr_data); 4423 if (RExC_recurse_count) { 4424 Newxz(RExC_recurse,RExC_recurse_count,regnode *); 4425 SAVEFREEPV(RExC_recurse); 4426 } 4427 4428 reStudy: 4429 r->minlen = minlen = sawplus = sawopen = 0; 4430 Zero(r->substrs, 1, struct reg_substr_data); 4431 4432 #ifdef TRIE_STUDY_OPT 4433 if (!restudied) { 4434 StructCopy(&zero_scan_data, &data, scan_data_t); 4435 copyRExC_state = RExC_state; 4436 } else { 4437 U32 seen=RExC_seen; 4438 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n")); 4439 4440 RExC_state = copyRExC_state; 4441 if (seen & REG_TOP_LEVEL_BRANCHES) 4442 RExC_seen |= REG_TOP_LEVEL_BRANCHES; 4443 else 4444 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES; 4445 if (data.last_found) { 4446 SvREFCNT_dec(data.longest_fixed); 4447 SvREFCNT_dec(data.longest_float); 4448 SvREFCNT_dec(data.last_found); 4449 } 4450 StructCopy(&zero_scan_data, &data, scan_data_t); 4451 } 4452 #else 4453 StructCopy(&zero_scan_data, &data, scan_data_t); 4454 #endif 4455 4456 /* Dig out information for optimizations. */ 4457 r->extflags = RExC_flags; /* was pm_op */ 4458 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */ 4459 4460 if (UTF) 4461 r->extflags |= RXf_UTF8; /* Unicode in it? */ 4462 ri->regstclass = NULL; 4463 if (RExC_naughty >= 10) /* Probably an expensive pattern. */ 4464 r->intflags |= PREGf_NAUGHTY; 4465 scan = ri->program + 1; /* First BRANCH. */ 4466 4467 /* testing for BRANCH here tells us whether there is "must appear" 4468 data in the pattern. If there is then we can use it for optimisations */ 4469 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */ 4470 I32 fake; 4471 STRLEN longest_float_length, longest_fixed_length; 4472 struct regnode_charclass_class ch_class; /* pointed to by data */ 4473 int stclass_flag; 4474 I32 last_close = 0; /* pointed to by data */ 4475 regnode *first= scan; 4476 regnode *first_next= regnext(first); 4477 4478 /* 4479 * Skip introductions and multiplicators >= 1 4480 * so that we can extract the 'meat' of the pattern that must 4481 * match in the large if() sequence following. 4482 * NOTE that EXACT is NOT covered here, as it is normally 4483 * picked up by the optimiser separately. 4484 * 4485 * This is unfortunate as the optimiser isnt handling lookahead 4486 * properly currently. 4487 * 4488 */ 4489 while ((OP(first) == OPEN && (sawopen = 1)) || 4490 /* An OR of *one* alternative - should not happen now. */ 4491 (OP(first) == BRANCH && OP(first_next) != BRANCH) || 4492 /* for now we can't handle lookbehind IFMATCH*/ 4493 (OP(first) == IFMATCH && !first->flags) || 4494 (OP(first) == PLUS) || 4495 (OP(first) == MINMOD) || 4496 /* An {n,m} with n>0 */ 4497 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) || 4498 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END )) 4499 { 4500 /* 4501 * the only op that could be a regnode is PLUS, all the rest 4502 * will be regnode_1 or regnode_2. 4503 * 4504 */ 4505 if (OP(first) == PLUS) 4506 sawplus = 1; 4507 else 4508 first += regarglen[OP(first)]; 4509 4510 first = NEXTOPER(first); 4511 first_next= regnext(first); 4512 } 4513 4514 /* Starting-point info. */ 4515 again: 4516 DEBUG_PEEP("first:",first,0); 4517 /* Ignore EXACT as we deal with it later. */ 4518 if (PL_regkind[OP(first)] == EXACT) { 4519 if (OP(first) == EXACT) 4520 NOOP; /* Empty, get anchored substr later. */ 4521 else if ((OP(first) == EXACTF || OP(first) == EXACTFL)) 4522 ri->regstclass = first; 4523 } 4524 #ifdef TRIE_STCLASS 4525 else if (PL_regkind[OP(first)] == TRIE && 4526 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0) 4527 { 4528 regnode *trie_op; 4529 /* this can happen only on restudy */ 4530 if ( OP(first) == TRIE ) { 4531 struct regnode_1 *trieop = (struct regnode_1 *) 4532 PerlMemShared_calloc(1, sizeof(struct regnode_1)); 4533 StructCopy(first,trieop,struct regnode_1); 4534 trie_op=(regnode *)trieop; 4535 } else { 4536 struct regnode_charclass *trieop = (struct regnode_charclass *) 4537 PerlMemShared_calloc(1, sizeof(struct regnode_charclass)); 4538 StructCopy(first,trieop,struct regnode_charclass); 4539 trie_op=(regnode *)trieop; 4540 } 4541 OP(trie_op)+=2; 4542 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0); 4543 ri->regstclass = trie_op; 4544 } 4545 #endif 4546 else if (strchr((const char*)PL_simple,OP(first))) 4547 ri->regstclass = first; 4548 else if (PL_regkind[OP(first)] == BOUND || 4549 PL_regkind[OP(first)] == NBOUND) 4550 ri->regstclass = first; 4551 else if (PL_regkind[OP(first)] == BOL) { 4552 r->extflags |= (OP(first) == MBOL 4553 ? RXf_ANCH_MBOL 4554 : (OP(first) == SBOL 4555 ? RXf_ANCH_SBOL 4556 : RXf_ANCH_BOL)); 4557 first = NEXTOPER(first); 4558 goto again; 4559 } 4560 else if (OP(first) == GPOS) { 4561 r->extflags |= RXf_ANCH_GPOS; 4562 first = NEXTOPER(first); 4563 goto again; 4564 } 4565 else if ((!sawopen || !RExC_sawback) && 4566 (OP(first) == STAR && 4567 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) && 4568 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL)) 4569 { 4570 /* turn .* into ^.* with an implied $*=1 */ 4571 const int type = 4572 (OP(NEXTOPER(first)) == REG_ANY) 4573 ? RXf_ANCH_MBOL 4574 : RXf_ANCH_SBOL; 4575 r->extflags |= type; 4576 r->intflags |= PREGf_IMPLICIT; 4577 first = NEXTOPER(first); 4578 goto again; 4579 } 4580 if (sawplus && (!sawopen || !RExC_sawback) 4581 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */ 4582 /* x+ must match at the 1st pos of run of x's */ 4583 r->intflags |= PREGf_SKIP; 4584 4585 /* Scan is after the zeroth branch, first is atomic matcher. */ 4586 #ifdef TRIE_STUDY_OPT 4587 DEBUG_PARSE_r( 4588 if (!restudied) 4589 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n", 4590 (IV)(first - scan + 1)) 4591 ); 4592 #else 4593 DEBUG_PARSE_r( 4594 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n", 4595 (IV)(first - scan + 1)) 4596 ); 4597 #endif 4598 4599 4600 /* 4601 * If there's something expensive in the r.e., find the 4602 * longest literal string that must appear and make it the 4603 * regmust. Resolve ties in favor of later strings, since 4604 * the regstart check works with the beginning of the r.e. 4605 * and avoiding duplication strengthens checking. Not a 4606 * strong reason, but sufficient in the absence of others. 4607 * [Now we resolve ties in favor of the earlier string if 4608 * it happens that c_offset_min has been invalidated, since the 4609 * earlier string may buy us something the later one won't.] 4610 */ 4611 4612 data.longest_fixed = newSVpvs(""); 4613 data.longest_float = newSVpvs(""); 4614 data.last_found = newSVpvs(""); 4615 data.longest = &(data.longest_fixed); 4616 first = scan; 4617 if (!ri->regstclass) { 4618 cl_init(pRExC_state, &ch_class); 4619 data.start_class = &ch_class; 4620 stclass_flag = SCF_DO_STCLASS_AND; 4621 } else /* XXXX Check for BOUND? */ 4622 stclass_flag = 0; 4623 data.last_closep = &last_close; 4624 4625 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */ 4626 &data, -1, NULL, NULL, 4627 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0); 4628 4629 4630 CHECK_RESTUDY_GOTO; 4631 4632 4633 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed) 4634 && data.last_start_min == 0 && data.last_end > 0 4635 && !RExC_seen_zerolen 4636 && !(RExC_seen & REG_SEEN_VERBARG) 4637 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS))) 4638 r->extflags |= RXf_CHECK_ALL; 4639 scan_commit(pRExC_state, &data,&minlen,0); 4640 SvREFCNT_dec(data.last_found); 4641 4642 /* Note that code very similar to this but for anchored string 4643 follows immediately below, changes may need to be made to both. 4644 Be careful. 4645 */ 4646 longest_float_length = CHR_SVLEN(data.longest_float); 4647 if (longest_float_length 4648 || (data.flags & SF_FL_BEFORE_EOL 4649 && (!(data.flags & SF_FL_BEFORE_MEOL) 4650 || (RExC_flags & RXf_PMf_MULTILINE)))) 4651 { 4652 I32 t,ml; 4653 4654 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */ 4655 && data.offset_fixed == data.offset_float_min 4656 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)) 4657 goto remove_float; /* As in (a)+. */ 4658 4659 /* copy the information about the longest float from the reg_scan_data 4660 over to the program. */ 4661 if (SvUTF8(data.longest_float)) { 4662 r->float_utf8 = data.longest_float; 4663 r->float_substr = NULL; 4664 } else { 4665 r->float_substr = data.longest_float; 4666 r->float_utf8 = NULL; 4667 } 4668 /* float_end_shift is how many chars that must be matched that 4669 follow this item. We calculate it ahead of time as once the 4670 lookbehind offset is added in we lose the ability to correctly 4671 calculate it.*/ 4672 ml = data.minlen_float ? *(data.minlen_float) 4673 : (I32)longest_float_length; 4674 r->float_end_shift = ml - data.offset_float_min 4675 - longest_float_length + (SvTAIL(data.longest_float) != 0) 4676 + data.lookbehind_float; 4677 r->float_min_offset = data.offset_float_min - data.lookbehind_float; 4678 r->float_max_offset = data.offset_float_max; 4679 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */ 4680 r->float_max_offset -= data.lookbehind_float; 4681 4682 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */ 4683 && (!(data.flags & SF_FL_BEFORE_MEOL) 4684 || (RExC_flags & RXf_PMf_MULTILINE))); 4685 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0); 4686 } 4687 else { 4688 remove_float: 4689 r->float_substr = r->float_utf8 = NULL; 4690 SvREFCNT_dec(data.longest_float); 4691 longest_float_length = 0; 4692 } 4693 4694 /* Note that code very similar to this but for floating string 4695 is immediately above, changes may need to be made to both. 4696 Be careful. 4697 */ 4698 longest_fixed_length = CHR_SVLEN(data.longest_fixed); 4699 if (longest_fixed_length 4700 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */ 4701 && (!(data.flags & SF_FIX_BEFORE_MEOL) 4702 || (RExC_flags & RXf_PMf_MULTILINE)))) 4703 { 4704 I32 t,ml; 4705 4706 /* copy the information about the longest fixed 4707 from the reg_scan_data over to the program. */ 4708 if (SvUTF8(data.longest_fixed)) { 4709 r->anchored_utf8 = data.longest_fixed; 4710 r->anchored_substr = NULL; 4711 } else { 4712 r->anchored_substr = data.longest_fixed; 4713 r->anchored_utf8 = NULL; 4714 } 4715 /* fixed_end_shift is how many chars that must be matched that 4716 follow this item. We calculate it ahead of time as once the 4717 lookbehind offset is added in we lose the ability to correctly 4718 calculate it.*/ 4719 ml = data.minlen_fixed ? *(data.minlen_fixed) 4720 : (I32)longest_fixed_length; 4721 r->anchored_end_shift = ml - data.offset_fixed 4722 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0) 4723 + data.lookbehind_fixed; 4724 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed; 4725 4726 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */ 4727 && (!(data.flags & SF_FIX_BEFORE_MEOL) 4728 || (RExC_flags & RXf_PMf_MULTILINE))); 4729 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0); 4730 } 4731 else { 4732 r->anchored_substr = r->anchored_utf8 = NULL; 4733 SvREFCNT_dec(data.longest_fixed); 4734 longest_fixed_length = 0; 4735 } 4736 if (ri->regstclass 4737 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY)) 4738 ri->regstclass = NULL; 4739 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset) 4740 && stclass_flag 4741 && !(data.start_class->flags & ANYOF_EOS) 4742 && !cl_is_anything(data.start_class)) 4743 { 4744 const U32 n = add_data(pRExC_state, 1, "f"); 4745 4746 Newx(RExC_rxi->data->data[n], 1, 4747 struct regnode_charclass_class); 4748 StructCopy(data.start_class, 4749 (struct regnode_charclass_class*)RExC_rxi->data->data[n], 4750 struct regnode_charclass_class); 4751 ri->regstclass = (regnode*)RExC_rxi->data->data[n]; 4752 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */ 4753 DEBUG_COMPILE_r({ SV *sv = sv_newmortal(); 4754 regprop(r, sv, (regnode*)data.start_class); 4755 PerlIO_printf(Perl_debug_log, 4756 "synthetic stclass \"%s\".\n", 4757 SvPVX_const(sv));}); 4758 } 4759 4760 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */ 4761 if (longest_fixed_length > longest_float_length) { 4762 r->check_end_shift = r->anchored_end_shift; 4763 r->check_substr = r->anchored_substr; 4764 r->check_utf8 = r->anchored_utf8; 4765 r->check_offset_min = r->check_offset_max = r->anchored_offset; 4766 if (r->extflags & RXf_ANCH_SINGLE) 4767 r->extflags |= RXf_NOSCAN; 4768 } 4769 else { 4770 r->check_end_shift = r->float_end_shift; 4771 r->check_substr = r->float_substr; 4772 r->check_utf8 = r->float_utf8; 4773 r->check_offset_min = r->float_min_offset; 4774 r->check_offset_max = r->float_max_offset; 4775 } 4776 /* XXXX Currently intuiting is not compatible with ANCH_GPOS. 4777 This should be changed ASAP! */ 4778 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) { 4779 r->extflags |= RXf_USE_INTUIT; 4780 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8)) 4781 r->extflags |= RXf_INTUIT_TAIL; 4782 } 4783 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere) 4784 if ( (STRLEN)minlen < longest_float_length ) 4785 minlen= longest_float_length; 4786 if ( (STRLEN)minlen < longest_fixed_length ) 4787 minlen= longest_fixed_length; 4788 */ 4789 } 4790 else { 4791 /* Several toplevels. Best we can is to set minlen. */ 4792 I32 fake; 4793 struct regnode_charclass_class ch_class; 4794 I32 last_close = 0; 4795 4796 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n")); 4797 4798 scan = ri->program + 1; 4799 cl_init(pRExC_state, &ch_class); 4800 data.start_class = &ch_class; 4801 data.last_closep = &last_close; 4802 4803 4804 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size, 4805 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0); 4806 4807 CHECK_RESTUDY_GOTO; 4808 4809 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8 4810 = r->float_substr = r->float_utf8 = NULL; 4811 if (!(data.start_class->flags & ANYOF_EOS) 4812 && !cl_is_anything(data.start_class)) 4813 { 4814 const U32 n = add_data(pRExC_state, 1, "f"); 4815 4816 Newx(RExC_rxi->data->data[n], 1, 4817 struct regnode_charclass_class); 4818 StructCopy(data.start_class, 4819 (struct regnode_charclass_class*)RExC_rxi->data->data[n], 4820 struct regnode_charclass_class); 4821 ri->regstclass = (regnode*)RExC_rxi->data->data[n]; 4822 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */ 4823 DEBUG_COMPILE_r({ SV* sv = sv_newmortal(); 4824 regprop(r, sv, (regnode*)data.start_class); 4825 PerlIO_printf(Perl_debug_log, 4826 "synthetic stclass \"%s\".\n", 4827 SvPVX_const(sv));}); 4828 } 4829 } 4830 4831 /* Guard against an embedded (?=) or (?<=) with a longer minlen than 4832 the "real" pattern. */ 4833 DEBUG_OPTIMISE_r({ 4834 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n", 4835 (IV)minlen, (IV)r->minlen); 4836 }); 4837 r->minlenret = minlen; 4838 if (r->minlen < minlen) 4839 r->minlen = minlen; 4840 4841 if (RExC_seen & REG_SEEN_GPOS) 4842 r->extflags |= RXf_GPOS_SEEN; 4843 if (RExC_seen & REG_SEEN_LOOKBEHIND) 4844 r->extflags |= RXf_LOOKBEHIND_SEEN; 4845 if (RExC_seen & REG_SEEN_EVAL) 4846 r->extflags |= RXf_EVAL_SEEN; 4847 if (RExC_seen & REG_SEEN_CANY) 4848 r->extflags |= RXf_CANY_SEEN; 4849 if (RExC_seen & REG_SEEN_VERBARG) 4850 r->intflags |= PREGf_VERBARG_SEEN; 4851 if (RExC_seen & REG_SEEN_CUTGROUP) 4852 r->intflags |= PREGf_CUTGROUP_SEEN; 4853 if (RExC_paren_names) 4854 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names)); 4855 else 4856 RXp_PAREN_NAMES(r) = NULL; 4857 4858 #ifdef STUPID_PATTERN_CHECKS 4859 if (RX_PRELEN(r) == 0) 4860 r->extflags |= RXf_NULL; 4861 if (r->extflags & RXf_SPLIT && RX_PRELEN(r) == 1 && RX_PRECOMP(r)[0] == ' ') 4862 /* XXX: this should happen BEFORE we compile */ 4863 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); 4864 else if (RX_PRELEN(r) == 3 && memEQ("\\s+", RX_PRECOMP(r), 3)) 4865 r->extflags |= RXf_WHITE; 4866 else if (RX_PRELEN(r) == 1 && RXp_PRECOMP(r)[0] == '^') 4867 r->extflags |= RXf_START_ONLY; 4868 #else 4869 if (r->extflags & RXf_SPLIT && RX_PRELEN(r) == 1 && RX_PRECOMP(r)[0] == ' ') 4870 /* XXX: this should happen BEFORE we compile */ 4871 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); 4872 else { 4873 regnode *first = ri->program + 1; 4874 U8 fop = OP(first); 4875 U8 nop = OP(NEXTOPER(first)); 4876 4877 if (PL_regkind[fop] == NOTHING && nop == END) 4878 r->extflags |= RXf_NULL; 4879 else if (PL_regkind[fop] == BOL && nop == END) 4880 r->extflags |= RXf_START_ONLY; 4881 else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END) 4882 r->extflags |= RXf_WHITE; 4883 } 4884 #endif 4885 #ifdef DEBUGGING 4886 if (RExC_paren_names) { 4887 ri->name_list_idx = add_data( pRExC_state, 1, "p" ); 4888 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list); 4889 } else 4890 #endif 4891 ri->name_list_idx = 0; 4892 4893 if (RExC_recurse_count) { 4894 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) { 4895 const regnode *scan = RExC_recurse[RExC_recurse_count-1]; 4896 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan ); 4897 } 4898 } 4899 Newxz(r->offs, RExC_npar, regexp_paren_pair); 4900 /* assume we don't need to swap parens around before we match */ 4901 4902 DEBUG_DUMP_r({ 4903 PerlIO_printf(Perl_debug_log,"Final program:\n"); 4904 regdump(r); 4905 }); 4906 #ifdef RE_TRACK_PATTERN_OFFSETS 4907 DEBUG_OFFSETS_r(if (ri->u.offsets) { 4908 const U32 len = ri->u.offsets[0]; 4909 U32 i; 4910 GET_RE_DEBUG_FLAGS_DECL; 4911 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]); 4912 for (i = 1; i <= len; i++) { 4913 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2]) 4914 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ", 4915 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]); 4916 } 4917 PerlIO_printf(Perl_debug_log, "\n"); 4918 }); 4919 #endif 4920 return(r); 4921 } 4922 4923 #undef RE_ENGINE_PTR 4924 4925 4926 SV* 4927 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value, 4928 const U32 flags) 4929 { 4930 PERL_ARGS_ASSERT_REG_NAMED_BUFF; 4931 4932 PERL_UNUSED_ARG(value); 4933 4934 if (flags & RXapif_FETCH) { 4935 return reg_named_buff_fetch(rx, key, flags); 4936 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) { 4937 Perl_croak(aTHX_ "%s", PL_no_modify); 4938 return NULL; 4939 } else if (flags & RXapif_EXISTS) { 4940 return reg_named_buff_exists(rx, key, flags) 4941 ? &PL_sv_yes 4942 : &PL_sv_no; 4943 } else if (flags & RXapif_REGNAMES) { 4944 return reg_named_buff_all(rx, flags); 4945 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) { 4946 return reg_named_buff_scalar(rx, flags); 4947 } else { 4948 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags); 4949 return NULL; 4950 } 4951 } 4952 4953 SV* 4954 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey, 4955 const U32 flags) 4956 { 4957 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER; 4958 PERL_UNUSED_ARG(lastkey); 4959 4960 if (flags & RXapif_FIRSTKEY) 4961 return reg_named_buff_firstkey(rx, flags); 4962 else if (flags & RXapif_NEXTKEY) 4963 return reg_named_buff_nextkey(rx, flags); 4964 else { 4965 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags); 4966 return NULL; 4967 } 4968 } 4969 4970 SV* 4971 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const rx, SV * const namesv, const U32 flags) 4972 { 4973 AV *retarray = NULL; 4974 SV *ret; 4975 4976 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH; 4977 4978 if (flags & RXapif_ALL) 4979 retarray=newAV(); 4980 4981 if (rx && RXp_PAREN_NAMES(rx)) { 4982 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 ); 4983 if (he_str) { 4984 IV i; 4985 SV* sv_dat=HeVAL(he_str); 4986 I32 *nums=(I32*)SvPVX(sv_dat); 4987 for ( i=0; i<SvIVX(sv_dat); i++ ) { 4988 if ((I32)(rx->nparens) >= nums[i] 4989 && rx->offs[nums[i]].start != -1 4990 && rx->offs[nums[i]].end != -1) 4991 { 4992 ret = newSVpvs(""); 4993 CALLREG_NUMBUF_FETCH(rx,nums[i],ret); 4994 if (!retarray) 4995 return ret; 4996 } else { 4997 ret = newSVsv(&PL_sv_undef); 4998 } 4999 if (retarray) 5000 av_push(retarray, ret); 5001 } 5002 if (retarray) 5003 return newRV_noinc(MUTABLE_SV(retarray)); 5004 } 5005 } 5006 return NULL; 5007 } 5008 5009 bool 5010 Perl_reg_named_buff_exists(pTHX_ REGEXP * const rx, SV * const key, 5011 const U32 flags) 5012 { 5013 5014 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS; 5015 5016 if (rx && RXp_PAREN_NAMES(rx)) { 5017 if (flags & RXapif_ALL) { 5018 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0); 5019 } else { 5020 SV *sv = CALLREG_NAMED_BUFF_FETCH(rx, key, flags); 5021 if (sv) { 5022 SvREFCNT_dec(sv); 5023 return TRUE; 5024 } else { 5025 return FALSE; 5026 } 5027 } 5028 } else { 5029 return FALSE; 5030 } 5031 } 5032 5033 SV* 5034 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const rx, const U32 flags) 5035 { 5036 5037 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY; 5038 5039 if ( rx && RXp_PAREN_NAMES(rx) ) { 5040 (void)hv_iterinit(RXp_PAREN_NAMES(rx)); 5041 5042 return CALLREG_NAMED_BUFF_NEXTKEY(rx, NULL, flags & ~RXapif_FIRSTKEY); 5043 } else { 5044 return FALSE; 5045 } 5046 } 5047 5048 SV* 5049 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const rx, const U32 flags) 5050 { 5051 GET_RE_DEBUG_FLAGS_DECL; 5052 5053 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY; 5054 5055 if (rx && RXp_PAREN_NAMES(rx)) { 5056 HV *hv = RXp_PAREN_NAMES(rx); 5057 HE *temphe; 5058 while ( (temphe = hv_iternext_flags(hv,0)) ) { 5059 IV i; 5060 IV parno = 0; 5061 SV* sv_dat = HeVAL(temphe); 5062 I32 *nums = (I32*)SvPVX(sv_dat); 5063 for ( i = 0; i < SvIVX(sv_dat); i++ ) { 5064 if ((I32)(rx->lastparen) >= nums[i] && 5065 rx->offs[nums[i]].start != -1 && 5066 rx->offs[nums[i]].end != -1) 5067 { 5068 parno = nums[i]; 5069 break; 5070 } 5071 } 5072 if (parno || flags & RXapif_ALL) { 5073 return newSVhek(HeKEY_hek(temphe)); 5074 } 5075 } 5076 } 5077 return NULL; 5078 } 5079 5080 SV* 5081 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const rx, const U32 flags) 5082 { 5083 SV *ret; 5084 AV *av; 5085 I32 length; 5086 5087 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR; 5088 5089 if (rx && RXp_PAREN_NAMES(rx)) { 5090 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) { 5091 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx))); 5092 } else if (flags & RXapif_ONE) { 5093 ret = CALLREG_NAMED_BUFF_ALL(rx, (flags | RXapif_REGNAMES)); 5094 av = MUTABLE_AV(SvRV(ret)); 5095 length = av_len(av); 5096 SvREFCNT_dec(ret); 5097 return newSViv(length + 1); 5098 } else { 5099 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags); 5100 return NULL; 5101 } 5102 } 5103 return &PL_sv_undef; 5104 } 5105 5106 SV* 5107 Perl_reg_named_buff_all(pTHX_ REGEXP * const rx, const U32 flags) 5108 { 5109 AV *av = newAV(); 5110 5111 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL; 5112 5113 if (rx && RXp_PAREN_NAMES(rx)) { 5114 HV *hv= RXp_PAREN_NAMES(rx); 5115 HE *temphe; 5116 (void)hv_iterinit(hv); 5117 while ( (temphe = hv_iternext_flags(hv,0)) ) { 5118 IV i; 5119 IV parno = 0; 5120 SV* sv_dat = HeVAL(temphe); 5121 I32 *nums = (I32*)SvPVX(sv_dat); 5122 for ( i = 0; i < SvIVX(sv_dat); i++ ) { 5123 if ((I32)(rx->lastparen) >= nums[i] && 5124 rx->offs[nums[i]].start != -1 && 5125 rx->offs[nums[i]].end != -1) 5126 { 5127 parno = nums[i]; 5128 break; 5129 } 5130 } 5131 if (parno || flags & RXapif_ALL) { 5132 av_push(av, newSVhek(HeKEY_hek(temphe))); 5133 } 5134 } 5135 } 5136 5137 return newRV_noinc(MUTABLE_SV(av)); 5138 } 5139 5140 void 5141 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const rx, const I32 paren, SV * const sv) 5142 { 5143 char *s = NULL; 5144 I32 i = 0; 5145 I32 s1, t1; 5146 5147 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH; 5148 5149 if (!rx->subbeg) { 5150 sv_setsv(sv,&PL_sv_undef); 5151 return; 5152 } 5153 else 5154 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) { 5155 /* $` */ 5156 i = rx->offs[0].start; 5157 s = rx->subbeg; 5158 } 5159 else 5160 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) { 5161 /* $' */ 5162 s = rx->subbeg + rx->offs[0].end; 5163 i = rx->sublen - rx->offs[0].end; 5164 } 5165 else 5166 if ( 0 <= paren && paren <= (I32)rx->nparens && 5167 (s1 = rx->offs[paren].start) != -1 && 5168 (t1 = rx->offs[paren].end) != -1) 5169 { 5170 /* $& $1 ... */ 5171 i = t1 - s1; 5172 s = rx->subbeg + s1; 5173 } else { 5174 sv_setsv(sv,&PL_sv_undef); 5175 return; 5176 } 5177 assert(rx->sublen >= (s - rx->subbeg) + i ); 5178 if (i >= 0) { 5179 const int oldtainted = PL_tainted; 5180 TAINT_NOT; 5181 sv_setpvn(sv, s, i); 5182 PL_tainted = oldtainted; 5183 if ( (rx->extflags & RXf_CANY_SEEN) 5184 ? (RXp_MATCH_UTF8(rx) 5185 && (!i || is_utf8_string((U8*)s, i))) 5186 : (RXp_MATCH_UTF8(rx)) ) 5187 { 5188 SvUTF8_on(sv); 5189 } 5190 else 5191 SvUTF8_off(sv); 5192 if (PL_tainting) { 5193 if (RXp_MATCH_TAINTED(rx)) { 5194 if (SvTYPE(sv) >= SVt_PVMG) { 5195 MAGIC* const mg = SvMAGIC(sv); 5196 MAGIC* mgt; 5197 PL_tainted = 1; 5198 SvMAGIC_set(sv, mg->mg_moremagic); 5199 SvTAINT(sv); 5200 if ((mgt = SvMAGIC(sv))) { 5201 mg->mg_moremagic = mgt; 5202 SvMAGIC_set(sv, mg); 5203 } 5204 } else { 5205 PL_tainted = 1; 5206 SvTAINT(sv); 5207 } 5208 } else 5209 SvTAINTED_off(sv); 5210 } 5211 } else { 5212 sv_setsv(sv,&PL_sv_undef); 5213 return; 5214 } 5215 } 5216 5217 void 5218 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren, 5219 SV const * const value) 5220 { 5221 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE; 5222 5223 PERL_UNUSED_ARG(rx); 5224 PERL_UNUSED_ARG(paren); 5225 PERL_UNUSED_ARG(value); 5226 5227 if (!PL_localizing) 5228 Perl_croak(aTHX_ "%s", PL_no_modify); 5229 } 5230 5231 I32 5232 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const rx, const SV * const sv, 5233 const I32 paren) 5234 { 5235 I32 i; 5236 I32 s1, t1; 5237 5238 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH; 5239 5240 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */ 5241 switch (paren) { 5242 /* $` / ${^PREMATCH} */ 5243 case RX_BUFF_IDX_PREMATCH: 5244 if (rx->offs[0].start != -1) { 5245 i = rx->offs[0].start; 5246 if (i > 0) { 5247 s1 = 0; 5248 t1 = i; 5249 goto getlen; 5250 } 5251 } 5252 return 0; 5253 /* $' / ${^POSTMATCH} */ 5254 case RX_BUFF_IDX_POSTMATCH: 5255 if (rx->offs[0].end != -1) { 5256 i = rx->sublen - rx->offs[0].end; 5257 if (i > 0) { 5258 s1 = rx->offs[0].end; 5259 t1 = rx->sublen; 5260 goto getlen; 5261 } 5262 } 5263 return 0; 5264 /* $& / ${^MATCH}, $1, $2, ... */ 5265 default: 5266 if (paren <= (I32)rx->nparens && 5267 (s1 = rx->offs[paren].start) != -1 && 5268 (t1 = rx->offs[paren].end) != -1) 5269 { 5270 i = t1 - s1; 5271 goto getlen; 5272 } else { 5273 if (ckWARN(WARN_UNINITIALIZED)) 5274 report_uninit((SV *)sv); 5275 return 0; 5276 } 5277 } 5278 getlen: 5279 if (i > 0 && RXp_MATCH_UTF8(rx)) { 5280 const char * const s = rx->subbeg + s1; 5281 const U8 *ep; 5282 STRLEN el; 5283 5284 i = t1 - s1; 5285 if (is_utf8_string_loclen((U8*)s, i, &ep, &el)) 5286 i = el; 5287 } 5288 return i; 5289 } 5290 5291 SV* 5292 Perl_reg_qr_package(pTHX_ REGEXP * const rx) 5293 { 5294 PERL_ARGS_ASSERT_REG_QR_PACKAGE; 5295 PERL_UNUSED_ARG(rx); 5296 return newSVpvs("Regexp"); 5297 } 5298 5299 /* Scans the name of a named buffer from the pattern. 5300 * If flags is REG_RSN_RETURN_NULL returns null. 5301 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name 5302 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding 5303 * to the parsed name as looked up in the RExC_paren_names hash. 5304 * If there is an error throws a vFAIL().. type exception. 5305 */ 5306 5307 #define REG_RSN_RETURN_NULL 0 5308 #define REG_RSN_RETURN_NAME 1 5309 #define REG_RSN_RETURN_DATA 2 5310 5311 STATIC SV* 5312 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) 5313 { 5314 char *name_start = RExC_parse; 5315 5316 PERL_ARGS_ASSERT_REG_SCAN_NAME; 5317 5318 if (isIDFIRST_lazy_if(RExC_parse, UTF)) { 5319 /* skip IDFIRST by using do...while */ 5320 if (UTF) 5321 do { 5322 RExC_parse += UTF8SKIP(RExC_parse); 5323 } while (isALNUM_utf8((U8*)RExC_parse)); 5324 else 5325 do { 5326 RExC_parse++; 5327 } while (isALNUM(*RExC_parse)); 5328 } 5329 5330 if ( flags ) { 5331 SV* sv_name 5332 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start), 5333 SVs_TEMP | (UTF ? SVf_UTF8 : 0)); 5334 if ( flags == REG_RSN_RETURN_NAME) 5335 return sv_name; 5336 else if (flags==REG_RSN_RETURN_DATA) { 5337 HE *he_str = NULL; 5338 SV *sv_dat = NULL; 5339 if ( ! sv_name ) /* should not happen*/ 5340 Perl_croak(aTHX_ "panic: no svname in reg_scan_name"); 5341 if (RExC_paren_names) 5342 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 ); 5343 if ( he_str ) 5344 sv_dat = HeVAL(he_str); 5345 if ( ! sv_dat ) 5346 vFAIL("Reference to nonexistent named group"); 5347 return sv_dat; 5348 } 5349 else { 5350 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name"); 5351 } 5352 /* NOT REACHED */ 5353 } 5354 return NULL; 5355 } 5356 5357 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \ 5358 int rem=(int)(RExC_end - RExC_parse); \ 5359 int cut; \ 5360 int num; \ 5361 int iscut=0; \ 5362 if (rem>10) { \ 5363 rem=10; \ 5364 iscut=1; \ 5365 } \ 5366 cut=10-rem; \ 5367 if (RExC_lastparse!=RExC_parse) \ 5368 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \ 5369 rem, RExC_parse, \ 5370 cut + 4, \ 5371 iscut ? "..." : "<" \ 5372 ); \ 5373 else \ 5374 PerlIO_printf(Perl_debug_log,"%16s",""); \ 5375 \ 5376 if (SIZE_ONLY) \ 5377 num = RExC_size + 1; \ 5378 else \ 5379 num=REG_NODE_NUM(RExC_emit); \ 5380 if (RExC_lastnum!=num) \ 5381 PerlIO_printf(Perl_debug_log,"|%4d",num); \ 5382 else \ 5383 PerlIO_printf(Perl_debug_log,"|%4s",""); \ 5384 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \ 5385 (int)((depth*2)), "", \ 5386 (funcname) \ 5387 ); \ 5388 RExC_lastnum=num; \ 5389 RExC_lastparse=RExC_parse; \ 5390 }) 5391 5392 5393 5394 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \ 5395 DEBUG_PARSE_MSG((funcname)); \ 5396 PerlIO_printf(Perl_debug_log,"%4s","\n"); \ 5397 }) 5398 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \ 5399 DEBUG_PARSE_MSG((funcname)); \ 5400 PerlIO_printf(Perl_debug_log,fmt "\n",args); \ 5401 }) 5402 /* 5403 - reg - regular expression, i.e. main body or parenthesized thing 5404 * 5405 * Caller must absorb opening parenthesis. 5406 * 5407 * Combining parenthesis handling with the base level of regular expression 5408 * is a trifle forced, but the need to tie the tails of the branches to what 5409 * follows makes it hard to avoid. 5410 */ 5411 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1) 5412 #ifdef DEBUGGING 5413 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1) 5414 #else 5415 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1) 5416 #endif 5417 5418 STATIC regnode * 5419 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) 5420 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */ 5421 { 5422 dVAR; 5423 register regnode *ret; /* Will be the head of the group. */ 5424 register regnode *br; 5425 register regnode *lastbr; 5426 register regnode *ender = NULL; 5427 register I32 parno = 0; 5428 I32 flags; 5429 U32 oregflags = RExC_flags; 5430 bool have_branch = 0; 5431 bool is_open = 0; 5432 I32 freeze_paren = 0; 5433 I32 after_freeze = 0; 5434 5435 /* for (?g), (?gc), and (?o) warnings; warning 5436 about (?c) will warn about (?g) -- japhy */ 5437 5438 #define WASTED_O 0x01 5439 #define WASTED_G 0x02 5440 #define WASTED_C 0x04 5441 #define WASTED_GC (0x02|0x04) 5442 I32 wastedflags = 0x00; 5443 5444 char * parse_start = RExC_parse; /* MJD */ 5445 char * const oregcomp_parse = RExC_parse; 5446 5447 GET_RE_DEBUG_FLAGS_DECL; 5448 5449 PERL_ARGS_ASSERT_REG; 5450 DEBUG_PARSE("reg "); 5451 5452 *flagp = 0; /* Tentatively. */ 5453 5454 5455 /* Make an OPEN node, if parenthesized. */ 5456 if (paren) { 5457 if ( *RExC_parse == '*') { /* (*VERB:ARG) */ 5458 char *start_verb = RExC_parse; 5459 STRLEN verb_len = 0; 5460 char *start_arg = NULL; 5461 unsigned char op = 0; 5462 int argok = 1; 5463 int internal_argval = 0; /* internal_argval is only useful if !argok */ 5464 while ( *RExC_parse && *RExC_parse != ')' ) { 5465 if ( *RExC_parse == ':' ) { 5466 start_arg = RExC_parse + 1; 5467 break; 5468 } 5469 RExC_parse++; 5470 } 5471 ++start_verb; 5472 verb_len = RExC_parse - start_verb; 5473 if ( start_arg ) { 5474 RExC_parse++; 5475 while ( *RExC_parse && *RExC_parse != ')' ) 5476 RExC_parse++; 5477 if ( *RExC_parse != ')' ) 5478 vFAIL("Unterminated verb pattern argument"); 5479 if ( RExC_parse == start_arg ) 5480 start_arg = NULL; 5481 } else { 5482 if ( *RExC_parse != ')' ) 5483 vFAIL("Unterminated verb pattern"); 5484 } 5485 5486 switch ( *start_verb ) { 5487 case 'A': /* (*ACCEPT) */ 5488 if ( memEQs(start_verb,verb_len,"ACCEPT") ) { 5489 op = ACCEPT; 5490 internal_argval = RExC_nestroot; 5491 } 5492 break; 5493 case 'C': /* (*COMMIT) */ 5494 if ( memEQs(start_verb,verb_len,"COMMIT") ) 5495 op = COMMIT; 5496 break; 5497 case 'F': /* (*FAIL) */ 5498 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) { 5499 op = OPFAIL; 5500 argok = 0; 5501 } 5502 break; 5503 case ':': /* (*:NAME) */ 5504 case 'M': /* (*MARK:NAME) */ 5505 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) { 5506 op = MARKPOINT; 5507 argok = -1; 5508 } 5509 break; 5510 case 'P': /* (*PRUNE) */ 5511 if ( memEQs(start_verb,verb_len,"PRUNE") ) 5512 op = PRUNE; 5513 break; 5514 case 'S': /* (*SKIP) */ 5515 if ( memEQs(start_verb,verb_len,"SKIP") ) 5516 op = SKIP; 5517 break; 5518 case 'T': /* (*THEN) */ 5519 /* [19:06] <TimToady> :: is then */ 5520 if ( memEQs(start_verb,verb_len,"THEN") ) { 5521 op = CUTGROUP; 5522 RExC_seen |= REG_SEEN_CUTGROUP; 5523 } 5524 break; 5525 } 5526 if ( ! op ) { 5527 RExC_parse++; 5528 vFAIL3("Unknown verb pattern '%.*s'", 5529 verb_len, start_verb); 5530 } 5531 if ( argok ) { 5532 if ( start_arg && internal_argval ) { 5533 vFAIL3("Verb pattern '%.*s' may not have an argument", 5534 verb_len, start_verb); 5535 } else if ( argok < 0 && !start_arg ) { 5536 vFAIL3("Verb pattern '%.*s' has a mandatory argument", 5537 verb_len, start_verb); 5538 } else { 5539 ret = reganode(pRExC_state, op, internal_argval); 5540 if ( ! internal_argval && ! SIZE_ONLY ) { 5541 if (start_arg) { 5542 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg); 5543 ARG(ret) = add_data( pRExC_state, 1, "S" ); 5544 RExC_rxi->data->data[ARG(ret)]=(void*)sv; 5545 ret->flags = 0; 5546 } else { 5547 ret->flags = 1; 5548 } 5549 } 5550 } 5551 if (!internal_argval) 5552 RExC_seen |= REG_SEEN_VERBARG; 5553 } else if ( start_arg ) { 5554 vFAIL3("Verb pattern '%.*s' may not have an argument", 5555 verb_len, start_verb); 5556 } else { 5557 ret = reg_node(pRExC_state, op); 5558 } 5559 nextchar(pRExC_state); 5560 return ret; 5561 } else 5562 if (*RExC_parse == '?') { /* (?...) */ 5563 bool is_logical = 0; 5564 const char * const seqstart = RExC_parse; 5565 5566 RExC_parse++; 5567 paren = *RExC_parse++; 5568 ret = NULL; /* For look-ahead/behind. */ 5569 switch (paren) { 5570 5571 case 'P': /* (?P...) variants for those used to PCRE/Python */ 5572 paren = *RExC_parse++; 5573 if ( paren == '<') /* (?P<...>) named capture */ 5574 goto named_capture; 5575 else if (paren == '>') { /* (?P>name) named recursion */ 5576 goto named_recursion; 5577 } 5578 else if (paren == '=') { /* (?P=...) named backref */ 5579 /* this pretty much dupes the code for \k<NAME> in regatom(), if 5580 you change this make sure you change that */ 5581 char* name_start = RExC_parse; 5582 U32 num = 0; 5583 SV *sv_dat = reg_scan_name(pRExC_state, 5584 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); 5585 if (RExC_parse == name_start || *RExC_parse != ')') 5586 vFAIL2("Sequence %.3s... not terminated",parse_start); 5587 5588 if (!SIZE_ONLY) { 5589 num = add_data( pRExC_state, 1, "S" ); 5590 RExC_rxi->data->data[num]=(void*)sv_dat; 5591 SvREFCNT_inc_simple_void(sv_dat); 5592 } 5593 RExC_sawback = 1; 5594 ret = reganode(pRExC_state, 5595 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF), 5596 num); 5597 *flagp |= HASWIDTH; 5598 5599 Set_Node_Offset(ret, parse_start+1); 5600 Set_Node_Cur_Length(ret); /* MJD */ 5601 5602 nextchar(pRExC_state); 5603 return ret; 5604 } 5605 RExC_parse++; 5606 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); 5607 /*NOTREACHED*/ 5608 case '<': /* (?<...) */ 5609 if (*RExC_parse == '!') 5610 paren = ','; 5611 else if (*RExC_parse != '=') 5612 named_capture: 5613 { /* (?<...>) */ 5614 char *name_start; 5615 SV *svname; 5616 paren= '>'; 5617 case '\'': /* (?'...') */ 5618 name_start= RExC_parse; 5619 svname = reg_scan_name(pRExC_state, 5620 SIZE_ONLY ? /* reverse test from the others */ 5621 REG_RSN_RETURN_NAME : 5622 REG_RSN_RETURN_NULL); 5623 if (RExC_parse == name_start) { 5624 RExC_parse++; 5625 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); 5626 /*NOTREACHED*/ 5627 } 5628 if (*RExC_parse != paren) 5629 vFAIL2("Sequence (?%c... not terminated", 5630 paren=='>' ? '<' : paren); 5631 if (SIZE_ONLY) { 5632 HE *he_str; 5633 SV *sv_dat = NULL; 5634 if (!svname) /* shouldnt happen */ 5635 Perl_croak(aTHX_ 5636 "panic: reg_scan_name returned NULL"); 5637 if (!RExC_paren_names) { 5638 RExC_paren_names= newHV(); 5639 sv_2mortal(MUTABLE_SV(RExC_paren_names)); 5640 #ifdef DEBUGGING 5641 RExC_paren_name_list= newAV(); 5642 sv_2mortal(MUTABLE_SV(RExC_paren_name_list)); 5643 #endif 5644 } 5645 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 ); 5646 if ( he_str ) 5647 sv_dat = HeVAL(he_str); 5648 if ( ! sv_dat ) { 5649 /* croak baby croak */ 5650 Perl_croak(aTHX_ 5651 "panic: paren_name hash element allocation failed"); 5652 } else if ( SvPOK(sv_dat) ) { 5653 /* (?|...) can mean we have dupes so scan to check 5654 its already been stored. Maybe a flag indicating 5655 we are inside such a construct would be useful, 5656 but the arrays are likely to be quite small, so 5657 for now we punt -- dmq */ 5658 IV count = SvIV(sv_dat); 5659 I32 *pv = (I32*)SvPVX(sv_dat); 5660 IV i; 5661 for ( i = 0 ; i < count ; i++ ) { 5662 if ( pv[i] == RExC_npar ) { 5663 count = 0; 5664 break; 5665 } 5666 } 5667 if ( count ) { 5668 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1); 5669 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32)); 5670 pv[count] = RExC_npar; 5671 SvIV_set(sv_dat, SvIVX(sv_dat) + 1); 5672 } 5673 } else { 5674 (void)SvUPGRADE(sv_dat,SVt_PVNV); 5675 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32)); 5676 SvIOK_on(sv_dat); 5677 SvIV_set(sv_dat, 1); 5678 } 5679 #ifdef DEBUGGING 5680 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname))) 5681 SvREFCNT_dec(svname); 5682 #endif 5683 5684 /*sv_dump(sv_dat);*/ 5685 } 5686 nextchar(pRExC_state); 5687 paren = 1; 5688 goto capturing_parens; 5689 } 5690 RExC_seen |= REG_SEEN_LOOKBEHIND; 5691 RExC_parse++; 5692 case '=': /* (?=...) */ 5693 RExC_seen_zerolen++; 5694 break; 5695 case '!': /* (?!...) */ 5696 RExC_seen_zerolen++; 5697 if (*RExC_parse == ')') { 5698 ret=reg_node(pRExC_state, OPFAIL); 5699 nextchar(pRExC_state); 5700 return ret; 5701 } 5702 break; 5703 case '|': /* (?|...) */ 5704 /* branch reset, behave like a (?:...) except that 5705 buffers in alternations share the same numbers */ 5706 paren = ':'; 5707 after_freeze = freeze_paren = RExC_npar; 5708 break; 5709 case ':': /* (?:...) */ 5710 case '>': /* (?>...) */ 5711 break; 5712 case '$': /* (?$...) */ 5713 case '@': /* (?@...) */ 5714 vFAIL2("Sequence (?%c...) not implemented", (int)paren); 5715 break; 5716 case '#': /* (?#...) */ 5717 while (*RExC_parse && *RExC_parse != ')') 5718 RExC_parse++; 5719 if (*RExC_parse != ')') 5720 FAIL("Sequence (?#... not terminated"); 5721 nextchar(pRExC_state); 5722 *flagp = TRYAGAIN; 5723 return NULL; 5724 case '0' : /* (?0) */ 5725 case 'R' : /* (?R) */ 5726 if (*RExC_parse != ')') 5727 FAIL("Sequence (?R) not terminated"); 5728 ret = reg_node(pRExC_state, GOSTART); 5729 *flagp |= POSTPONED; 5730 nextchar(pRExC_state); 5731 return ret; 5732 /*notreached*/ 5733 { /* named and numeric backreferences */ 5734 I32 num; 5735 case '&': /* (?&NAME) */ 5736 parse_start = RExC_parse - 1; 5737 named_recursion: 5738 { 5739 SV *sv_dat = reg_scan_name(pRExC_state, 5740 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); 5741 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0; 5742 } 5743 goto gen_recurse_regop; 5744 /* NOT REACHED */ 5745 case '+': 5746 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) { 5747 RExC_parse++; 5748 vFAIL("Illegal pattern"); 5749 } 5750 goto parse_recursion; 5751 /* NOT REACHED*/ 5752 case '-': /* (?-1) */ 5753 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) { 5754 RExC_parse--; /* rewind to let it be handled later */ 5755 goto parse_flags; 5756 } 5757 /*FALLTHROUGH */ 5758 case '1': case '2': case '3': case '4': /* (?1) */ 5759 case '5': case '6': case '7': case '8': case '9': 5760 RExC_parse--; 5761 parse_recursion: 5762 num = atoi(RExC_parse); 5763 parse_start = RExC_parse - 1; /* MJD */ 5764 if (*RExC_parse == '-') 5765 RExC_parse++; 5766 while (isDIGIT(*RExC_parse)) 5767 RExC_parse++; 5768 if (*RExC_parse!=')') 5769 vFAIL("Expecting close bracket"); 5770 5771 gen_recurse_regop: 5772 if ( paren == '-' ) { 5773 /* 5774 Diagram of capture buffer numbering. 5775 Top line is the normal capture buffer numbers 5776 Botton line is the negative indexing as from 5777 the X (the (?-2)) 5778 5779 + 1 2 3 4 5 X 6 7 5780 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/ 5781 - 5 4 3 2 1 X x x 5782 5783 */ 5784 num = RExC_npar + num; 5785 if (num < 1) { 5786 RExC_parse++; 5787 vFAIL("Reference to nonexistent group"); 5788 } 5789 } else if ( paren == '+' ) { 5790 num = RExC_npar + num - 1; 5791 } 5792 5793 ret = reganode(pRExC_state, GOSUB, num); 5794 if (!SIZE_ONLY) { 5795 if (num > (I32)RExC_rx->nparens) { 5796 RExC_parse++; 5797 vFAIL("Reference to nonexistent group"); 5798 } 5799 ARG2L_SET( ret, RExC_recurse_count++); 5800 RExC_emit++; 5801 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, 5802 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret))); 5803 } else { 5804 RExC_size++; 5805 } 5806 RExC_seen |= REG_SEEN_RECURSE; 5807 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */ 5808 Set_Node_Offset(ret, parse_start); /* MJD */ 5809 5810 *flagp |= POSTPONED; 5811 nextchar(pRExC_state); 5812 return ret; 5813 } /* named and numeric backreferences */ 5814 /* NOT REACHED */ 5815 5816 case '?': /* (??...) */ 5817 is_logical = 1; 5818 if (*RExC_parse != '{') { 5819 RExC_parse++; 5820 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); 5821 /*NOTREACHED*/ 5822 } 5823 *flagp |= POSTPONED; 5824 paren = *RExC_parse++; 5825 /* FALL THROUGH */ 5826 case '{': /* (?{...}) */ 5827 { 5828 I32 count = 1; 5829 U32 n = 0; 5830 char c; 5831 char *s = RExC_parse; 5832 5833 RExC_seen_zerolen++; 5834 RExC_seen |= REG_SEEN_EVAL; 5835 while (count && (c = *RExC_parse)) { 5836 if (c == '\\') { 5837 if (RExC_parse[1]) 5838 RExC_parse++; 5839 } 5840 else if (c == '{') 5841 count++; 5842 else if (c == '}') 5843 count--; 5844 RExC_parse++; 5845 } 5846 if (*RExC_parse != ')') { 5847 RExC_parse = s; 5848 vFAIL("Sequence (?{...}) not terminated or not {}-balanced"); 5849 } 5850 if (!SIZE_ONLY) { 5851 PAD *pad; 5852 OP_4tree *sop, *rop; 5853 SV * const sv = newSVpvn(s, RExC_parse - 1 - s); 5854 5855 ENTER; 5856 Perl_save_re_context(aTHX); 5857 rop = sv_compile_2op(sv, &sop, "re", &pad); 5858 sop->op_private |= OPpREFCOUNTED; 5859 /* re_dup will OpREFCNT_inc */ 5860 OpREFCNT_set(sop, 1); 5861 LEAVE; 5862 5863 n = add_data(pRExC_state, 3, "nop"); 5864 RExC_rxi->data->data[n] = (void*)rop; 5865 RExC_rxi->data->data[n+1] = (void*)sop; 5866 RExC_rxi->data->data[n+2] = (void*)pad; 5867 SvREFCNT_dec(sv); 5868 } 5869 else { /* First pass */ 5870 if (PL_reginterp_cnt < ++RExC_seen_evals 5871 && IN_PERL_RUNTIME) 5872 /* No compiled RE interpolated, has runtime 5873 components ===> unsafe. */ 5874 FAIL("Eval-group not allowed at runtime, use re 'eval'"); 5875 if (PL_tainting && PL_tainted) 5876 FAIL("Eval-group in insecure regular expression"); 5877 #if PERL_VERSION > 8 5878 if (IN_PERL_COMPILETIME) 5879 PL_cv_has_eval = 1; 5880 #endif 5881 } 5882 5883 nextchar(pRExC_state); 5884 if (is_logical) { 5885 ret = reg_node(pRExC_state, LOGICAL); 5886 if (!SIZE_ONLY) 5887 ret->flags = 2; 5888 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n)); 5889 /* deal with the length of this later - MJD */ 5890 return ret; 5891 } 5892 ret = reganode(pRExC_state, EVAL, n); 5893 Set_Node_Length(ret, RExC_parse - parse_start + 1); 5894 Set_Node_Offset(ret, parse_start); 5895 return ret; 5896 } 5897 case '(': /* (?(?{...})...) and (?(?=...)...) */ 5898 { 5899 int is_define= 0; 5900 if (RExC_parse[0] == '?') { /* (?(?...)) */ 5901 if (RExC_parse[1] == '=' || RExC_parse[1] == '!' 5902 || RExC_parse[1] == '<' 5903 || RExC_parse[1] == '{') { /* Lookahead or eval. */ 5904 I32 flag; 5905 5906 ret = reg_node(pRExC_state, LOGICAL); 5907 if (!SIZE_ONLY) 5908 ret->flags = 1; 5909 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1)); 5910 goto insert_if; 5911 } 5912 } 5913 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */ 5914 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */ 5915 { 5916 char ch = RExC_parse[0] == '<' ? '>' : '\''; 5917 char *name_start= RExC_parse++; 5918 U32 num = 0; 5919 SV *sv_dat=reg_scan_name(pRExC_state, 5920 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); 5921 if (RExC_parse == name_start || *RExC_parse != ch) 5922 vFAIL2("Sequence (?(%c... not terminated", 5923 (ch == '>' ? '<' : ch)); 5924 RExC_parse++; 5925 if (!SIZE_ONLY) { 5926 num = add_data( pRExC_state, 1, "S" ); 5927 RExC_rxi->data->data[num]=(void*)sv_dat; 5928 SvREFCNT_inc_simple_void(sv_dat); 5929 } 5930 ret = reganode(pRExC_state,NGROUPP,num); 5931 goto insert_if_check_paren; 5932 } 5933 else if (RExC_parse[0] == 'D' && 5934 RExC_parse[1] == 'E' && 5935 RExC_parse[2] == 'F' && 5936 RExC_parse[3] == 'I' && 5937 RExC_parse[4] == 'N' && 5938 RExC_parse[5] == 'E') 5939 { 5940 ret = reganode(pRExC_state,DEFINEP,0); 5941 RExC_parse +=6 ; 5942 is_define = 1; 5943 goto insert_if_check_paren; 5944 } 5945 else if (RExC_parse[0] == 'R') { 5946 RExC_parse++; 5947 parno = 0; 5948 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) { 5949 parno = atoi(RExC_parse++); 5950 while (isDIGIT(*RExC_parse)) 5951 RExC_parse++; 5952 } else if (RExC_parse[0] == '&') { 5953 SV *sv_dat; 5954 RExC_parse++; 5955 sv_dat = reg_scan_name(pRExC_state, 5956 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); 5957 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0; 5958 } 5959 ret = reganode(pRExC_state,INSUBP,parno); 5960 goto insert_if_check_paren; 5961 } 5962 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) { 5963 /* (?(1)...) */ 5964 char c; 5965 parno = atoi(RExC_parse++); 5966 5967 while (isDIGIT(*RExC_parse)) 5968 RExC_parse++; 5969 ret = reganode(pRExC_state, GROUPP, parno); 5970 5971 insert_if_check_paren: 5972 if ((c = *nextchar(pRExC_state)) != ')') 5973 vFAIL("Switch condition not recognized"); 5974 insert_if: 5975 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0)); 5976 br = regbranch(pRExC_state, &flags, 1,depth+1); 5977 if (br == NULL) 5978 br = reganode(pRExC_state, LONGJMP, 0); 5979 else 5980 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0)); 5981 c = *nextchar(pRExC_state); 5982 if (flags&HASWIDTH) 5983 *flagp |= HASWIDTH; 5984 if (c == '|') { 5985 if (is_define) 5986 vFAIL("(?(DEFINE)....) does not allow branches"); 5987 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */ 5988 regbranch(pRExC_state, &flags, 1,depth+1); 5989 REGTAIL(pRExC_state, ret, lastbr); 5990 if (flags&HASWIDTH) 5991 *flagp |= HASWIDTH; 5992 c = *nextchar(pRExC_state); 5993 } 5994 else 5995 lastbr = NULL; 5996 if (c != ')') 5997 vFAIL("Switch (?(condition)... contains too many branches"); 5998 ender = reg_node(pRExC_state, TAIL); 5999 REGTAIL(pRExC_state, br, ender); 6000 if (lastbr) { 6001 REGTAIL(pRExC_state, lastbr, ender); 6002 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); 6003 } 6004 else 6005 REGTAIL(pRExC_state, ret, ender); 6006 RExC_size++; /* XXX WHY do we need this?!! 6007 For large programs it seems to be required 6008 but I can't figure out why. -- dmq*/ 6009 return ret; 6010 } 6011 else { 6012 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse); 6013 } 6014 } 6015 case 0: 6016 RExC_parse--; /* for vFAIL to print correctly */ 6017 vFAIL("Sequence (? incomplete"); 6018 break; 6019 default: 6020 --RExC_parse; 6021 parse_flags: /* (?i) */ 6022 { 6023 U32 posflags = 0, negflags = 0; 6024 U32 *flagsp = &posflags; 6025 6026 while (*RExC_parse) { 6027 /* && strchr("iogcmsx", *RExC_parse) */ 6028 /* (?g), (?gc) and (?o) are useless here 6029 and must be globally applied -- japhy */ 6030 switch (*RExC_parse) { 6031 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp); 6032 case ONCE_PAT_MOD: /* 'o' */ 6033 case GLOBAL_PAT_MOD: /* 'g' */ 6034 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { 6035 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G; 6036 if (! (wastedflags & wflagbit) ) { 6037 wastedflags |= wflagbit; 6038 vWARN5( 6039 RExC_parse + 1, 6040 "Useless (%s%c) - %suse /%c modifier", 6041 flagsp == &negflags ? "?-" : "?", 6042 *RExC_parse, 6043 flagsp == &negflags ? "don't " : "", 6044 *RExC_parse 6045 ); 6046 } 6047 } 6048 break; 6049 6050 case CONTINUE_PAT_MOD: /* 'c' */ 6051 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { 6052 if (! (wastedflags & WASTED_C) ) { 6053 wastedflags |= WASTED_GC; 6054 vWARN3( 6055 RExC_parse + 1, 6056 "Useless (%sc) - %suse /gc modifier", 6057 flagsp == &negflags ? "?-" : "?", 6058 flagsp == &negflags ? "don't " : "" 6059 ); 6060 } 6061 } 6062 break; 6063 case KEEPCOPY_PAT_MOD: /* 'p' */ 6064 if (flagsp == &negflags) { 6065 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) 6066 vWARN(RExC_parse + 1,"Useless use of (?-p)"); 6067 } else { 6068 *flagsp |= RXf_PMf_KEEPCOPY; 6069 } 6070 break; 6071 case '-': 6072 if (flagsp == &negflags) { 6073 RExC_parse++; 6074 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); 6075 /*NOTREACHED*/ 6076 } 6077 flagsp = &negflags; 6078 wastedflags = 0; /* reset so (?g-c) warns twice */ 6079 break; 6080 case ':': 6081 paren = ':'; 6082 /*FALLTHROUGH*/ 6083 case ')': 6084 RExC_flags |= posflags; 6085 RExC_flags &= ~negflags; 6086 if (paren != ':') { 6087 oregflags |= posflags; 6088 oregflags &= ~negflags; 6089 } 6090 nextchar(pRExC_state); 6091 if (paren != ':') { 6092 *flagp = TRYAGAIN; 6093 return NULL; 6094 } else { 6095 ret = NULL; 6096 goto parse_rest; 6097 } 6098 /*NOTREACHED*/ 6099 default: 6100 RExC_parse++; 6101 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); 6102 /*NOTREACHED*/ 6103 } 6104 ++RExC_parse; 6105 } 6106 }} /* one for the default block, one for the switch */ 6107 } 6108 else { /* (...) */ 6109 capturing_parens: 6110 parno = RExC_npar; 6111 RExC_npar++; 6112 6113 ret = reganode(pRExC_state, OPEN, parno); 6114 if (!SIZE_ONLY ){ 6115 if (!RExC_nestroot) 6116 RExC_nestroot = parno; 6117 if (RExC_seen & REG_SEEN_RECURSE 6118 && !RExC_open_parens[parno-1]) 6119 { 6120 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, 6121 "Setting open paren #%"IVdf" to %d\n", 6122 (IV)parno, REG_NODE_NUM(ret))); 6123 RExC_open_parens[parno-1]= ret; 6124 } 6125 } 6126 Set_Node_Length(ret, 1); /* MJD */ 6127 Set_Node_Offset(ret, RExC_parse); /* MJD */ 6128 is_open = 1; 6129 } 6130 } 6131 else /* ! paren */ 6132 ret = NULL; 6133 6134 parse_rest: 6135 /* Pick up the branches, linking them together. */ 6136 parse_start = RExC_parse; /* MJD */ 6137 br = regbranch(pRExC_state, &flags, 1,depth+1); 6138 6139 if (freeze_paren) { 6140 if (RExC_npar > after_freeze) 6141 after_freeze = RExC_npar; 6142 RExC_npar = freeze_paren; 6143 } 6144 6145 /* branch_len = (paren != 0); */ 6146 6147 if (br == NULL) 6148 return(NULL); 6149 if (*RExC_parse == '|') { 6150 if (!SIZE_ONLY && RExC_extralen) { 6151 reginsert(pRExC_state, BRANCHJ, br, depth+1); 6152 } 6153 else { /* MJD */ 6154 reginsert(pRExC_state, BRANCH, br, depth+1); 6155 Set_Node_Length(br, paren != 0); 6156 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start); 6157 } 6158 have_branch = 1; 6159 if (SIZE_ONLY) 6160 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */ 6161 } 6162 else if (paren == ':') { 6163 *flagp |= flags&SIMPLE; 6164 } 6165 if (is_open) { /* Starts with OPEN. */ 6166 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */ 6167 } 6168 else if (paren != '?') /* Not Conditional */ 6169 ret = br; 6170 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED); 6171 lastbr = br; 6172 while (*RExC_parse == '|') { 6173 if (!SIZE_ONLY && RExC_extralen) { 6174 ender = reganode(pRExC_state, LONGJMP,0); 6175 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */ 6176 } 6177 if (SIZE_ONLY) 6178 RExC_extralen += 2; /* Account for LONGJMP. */ 6179 nextchar(pRExC_state); 6180 if (freeze_paren) { 6181 if (RExC_npar > after_freeze) 6182 after_freeze = RExC_npar; 6183 RExC_npar = freeze_paren; 6184 } 6185 br = regbranch(pRExC_state, &flags, 0, depth+1); 6186 6187 if (br == NULL) 6188 return(NULL); 6189 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */ 6190 lastbr = br; 6191 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED); 6192 } 6193 6194 if (have_branch || paren != ':') { 6195 /* Make a closing node, and hook it on the end. */ 6196 switch (paren) { 6197 case ':': 6198 ender = reg_node(pRExC_state, TAIL); 6199 break; 6200 case 1: 6201 ender = reganode(pRExC_state, CLOSE, parno); 6202 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) { 6203 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, 6204 "Setting close paren #%"IVdf" to %d\n", 6205 (IV)parno, REG_NODE_NUM(ender))); 6206 RExC_close_parens[parno-1]= ender; 6207 if (RExC_nestroot == parno) 6208 RExC_nestroot = 0; 6209 } 6210 Set_Node_Offset(ender,RExC_parse+1); /* MJD */ 6211 Set_Node_Length(ender,1); /* MJD */ 6212 break; 6213 case '<': 6214 case ',': 6215 case '=': 6216 case '!': 6217 *flagp &= ~HASWIDTH; 6218 /* FALL THROUGH */ 6219 case '>': 6220 ender = reg_node(pRExC_state, SUCCEED); 6221 break; 6222 case 0: 6223 ender = reg_node(pRExC_state, END); 6224 if (!SIZE_ONLY) { 6225 assert(!RExC_opend); /* there can only be one! */ 6226 RExC_opend = ender; 6227 } 6228 break; 6229 } 6230 REGTAIL(pRExC_state, lastbr, ender); 6231 6232 if (have_branch && !SIZE_ONLY) { 6233 if (depth==1) 6234 RExC_seen |= REG_TOP_LEVEL_BRANCHES; 6235 6236 /* Hook the tails of the branches to the closing node. */ 6237 for (br = ret; br; br = regnext(br)) { 6238 const U8 op = PL_regkind[OP(br)]; 6239 if (op == BRANCH) { 6240 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender); 6241 } 6242 else if (op == BRANCHJ) { 6243 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender); 6244 } 6245 } 6246 } 6247 } 6248 6249 { 6250 const char *p; 6251 static const char parens[] = "=!<,>"; 6252 6253 if (paren && (p = strchr(parens, paren))) { 6254 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH; 6255 int flag = (p - parens) > 1; 6256 6257 if (paren == '>') 6258 node = SUSPEND, flag = 0; 6259 reginsert(pRExC_state, node,ret, depth+1); 6260 Set_Node_Cur_Length(ret); 6261 Set_Node_Offset(ret, parse_start + 1); 6262 ret->flags = flag; 6263 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL)); 6264 } 6265 } 6266 6267 /* Check for proper termination. */ 6268 if (paren) { 6269 RExC_flags = oregflags; 6270 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') { 6271 RExC_parse = oregcomp_parse; 6272 vFAIL("Unmatched ("); 6273 } 6274 } 6275 else if (!paren && RExC_parse < RExC_end) { 6276 if (*RExC_parse == ')') { 6277 RExC_parse++; 6278 vFAIL("Unmatched )"); 6279 } 6280 else 6281 FAIL("Junk on end of regexp"); /* "Can't happen". */ 6282 /* NOTREACHED */ 6283 } 6284 if (after_freeze) 6285 RExC_npar = after_freeze; 6286 return(ret); 6287 } 6288 6289 /* 6290 - regbranch - one alternative of an | operator 6291 * 6292 * Implements the concatenation operator. 6293 */ 6294 STATIC regnode * 6295 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) 6296 { 6297 dVAR; 6298 register regnode *ret; 6299 register regnode *chain = NULL; 6300 register regnode *latest; 6301 I32 flags = 0, c = 0; 6302 GET_RE_DEBUG_FLAGS_DECL; 6303 6304 PERL_ARGS_ASSERT_REGBRANCH; 6305 6306 DEBUG_PARSE("brnc"); 6307 6308 if (first) 6309 ret = NULL; 6310 else { 6311 if (!SIZE_ONLY && RExC_extralen) 6312 ret = reganode(pRExC_state, BRANCHJ,0); 6313 else { 6314 ret = reg_node(pRExC_state, BRANCH); 6315 Set_Node_Length(ret, 1); 6316 } 6317 } 6318 6319 if (!first && SIZE_ONLY) 6320 RExC_extralen += 1; /* BRANCHJ */ 6321 6322 *flagp = WORST; /* Tentatively. */ 6323 6324 RExC_parse--; 6325 nextchar(pRExC_state); 6326 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') { 6327 flags &= ~TRYAGAIN; 6328 latest = regpiece(pRExC_state, &flags,depth+1); 6329 if (latest == NULL) { 6330 if (flags & TRYAGAIN) 6331 continue; 6332 return(NULL); 6333 } 6334 else if (ret == NULL) 6335 ret = latest; 6336 *flagp |= flags&(HASWIDTH|POSTPONED); 6337 if (chain == NULL) /* First piece. */ 6338 *flagp |= flags&SPSTART; 6339 else { 6340 RExC_naughty++; 6341 REGTAIL(pRExC_state, chain, latest); 6342 } 6343 chain = latest; 6344 c++; 6345 } 6346 if (chain == NULL) { /* Loop ran zero times. */ 6347 chain = reg_node(pRExC_state, NOTHING); 6348 if (ret == NULL) 6349 ret = chain; 6350 } 6351 if (c == 1) { 6352 *flagp |= flags&SIMPLE; 6353 } 6354 6355 return ret; 6356 } 6357 6358 /* 6359 - regpiece - something followed by possible [*+?] 6360 * 6361 * Note that the branching code sequences used for ? and the general cases 6362 * of * and + are somewhat optimized: they use the same NOTHING node as 6363 * both the endmarker for their branch list and the body of the last branch. 6364 * It might seem that this node could be dispensed with entirely, but the 6365 * endmarker role is not redundant. 6366 */ 6367 STATIC regnode * 6368 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) 6369 { 6370 dVAR; 6371 register regnode *ret; 6372 register char op; 6373 register char *next; 6374 I32 flags; 6375 const char * const origparse = RExC_parse; 6376 I32 min; 6377 I32 max = REG_INFTY; 6378 char *parse_start; 6379 const char *maxpos = NULL; 6380 GET_RE_DEBUG_FLAGS_DECL; 6381 6382 PERL_ARGS_ASSERT_REGPIECE; 6383 6384 DEBUG_PARSE("piec"); 6385 6386 ret = regatom(pRExC_state, &flags,depth+1); 6387 if (ret == NULL) { 6388 if (flags & TRYAGAIN) 6389 *flagp |= TRYAGAIN; 6390 return(NULL); 6391 } 6392 6393 op = *RExC_parse; 6394 6395 if (op == '{' && regcurly(RExC_parse)) { 6396 maxpos = NULL; 6397 parse_start = RExC_parse; /* MJD */ 6398 next = RExC_parse + 1; 6399 while (isDIGIT(*next) || *next == ',') { 6400 if (*next == ',') { 6401 if (maxpos) 6402 break; 6403 else 6404 maxpos = next; 6405 } 6406 next++; 6407 } 6408 if (*next == '}') { /* got one */ 6409 if (!maxpos) 6410 maxpos = next; 6411 RExC_parse++; 6412 min = atoi(RExC_parse); 6413 if (*maxpos == ',') 6414 maxpos++; 6415 else 6416 maxpos = RExC_parse; 6417 max = atoi(maxpos); 6418 if (!max && *maxpos != '0') 6419 max = REG_INFTY; /* meaning "infinity" */ 6420 else if (max >= REG_INFTY) 6421 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1); 6422 RExC_parse = next; 6423 nextchar(pRExC_state); 6424 6425 do_curly: 6426 if ((flags&SIMPLE)) { 6427 RExC_naughty += 2 + RExC_naughty / 2; 6428 reginsert(pRExC_state, CURLY, ret, depth+1); 6429 Set_Node_Offset(ret, parse_start+1); /* MJD */ 6430 Set_Node_Cur_Length(ret); 6431 } 6432 else { 6433 regnode * const w = reg_node(pRExC_state, WHILEM); 6434 6435 w->flags = 0; 6436 REGTAIL(pRExC_state, ret, w); 6437 if (!SIZE_ONLY && RExC_extralen) { 6438 reginsert(pRExC_state, LONGJMP,ret, depth+1); 6439 reginsert(pRExC_state, NOTHING,ret, depth+1); 6440 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */ 6441 } 6442 reginsert(pRExC_state, CURLYX,ret, depth+1); 6443 /* MJD hk */ 6444 Set_Node_Offset(ret, parse_start+1); 6445 Set_Node_Length(ret, 6446 op == '{' ? (RExC_parse - parse_start) : 1); 6447 6448 if (!SIZE_ONLY && RExC_extralen) 6449 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */ 6450 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING)); 6451 if (SIZE_ONLY) 6452 RExC_whilem_seen++, RExC_extralen += 3; 6453 RExC_naughty += 4 + RExC_naughty; /* compound interest */ 6454 } 6455 ret->flags = 0; 6456 6457 if (min > 0) 6458 *flagp = WORST; 6459 if (max > 0) 6460 *flagp |= HASWIDTH; 6461 if (max < min) 6462 vFAIL("Can't do {n,m} with n > m"); 6463 if (!SIZE_ONLY) { 6464 ARG1_SET(ret, (U16)min); 6465 ARG2_SET(ret, (U16)max); 6466 } 6467 6468 goto nest_check; 6469 } 6470 } 6471 6472 if (!ISMULT1(op)) { 6473 *flagp = flags; 6474 return(ret); 6475 } 6476 6477 #if 0 /* Now runtime fix should be reliable. */ 6478 6479 /* if this is reinstated, don't forget to put this back into perldiag: 6480 6481 =item Regexp *+ operand could be empty at {#} in regex m/%s/ 6482 6483 (F) The part of the regexp subject to either the * or + quantifier 6484 could match an empty string. The {#} shows in the regular 6485 expression about where the problem was discovered. 6486 6487 */ 6488 6489 if (!(flags&HASWIDTH) && op != '?') 6490 vFAIL("Regexp *+ operand could be empty"); 6491 #endif 6492 6493 parse_start = RExC_parse; 6494 nextchar(pRExC_state); 6495 6496 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH); 6497 6498 if (op == '*' && (flags&SIMPLE)) { 6499 reginsert(pRExC_state, STAR, ret, depth+1); 6500 ret->flags = 0; 6501 RExC_naughty += 4; 6502 } 6503 else if (op == '*') { 6504 min = 0; 6505 goto do_curly; 6506 } 6507 else if (op == '+' && (flags&SIMPLE)) { 6508 reginsert(pRExC_state, PLUS, ret, depth+1); 6509 ret->flags = 0; 6510 RExC_naughty += 3; 6511 } 6512 else if (op == '+') { 6513 min = 1; 6514 goto do_curly; 6515 } 6516 else if (op == '?') { 6517 min = 0; max = 1; 6518 goto do_curly; 6519 } 6520 nest_check: 6521 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) { 6522 vWARN3(RExC_parse, 6523 "%.*s matches null string many times", 6524 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0), 6525 origparse); 6526 } 6527 6528 if (RExC_parse < RExC_end && *RExC_parse == '?') { 6529 nextchar(pRExC_state); 6530 reginsert(pRExC_state, MINMOD, ret, depth+1); 6531 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE); 6532 } 6533 #ifndef REG_ALLOW_MINMOD_SUSPEND 6534 else 6535 #endif 6536 if (RExC_parse < RExC_end && *RExC_parse == '+') { 6537 regnode *ender; 6538 nextchar(pRExC_state); 6539 ender = reg_node(pRExC_state, SUCCEED); 6540 REGTAIL(pRExC_state, ret, ender); 6541 reginsert(pRExC_state, SUSPEND, ret, depth+1); 6542 ret->flags = 0; 6543 ender = reg_node(pRExC_state, TAIL); 6544 REGTAIL(pRExC_state, ret, ender); 6545 /*ret= ender;*/ 6546 } 6547 6548 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) { 6549 RExC_parse++; 6550 vFAIL("Nested quantifiers"); 6551 } 6552 6553 return(ret); 6554 } 6555 6556 6557 /* reg_namedseq(pRExC_state,UVp) 6558 6559 This is expected to be called by a parser routine that has 6560 recognized'\N' and needs to handle the rest. RExC_parse is 6561 expected to point at the first char following the N at the time 6562 of the call. 6563 6564 If valuep is non-null then it is assumed that we are parsing inside 6565 of a charclass definition and the first codepoint in the resolved 6566 string is returned via *valuep and the routine will return NULL. 6567 In this mode if a multichar string is returned from the charnames 6568 handler a warning will be issued, and only the first char in the 6569 sequence will be examined. If the string returned is zero length 6570 then the value of *valuep is undefined and NON-NULL will 6571 be returned to indicate failure. (This will NOT be a valid pointer 6572 to a regnode.) 6573 6574 If value is null then it is assumed that we are parsing normal text 6575 and inserts a new EXACT node into the program containing the resolved 6576 string and returns a pointer to the new node. If the string is 6577 zerolength a NOTHING node is emitted. 6578 6579 On success RExC_parse is set to the char following the endbrace. 6580 Parsing failures will generate a fatal errorvia vFAIL(...) 6581 6582 NOTE: We cache all results from the charnames handler locally in 6583 the RExC_charnames hash (created on first use) to prevent a charnames 6584 handler from playing silly-buggers and returning a short string and 6585 then a long string for a given pattern. Since the regexp program 6586 size is calculated during an initial parse this would result 6587 in a buffer overrun so we cache to prevent the charname result from 6588 changing during the course of the parse. 6589 6590 */ 6591 STATIC regnode * 6592 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep) 6593 { 6594 char * name; /* start of the content of the name */ 6595 char * endbrace; /* endbrace following the name */ 6596 SV *sv_str = NULL; 6597 SV *sv_name = NULL; 6598 STRLEN len; /* this has various purposes throughout the code */ 6599 bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */ 6600 regnode *ret = NULL; 6601 6602 PERL_ARGS_ASSERT_REG_NAMEDSEQ; 6603 6604 if (*RExC_parse != '{') { 6605 vFAIL("Missing braces on \\N{}"); 6606 } 6607 name = RExC_parse+1; 6608 endbrace = strchr(RExC_parse, '}'); 6609 if ( ! endbrace ) { 6610 RExC_parse++; 6611 vFAIL("Missing right brace on \\N{}"); 6612 } 6613 RExC_parse = endbrace + 1; 6614 6615 6616 /* RExC_parse points at the beginning brace, 6617 endbrace points at the last */ 6618 if ( name[0]=='U' && name[1]=='+' ) { 6619 /* its a "Unicode hex" notation {U+89AB} */ 6620 I32 fl = PERL_SCAN_ALLOW_UNDERSCORES 6621 | PERL_SCAN_DISALLOW_PREFIX 6622 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0); 6623 UV cp; 6624 len = (STRLEN)(endbrace - name - 2); 6625 cp = grok_hex(name + 2, &len, &fl, NULL); 6626 if ( len != (STRLEN)(endbrace - name - 2) ) { 6627 cp = 0xFFFD; 6628 } 6629 if ( valuep ) { 6630 if (cp > 0xff) RExC_utf8 = 1; 6631 *valuep = cp; 6632 return NULL; 6633 } 6634 6635 /* Need to convert to utf8 if either: won't fit into a byte, or the re 6636 * is going to be in utf8 and the representation changes under utf8. */ 6637 if (cp > 0xff || (RExC_utf8 && ! UNI_IS_INVARIANT(cp))) { 6638 U8 string[UTF8_MAXBYTES+1]; 6639 U8 *tmps; 6640 RExC_utf8 = 1; 6641 tmps = uvuni_to_utf8(string, cp); 6642 sv_str = newSVpvn_utf8((char*)string, tmps - string, TRUE); 6643 } else { /* Otherwise, no need for utf8, can skip that step */ 6644 char string; 6645 string = (char)cp; 6646 sv_str= newSVpvn(&string, 1); 6647 } 6648 } else { 6649 /* fetch the charnames handler for this scope */ 6650 HV * const table = GvHV(PL_hintgv); 6651 SV **cvp= table ? 6652 hv_fetchs(table, "charnames", FALSE) : 6653 NULL; 6654 SV *cv= cvp ? *cvp : NULL; 6655 HE *he_str; 6656 int count; 6657 /* create an SV with the name as argument */ 6658 sv_name = newSVpvn(name, endbrace - name); 6659 6660 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) { 6661 vFAIL2("Constant(\\N{%s}) unknown: " 6662 "(possibly a missing \"use charnames ...\")", 6663 SvPVX(sv_name)); 6664 } 6665 if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */ 6666 vFAIL2("Constant(\\N{%s}): " 6667 "$^H{charnames} is not defined",SvPVX(sv_name)); 6668 } 6669 6670 6671 6672 if (!RExC_charnames) { 6673 /* make sure our cache is allocated */ 6674 RExC_charnames = newHV(); 6675 sv_2mortal(MUTABLE_SV(RExC_charnames)); 6676 } 6677 /* see if we have looked this one up before */ 6678 he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 ); 6679 if ( he_str ) { 6680 sv_str = HeVAL(he_str); 6681 cached = 1; 6682 } else { 6683 dSP ; 6684 6685 ENTER ; 6686 SAVETMPS ; 6687 PUSHMARK(SP) ; 6688 6689 XPUSHs(sv_name); 6690 6691 PUTBACK ; 6692 6693 count= call_sv(cv, G_SCALAR); 6694 6695 if (count == 1) { /* XXXX is this right? dmq */ 6696 sv_str = POPs; 6697 SvREFCNT_inc_simple_void(sv_str); 6698 } 6699 6700 SPAGAIN ; 6701 PUTBACK ; 6702 FREETMPS ; 6703 LEAVE ; 6704 6705 if ( !sv_str || !SvOK(sv_str) ) { 6706 vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} " 6707 "did not return a defined value",SvPVX(sv_name)); 6708 } 6709 if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0)) 6710 cached = 1; 6711 } 6712 } 6713 if (valuep) { 6714 char *p = SvPV(sv_str, len); 6715 if (len) { 6716 STRLEN numlen = 1; 6717 if ( SvUTF8(sv_str) ) { 6718 *valuep = utf8_to_uvchr((U8*)p, &numlen); 6719 if (*valuep > 0x7F) 6720 RExC_utf8 = 1; 6721 /* XXXX 6722 We have to turn on utf8 for high bit chars otherwise 6723 we get failures with 6724 6725 "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i 6726 "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i 6727 6728 This is different from what \x{} would do with the same 6729 codepoint, where the condition is > 0xFF. 6730 - dmq 6731 */ 6732 6733 6734 } else { 6735 *valuep = (UV)*p; 6736 /* warn if we havent used the whole string? */ 6737 } 6738 if (numlen<len && SIZE_ONLY && ckWARN(WARN_REGEXP)) { 6739 vWARN2(RExC_parse, 6740 "Ignoring excess chars from \\N{%s} in character class", 6741 SvPVX(sv_name) 6742 ); 6743 } 6744 } else if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { 6745 vWARN2(RExC_parse, 6746 "Ignoring zero length \\N{%s} in character class", 6747 SvPVX(sv_name) 6748 ); 6749 } 6750 if (sv_name) 6751 SvREFCNT_dec(sv_name); 6752 if (!cached) 6753 SvREFCNT_dec(sv_str); 6754 return len ? NULL : (regnode *)&len; 6755 } else if(SvCUR(sv_str)) { 6756 6757 char *s; 6758 char *p, *pend; 6759 STRLEN charlen = 1; 6760 #ifdef DEBUGGING 6761 char * parse_start = name-3; /* needed for the offsets */ 6762 #endif 6763 GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */ 6764 6765 ret = reg_node(pRExC_state, 6766 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT)); 6767 s= STRING(ret); 6768 6769 if ( RExC_utf8 && !SvUTF8(sv_str) ) { 6770 sv_utf8_upgrade(sv_str); 6771 } else if ( !RExC_utf8 && SvUTF8(sv_str) ) { 6772 RExC_utf8= 1; 6773 } 6774 6775 p = SvPV(sv_str, len); 6776 pend = p + len; 6777 /* len is the length written, charlen is the size the char read */ 6778 for ( len = 0; p < pend; p += charlen ) { 6779 if (UTF) { 6780 UV uvc = utf8_to_uvchr((U8*)p, &charlen); 6781 if (FOLD) { 6782 STRLEN foldlen,numlen; 6783 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf; 6784 uvc = toFOLD_uni(uvc, tmpbuf, &foldlen); 6785 /* Emit all the Unicode characters. */ 6786 6787 for (foldbuf = tmpbuf; 6788 foldlen; 6789 foldlen -= numlen) 6790 { 6791 uvc = utf8_to_uvchr(foldbuf, &numlen); 6792 if (numlen > 0) { 6793 const STRLEN unilen = reguni(pRExC_state, uvc, s); 6794 s += unilen; 6795 len += unilen; 6796 /* In EBCDIC the numlen 6797 * and unilen can differ. */ 6798 foldbuf += numlen; 6799 if (numlen >= foldlen) 6800 break; 6801 } 6802 else 6803 break; /* "Can't happen." */ 6804 } 6805 } else { 6806 const STRLEN unilen = reguni(pRExC_state, uvc, s); 6807 if (unilen > 0) { 6808 s += unilen; 6809 len += unilen; 6810 } 6811 } 6812 } else { 6813 len++; 6814 REGC(*p, s++); 6815 } 6816 } 6817 if (SIZE_ONLY) { 6818 RExC_size += STR_SZ(len); 6819 } else { 6820 STR_LEN(ret) = len; 6821 RExC_emit += STR_SZ(len); 6822 } 6823 Set_Node_Cur_Length(ret); /* MJD */ 6824 RExC_parse--; 6825 nextchar(pRExC_state); 6826 } else { /* zero length */ 6827 ret = reg_node(pRExC_state,NOTHING); 6828 } 6829 if (!cached) { 6830 SvREFCNT_dec(sv_str); 6831 } 6832 if (sv_name) { 6833 SvREFCNT_dec(sv_name); 6834 } 6835 return ret; 6836 6837 } 6838 6839 6840 /* 6841 * reg_recode 6842 * 6843 * It returns the code point in utf8 for the value in *encp. 6844 * value: a code value in the source encoding 6845 * encp: a pointer to an Encode object 6846 * 6847 * If the result from Encode is not a single character, 6848 * it returns U+FFFD (Replacement character) and sets *encp to NULL. 6849 */ 6850 STATIC UV 6851 S_reg_recode(pTHX_ const char value, SV **encp) 6852 { 6853 STRLEN numlen = 1; 6854 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP); 6855 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv); 6856 const STRLEN newlen = SvCUR(sv); 6857 UV uv = UNICODE_REPLACEMENT; 6858 6859 PERL_ARGS_ASSERT_REG_RECODE; 6860 6861 if (newlen) 6862 uv = SvUTF8(sv) 6863 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT) 6864 : *(U8*)s; 6865 6866 if (!newlen || numlen != newlen) { 6867 uv = UNICODE_REPLACEMENT; 6868 *encp = NULL; 6869 } 6870 return uv; 6871 } 6872 6873 6874 /* 6875 - regatom - the lowest level 6876 6877 Try to identify anything special at the start of the pattern. If there 6878 is, then handle it as required. This may involve generating a single regop, 6879 such as for an assertion; or it may involve recursing, such as to 6880 handle a () structure. 6881 6882 If the string doesn't start with something special then we gobble up 6883 as much literal text as we can. 6884 6885 Once we have been able to handle whatever type of thing started the 6886 sequence, we return. 6887 6888 Note: we have to be careful with escapes, as they can be both literal 6889 and special, and in the case of \10 and friends can either, depending 6890 on context. Specifically there are two seperate switches for handling 6891 escape sequences, with the one for handling literal escapes requiring 6892 a dummy entry for all of the special escapes that are actually handled 6893 by the other. 6894 */ 6895 6896 STATIC regnode * 6897 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) 6898 { 6899 dVAR; 6900 register regnode *ret = NULL; 6901 I32 flags; 6902 char *parse_start = RExC_parse; 6903 GET_RE_DEBUG_FLAGS_DECL; 6904 DEBUG_PARSE("atom"); 6905 *flagp = WORST; /* Tentatively. */ 6906 6907 PERL_ARGS_ASSERT_REGATOM; 6908 6909 tryagain: 6910 switch ((U8)*RExC_parse) { 6911 case '^': 6912 RExC_seen_zerolen++; 6913 nextchar(pRExC_state); 6914 if (RExC_flags & RXf_PMf_MULTILINE) 6915 ret = reg_node(pRExC_state, MBOL); 6916 else if (RExC_flags & RXf_PMf_SINGLELINE) 6917 ret = reg_node(pRExC_state, SBOL); 6918 else 6919 ret = reg_node(pRExC_state, BOL); 6920 Set_Node_Length(ret, 1); /* MJD */ 6921 break; 6922 case '$': 6923 nextchar(pRExC_state); 6924 if (*RExC_parse) 6925 RExC_seen_zerolen++; 6926 if (RExC_flags & RXf_PMf_MULTILINE) 6927 ret = reg_node(pRExC_state, MEOL); 6928 else if (RExC_flags & RXf_PMf_SINGLELINE) 6929 ret = reg_node(pRExC_state, SEOL); 6930 else 6931 ret = reg_node(pRExC_state, EOL); 6932 Set_Node_Length(ret, 1); /* MJD */ 6933 break; 6934 case '.': 6935 nextchar(pRExC_state); 6936 if (RExC_flags & RXf_PMf_SINGLELINE) 6937 ret = reg_node(pRExC_state, SANY); 6938 else 6939 ret = reg_node(pRExC_state, REG_ANY); 6940 *flagp |= HASWIDTH|SIMPLE; 6941 RExC_naughty++; 6942 Set_Node_Length(ret, 1); /* MJD */ 6943 break; 6944 case '[': 6945 { 6946 char * const oregcomp_parse = ++RExC_parse; 6947 ret = regclass(pRExC_state,depth+1); 6948 if (*RExC_parse != ']') { 6949 RExC_parse = oregcomp_parse; 6950 vFAIL("Unmatched ["); 6951 } 6952 nextchar(pRExC_state); 6953 *flagp |= HASWIDTH|SIMPLE; 6954 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */ 6955 break; 6956 } 6957 case '(': 6958 nextchar(pRExC_state); 6959 ret = reg(pRExC_state, 1, &flags,depth+1); 6960 if (ret == NULL) { 6961 if (flags & TRYAGAIN) { 6962 if (RExC_parse == RExC_end) { 6963 /* Make parent create an empty node if needed. */ 6964 *flagp |= TRYAGAIN; 6965 return(NULL); 6966 } 6967 goto tryagain; 6968 } 6969 return(NULL); 6970 } 6971 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED); 6972 break; 6973 case '|': 6974 case ')': 6975 if (flags & TRYAGAIN) { 6976 *flagp |= TRYAGAIN; 6977 return NULL; 6978 } 6979 vFAIL("Internal urp"); 6980 /* Supposed to be caught earlier. */ 6981 break; 6982 case '{': 6983 if (!regcurly(RExC_parse)) { 6984 RExC_parse++; 6985 goto defchar; 6986 } 6987 /* FALL THROUGH */ 6988 case '?': 6989 case '+': 6990 case '*': 6991 RExC_parse++; 6992 vFAIL("Quantifier follows nothing"); 6993 break; 6994 case 0xDF: 6995 case 0xC3: 6996 case 0xCE: 6997 do_foldchar: 6998 if (!LOC && FOLD) { 6999 U32 len,cp; 7000 len=0; /* silence a spurious compiler warning */ 7001 if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) { 7002 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */ 7003 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */ 7004 ret = reganode(pRExC_state, FOLDCHAR, cp); 7005 Set_Node_Length(ret, 1); /* MJD */ 7006 nextchar(pRExC_state); /* kill whitespace under /x */ 7007 return ret; 7008 } 7009 } 7010 goto outer_default; 7011 case '\\': 7012 /* Special Escapes 7013 7014 This switch handles escape sequences that resolve to some kind 7015 of special regop and not to literal text. Escape sequnces that 7016 resolve to literal text are handled below in the switch marked 7017 "Literal Escapes". 7018 7019 Every entry in this switch *must* have a corresponding entry 7020 in the literal escape switch. However, the opposite is not 7021 required, as the default for this switch is to jump to the 7022 literal text handling code. 7023 */ 7024 switch ((U8)*++RExC_parse) { 7025 case 0xDF: 7026 case 0xC3: 7027 case 0xCE: 7028 goto do_foldchar; 7029 /* Special Escapes */ 7030 case 'A': 7031 RExC_seen_zerolen++; 7032 ret = reg_node(pRExC_state, SBOL); 7033 *flagp |= SIMPLE; 7034 goto finish_meta_pat; 7035 case 'G': 7036 ret = reg_node(pRExC_state, GPOS); 7037 RExC_seen |= REG_SEEN_GPOS; 7038 *flagp |= SIMPLE; 7039 goto finish_meta_pat; 7040 case 'K': 7041 RExC_seen_zerolen++; 7042 ret = reg_node(pRExC_state, KEEPS); 7043 *flagp |= SIMPLE; 7044 /* XXX:dmq : disabling in-place substitution seems to 7045 * be necessary here to avoid cases of memory corruption, as 7046 * with: C<$_="x" x 80; s/x\K/y/> -- rgs 7047 */ 7048 RExC_seen |= REG_SEEN_LOOKBEHIND; 7049 goto finish_meta_pat; 7050 case 'Z': 7051 ret = reg_node(pRExC_state, SEOL); 7052 *flagp |= SIMPLE; 7053 RExC_seen_zerolen++; /* Do not optimize RE away */ 7054 goto finish_meta_pat; 7055 case 'z': 7056 ret = reg_node(pRExC_state, EOS); 7057 *flagp |= SIMPLE; 7058 RExC_seen_zerolen++; /* Do not optimize RE away */ 7059 goto finish_meta_pat; 7060 case 'C': 7061 ret = reg_node(pRExC_state, CANY); 7062 RExC_seen |= REG_SEEN_CANY; 7063 *flagp |= HASWIDTH|SIMPLE; 7064 goto finish_meta_pat; 7065 case 'X': 7066 ret = reg_node(pRExC_state, CLUMP); 7067 *flagp |= HASWIDTH; 7068 goto finish_meta_pat; 7069 case 'w': 7070 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM)); 7071 *flagp |= HASWIDTH|SIMPLE; 7072 goto finish_meta_pat; 7073 case 'W': 7074 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM)); 7075 *flagp |= HASWIDTH|SIMPLE; 7076 goto finish_meta_pat; 7077 case 'b': 7078 RExC_seen_zerolen++; 7079 RExC_seen |= REG_SEEN_LOOKBEHIND; 7080 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND)); 7081 *flagp |= SIMPLE; 7082 goto finish_meta_pat; 7083 case 'B': 7084 RExC_seen_zerolen++; 7085 RExC_seen |= REG_SEEN_LOOKBEHIND; 7086 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND)); 7087 *flagp |= SIMPLE; 7088 goto finish_meta_pat; 7089 case 's': 7090 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE)); 7091 *flagp |= HASWIDTH|SIMPLE; 7092 goto finish_meta_pat; 7093 case 'S': 7094 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE)); 7095 *flagp |= HASWIDTH|SIMPLE; 7096 goto finish_meta_pat; 7097 case 'd': 7098 ret = reg_node(pRExC_state, DIGIT); 7099 *flagp |= HASWIDTH|SIMPLE; 7100 goto finish_meta_pat; 7101 case 'D': 7102 ret = reg_node(pRExC_state, NDIGIT); 7103 *flagp |= HASWIDTH|SIMPLE; 7104 goto finish_meta_pat; 7105 case 'R': 7106 ret = reg_node(pRExC_state, LNBREAK); 7107 *flagp |= HASWIDTH|SIMPLE; 7108 goto finish_meta_pat; 7109 case 'h': 7110 ret = reg_node(pRExC_state, HORIZWS); 7111 *flagp |= HASWIDTH|SIMPLE; 7112 goto finish_meta_pat; 7113 case 'H': 7114 ret = reg_node(pRExC_state, NHORIZWS); 7115 *flagp |= HASWIDTH|SIMPLE; 7116 goto finish_meta_pat; 7117 case 'v': 7118 ret = reg_node(pRExC_state, VERTWS); 7119 *flagp |= HASWIDTH|SIMPLE; 7120 goto finish_meta_pat; 7121 case 'V': 7122 ret = reg_node(pRExC_state, NVERTWS); 7123 *flagp |= HASWIDTH|SIMPLE; 7124 finish_meta_pat: 7125 nextchar(pRExC_state); 7126 Set_Node_Length(ret, 2); /* MJD */ 7127 break; 7128 case 'p': 7129 case 'P': 7130 { 7131 char* const oldregxend = RExC_end; 7132 #ifdef DEBUGGING 7133 char* parse_start = RExC_parse - 2; 7134 #endif 7135 7136 if (RExC_parse[1] == '{') { 7137 /* a lovely hack--pretend we saw [\pX] instead */ 7138 RExC_end = strchr(RExC_parse, '}'); 7139 if (!RExC_end) { 7140 const U8 c = (U8)*RExC_parse; 7141 RExC_parse += 2; 7142 RExC_end = oldregxend; 7143 vFAIL2("Missing right brace on \\%c{}", c); 7144 } 7145 RExC_end++; 7146 } 7147 else { 7148 RExC_end = RExC_parse + 2; 7149 if (RExC_end > oldregxend) 7150 RExC_end = oldregxend; 7151 } 7152 RExC_parse--; 7153 7154 ret = regclass(pRExC_state,depth+1); 7155 7156 RExC_end = oldregxend; 7157 RExC_parse--; 7158 7159 Set_Node_Offset(ret, parse_start + 2); 7160 Set_Node_Cur_Length(ret); 7161 nextchar(pRExC_state); 7162 *flagp |= HASWIDTH|SIMPLE; 7163 } 7164 break; 7165 case 'N': 7166 /* Handle \N{NAME} here and not below because it can be 7167 multicharacter. join_exact() will join them up later on. 7168 Also this makes sure that things like /\N{BLAH}+/ and 7169 \N{BLAH} being multi char Just Happen. dmq*/ 7170 ++RExC_parse; 7171 ret= reg_namedseq(pRExC_state, NULL); 7172 break; 7173 case 'k': /* Handle \k<NAME> and \k'NAME' */ 7174 parse_named_seq: 7175 { 7176 char ch= RExC_parse[1]; 7177 if (ch != '<' && ch != '\'' && ch != '{') { 7178 RExC_parse++; 7179 vFAIL2("Sequence %.2s... not terminated",parse_start); 7180 } else { 7181 /* this pretty much dupes the code for (?P=...) in reg(), if 7182 you change this make sure you change that */ 7183 char* name_start = (RExC_parse += 2); 7184 U32 num = 0; 7185 SV *sv_dat = reg_scan_name(pRExC_state, 7186 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); 7187 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\''; 7188 if (RExC_parse == name_start || *RExC_parse != ch) 7189 vFAIL2("Sequence %.3s... not terminated",parse_start); 7190 7191 if (!SIZE_ONLY) { 7192 num = add_data( pRExC_state, 1, "S" ); 7193 RExC_rxi->data->data[num]=(void*)sv_dat; 7194 SvREFCNT_inc_simple_void(sv_dat); 7195 } 7196 7197 RExC_sawback = 1; 7198 ret = reganode(pRExC_state, 7199 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF), 7200 num); 7201 *flagp |= HASWIDTH; 7202 7203 /* override incorrect value set in reganode MJD */ 7204 Set_Node_Offset(ret, parse_start+1); 7205 Set_Node_Cur_Length(ret); /* MJD */ 7206 nextchar(pRExC_state); 7207 7208 } 7209 break; 7210 } 7211 case 'g': 7212 case '1': case '2': case '3': case '4': 7213 case '5': case '6': case '7': case '8': case '9': 7214 { 7215 I32 num; 7216 bool isg = *RExC_parse == 'g'; 7217 bool isrel = 0; 7218 bool hasbrace = 0; 7219 if (isg) { 7220 RExC_parse++; 7221 if (*RExC_parse == '{') { 7222 RExC_parse++; 7223 hasbrace = 1; 7224 } 7225 if (*RExC_parse == '-') { 7226 RExC_parse++; 7227 isrel = 1; 7228 } 7229 if (hasbrace && !isDIGIT(*RExC_parse)) { 7230 if (isrel) RExC_parse--; 7231 RExC_parse -= 2; 7232 goto parse_named_seq; 7233 } } 7234 num = atoi(RExC_parse); 7235 if (isg && num == 0) 7236 vFAIL("Reference to invalid group 0"); 7237 if (isrel) { 7238 num = RExC_npar - num; 7239 if (num < 1) 7240 vFAIL("Reference to nonexistent or unclosed group"); 7241 } 7242 if (!isg && num > 9 && num >= RExC_npar) 7243 goto defchar; 7244 else { 7245 char * const parse_start = RExC_parse - 1; /* MJD */ 7246 while (isDIGIT(*RExC_parse)) 7247 RExC_parse++; 7248 if (parse_start == RExC_parse - 1) 7249 vFAIL("Unterminated \\g... pattern"); 7250 if (hasbrace) { 7251 if (*RExC_parse != '}') 7252 vFAIL("Unterminated \\g{...} pattern"); 7253 RExC_parse++; 7254 } 7255 if (!SIZE_ONLY) { 7256 if (num > (I32)RExC_rx->nparens) 7257 vFAIL("Reference to nonexistent group"); 7258 } 7259 RExC_sawback = 1; 7260 ret = reganode(pRExC_state, 7261 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF), 7262 num); 7263 *flagp |= HASWIDTH; 7264 7265 /* override incorrect value set in reganode MJD */ 7266 Set_Node_Offset(ret, parse_start+1); 7267 Set_Node_Cur_Length(ret); /* MJD */ 7268 RExC_parse--; 7269 nextchar(pRExC_state); 7270 } 7271 } 7272 break; 7273 case '\0': 7274 if (RExC_parse >= RExC_end) 7275 FAIL("Trailing \\"); 7276 /* FALL THROUGH */ 7277 default: 7278 /* Do not generate "unrecognized" warnings here, we fall 7279 back into the quick-grab loop below */ 7280 parse_start--; 7281 goto defchar; 7282 } 7283 break; 7284 7285 case '#': 7286 if (RExC_flags & RXf_PMf_EXTENDED) { 7287 if ( reg_skipcomment( pRExC_state ) ) 7288 goto tryagain; 7289 } 7290 /* FALL THROUGH */ 7291 7292 default: 7293 outer_default:{ 7294 register STRLEN len; 7295 register UV ender; 7296 register char *p; 7297 char *s; 7298 STRLEN foldlen; 7299 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf; 7300 7301 parse_start = RExC_parse - 1; 7302 7303 RExC_parse++; 7304 7305 defchar: 7306 ender = 0; 7307 ret = reg_node(pRExC_state, 7308 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT)); 7309 s = STRING(ret); 7310 for (len = 0, p = RExC_parse - 1; 7311 len < 127 && p < RExC_end; 7312 len++) 7313 { 7314 char * const oldp = p; 7315 7316 if (RExC_flags & RXf_PMf_EXTENDED) 7317 p = regwhite( pRExC_state, p ); 7318 switch ((U8)*p) { 7319 case 0xDF: 7320 case 0xC3: 7321 case 0xCE: 7322 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF)) 7323 goto normal_default; 7324 case '^': 7325 case '$': 7326 case '.': 7327 case '[': 7328 case '(': 7329 case ')': 7330 case '|': 7331 goto loopdone; 7332 case '\\': 7333 /* Literal Escapes Switch 7334 7335 This switch is meant to handle escape sequences that 7336 resolve to a literal character. 7337 7338 Every escape sequence that represents something 7339 else, like an assertion or a char class, is handled 7340 in the switch marked 'Special Escapes' above in this 7341 routine, but also has an entry here as anything that 7342 isn't explicitly mentioned here will be treated as 7343 an unescaped equivalent literal. 7344 */ 7345 7346 switch ((U8)*++p) { 7347 /* These are all the special escapes. */ 7348 case 0xDF: 7349 case 0xC3: 7350 case 0xCE: 7351 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF)) 7352 goto normal_default; 7353 case 'A': /* Start assertion */ 7354 case 'b': case 'B': /* Word-boundary assertion*/ 7355 case 'C': /* Single char !DANGEROUS! */ 7356 case 'd': case 'D': /* digit class */ 7357 case 'g': case 'G': /* generic-backref, pos assertion */ 7358 case 'h': case 'H': /* HORIZWS */ 7359 case 'k': case 'K': /* named backref, keep marker */ 7360 case 'N': /* named char sequence */ 7361 case 'p': case 'P': /* Unicode property */ 7362 case 'R': /* LNBREAK */ 7363 case 's': case 'S': /* space class */ 7364 case 'v': case 'V': /* VERTWS */ 7365 case 'w': case 'W': /* word class */ 7366 case 'X': /* eXtended Unicode "combining character sequence" */ 7367 case 'z': case 'Z': /* End of line/string assertion */ 7368 --p; 7369 goto loopdone; 7370 7371 /* Anything after here is an escape that resolves to a 7372 literal. (Except digits, which may or may not) 7373 */ 7374 case 'n': 7375 ender = '\n'; 7376 p++; 7377 break; 7378 case 'r': 7379 ender = '\r'; 7380 p++; 7381 break; 7382 case 't': 7383 ender = '\t'; 7384 p++; 7385 break; 7386 case 'f': 7387 ender = '\f'; 7388 p++; 7389 break; 7390 case 'e': 7391 ender = ASCII_TO_NATIVE('\033'); 7392 p++; 7393 break; 7394 case 'a': 7395 ender = ASCII_TO_NATIVE('\007'); 7396 p++; 7397 break; 7398 case 'x': 7399 if (*++p == '{') { 7400 char* const e = strchr(p, '}'); 7401 7402 if (!e) { 7403 RExC_parse = p + 1; 7404 vFAIL("Missing right brace on \\x{}"); 7405 } 7406 else { 7407 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES 7408 | PERL_SCAN_DISALLOW_PREFIX; 7409 STRLEN numlen = e - p - 1; 7410 ender = grok_hex(p + 1, &numlen, &flags, NULL); 7411 if (ender > 0xff) 7412 RExC_utf8 = 1; 7413 p = e + 1; 7414 } 7415 } 7416 else { 7417 I32 flags = PERL_SCAN_DISALLOW_PREFIX; 7418 STRLEN numlen = 2; 7419 ender = grok_hex(p, &numlen, &flags, NULL); 7420 p += numlen; 7421 } 7422 if (PL_encoding && ender < 0x100) 7423 goto recode_encoding; 7424 break; 7425 case 'c': 7426 p++; 7427 ender = UCHARAT(p++); 7428 ender = toCTRL(ender); 7429 break; 7430 case '0': case '1': case '2': case '3':case '4': 7431 case '5': case '6': case '7': case '8':case '9': 7432 if (*p == '0' || 7433 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) { 7434 I32 flags = 0; 7435 STRLEN numlen = 3; 7436 ender = grok_oct(p, &numlen, &flags, NULL); 7437 p += numlen; 7438 } 7439 else { 7440 --p; 7441 goto loopdone; 7442 } 7443 if (PL_encoding && ender < 0x100) 7444 goto recode_encoding; 7445 break; 7446 recode_encoding: 7447 { 7448 SV* enc = PL_encoding; 7449 ender = reg_recode((const char)(U8)ender, &enc); 7450 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP)) 7451 vWARN(p, "Invalid escape in the specified encoding"); 7452 RExC_utf8 = 1; 7453 } 7454 break; 7455 case '\0': 7456 if (p >= RExC_end) 7457 FAIL("Trailing \\"); 7458 /* FALL THROUGH */ 7459 default: 7460 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP)) 7461 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p)); 7462 goto normal_default; 7463 } 7464 break; 7465 default: 7466 normal_default: 7467 if (UTF8_IS_START(*p) && UTF) { 7468 STRLEN numlen; 7469 ender = utf8n_to_uvchr((U8*)p, RExC_end - p, 7470 &numlen, UTF8_ALLOW_DEFAULT); 7471 p += numlen; 7472 } 7473 else 7474 ender = *p++; 7475 break; 7476 } 7477 if ( RExC_flags & RXf_PMf_EXTENDED) 7478 p = regwhite( pRExC_state, p ); 7479 if (UTF && FOLD) { 7480 /* Prime the casefolded buffer. */ 7481 ender = toFOLD_uni(ender, tmpbuf, &foldlen); 7482 } 7483 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */ 7484 if (len) 7485 p = oldp; 7486 else if (UTF) { 7487 if (FOLD) { 7488 /* Emit all the Unicode characters. */ 7489 STRLEN numlen; 7490 for (foldbuf = tmpbuf; 7491 foldlen; 7492 foldlen -= numlen) { 7493 ender = utf8_to_uvchr(foldbuf, &numlen); 7494 if (numlen > 0) { 7495 const STRLEN unilen = reguni(pRExC_state, ender, s); 7496 s += unilen; 7497 len += unilen; 7498 /* In EBCDIC the numlen 7499 * and unilen can differ. */ 7500 foldbuf += numlen; 7501 if (numlen >= foldlen) 7502 break; 7503 } 7504 else 7505 break; /* "Can't happen." */ 7506 } 7507 } 7508 else { 7509 const STRLEN unilen = reguni(pRExC_state, ender, s); 7510 if (unilen > 0) { 7511 s += unilen; 7512 len += unilen; 7513 } 7514 } 7515 } 7516 else { 7517 len++; 7518 REGC((char)ender, s++); 7519 } 7520 break; 7521 } 7522 if (UTF) { 7523 if (FOLD) { 7524 /* Emit all the Unicode characters. */ 7525 STRLEN numlen; 7526 for (foldbuf = tmpbuf; 7527 foldlen; 7528 foldlen -= numlen) { 7529 ender = utf8_to_uvchr(foldbuf, &numlen); 7530 if (numlen > 0) { 7531 const STRLEN unilen = reguni(pRExC_state, ender, s); 7532 len += unilen; 7533 s += unilen; 7534 /* In EBCDIC the numlen 7535 * and unilen can differ. */ 7536 foldbuf += numlen; 7537 if (numlen >= foldlen) 7538 break; 7539 } 7540 else 7541 break; 7542 } 7543 } 7544 else { 7545 const STRLEN unilen = reguni(pRExC_state, ender, s); 7546 if (unilen > 0) { 7547 s += unilen; 7548 len += unilen; 7549 } 7550 } 7551 len--; 7552 } 7553 else 7554 REGC((char)ender, s++); 7555 } 7556 loopdone: 7557 RExC_parse = p - 1; 7558 Set_Node_Cur_Length(ret); /* MJD */ 7559 nextchar(pRExC_state); 7560 { 7561 /* len is STRLEN which is unsigned, need to copy to signed */ 7562 IV iv = len; 7563 if (iv < 0) 7564 vFAIL("Internal disaster"); 7565 } 7566 if (len > 0) 7567 *flagp |= HASWIDTH; 7568 if (len == 1 && UNI_IS_INVARIANT(ender)) 7569 *flagp |= SIMPLE; 7570 7571 if (SIZE_ONLY) 7572 RExC_size += STR_SZ(len); 7573 else { 7574 STR_LEN(ret) = len; 7575 RExC_emit += STR_SZ(len); 7576 } 7577 } 7578 break; 7579 } 7580 7581 return(ret); 7582 } 7583 7584 STATIC char * 7585 S_regwhite( RExC_state_t *pRExC_state, char *p ) 7586 { 7587 const char *e = RExC_end; 7588 7589 PERL_ARGS_ASSERT_REGWHITE; 7590 7591 while (p < e) { 7592 if (isSPACE(*p)) 7593 ++p; 7594 else if (*p == '#') { 7595 bool ended = 0; 7596 do { 7597 if (*p++ == '\n') { 7598 ended = 1; 7599 break; 7600 } 7601 } while (p < e); 7602 if (!ended) 7603 RExC_seen |= REG_SEEN_RUN_ON_COMMENT; 7604 } 7605 else 7606 break; 7607 } 7608 return p; 7609 } 7610 7611 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]]. 7612 Character classes ([:foo:]) can also be negated ([:^foo:]). 7613 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise. 7614 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed, 7615 but trigger failures because they are currently unimplemented. */ 7616 7617 #define POSIXCC_DONE(c) ((c) == ':') 7618 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.') 7619 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c)) 7620 7621 STATIC I32 7622 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value) 7623 { 7624 dVAR; 7625 I32 namedclass = OOB_NAMEDCLASS; 7626 7627 PERL_ARGS_ASSERT_REGPPOSIXCC; 7628 7629 if (value == '[' && RExC_parse + 1 < RExC_end && 7630 /* I smell either [: or [= or [. -- POSIX has been here, right? */ 7631 POSIXCC(UCHARAT(RExC_parse))) { 7632 const char c = UCHARAT(RExC_parse); 7633 char* const s = RExC_parse++; 7634 7635 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c) 7636 RExC_parse++; 7637 if (RExC_parse == RExC_end) 7638 /* Grandfather lone [:, [=, [. */ 7639 RExC_parse = s; 7640 else { 7641 const char* const t = RExC_parse++; /* skip over the c */ 7642 assert(*t == c); 7643 7644 if (UCHARAT(RExC_parse) == ']') { 7645 const char *posixcc = s + 1; 7646 RExC_parse++; /* skip over the ending ] */ 7647 7648 if (*s == ':') { 7649 const I32 complement = *posixcc == '^' ? *posixcc++ : 0; 7650 const I32 skip = t - posixcc; 7651 7652 /* Initially switch on the length of the name. */ 7653 switch (skip) { 7654 case 4: 7655 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */ 7656 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM; 7657 break; 7658 case 5: 7659 /* Names all of length 5. */ 7660 /* alnum alpha ascii blank cntrl digit graph lower 7661 print punct space upper */ 7662 /* Offset 4 gives the best switch position. */ 7663 switch (posixcc[4]) { 7664 case 'a': 7665 if (memEQ(posixcc, "alph", 4)) /* alpha */ 7666 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA; 7667 break; 7668 case 'e': 7669 if (memEQ(posixcc, "spac", 4)) /* space */ 7670 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC; 7671 break; 7672 case 'h': 7673 if (memEQ(posixcc, "grap", 4)) /* graph */ 7674 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH; 7675 break; 7676 case 'i': 7677 if (memEQ(posixcc, "asci", 4)) /* ascii */ 7678 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII; 7679 break; 7680 case 'k': 7681 if (memEQ(posixcc, "blan", 4)) /* blank */ 7682 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK; 7683 break; 7684 case 'l': 7685 if (memEQ(posixcc, "cntr", 4)) /* cntrl */ 7686 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL; 7687 break; 7688 case 'm': 7689 if (memEQ(posixcc, "alnu", 4)) /* alnum */ 7690 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC; 7691 break; 7692 case 'r': 7693 if (memEQ(posixcc, "lowe", 4)) /* lower */ 7694 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER; 7695 else if (memEQ(posixcc, "uppe", 4)) /* upper */ 7696 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER; 7697 break; 7698 case 't': 7699 if (memEQ(posixcc, "digi", 4)) /* digit */ 7700 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT; 7701 else if (memEQ(posixcc, "prin", 4)) /* print */ 7702 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT; 7703 else if (memEQ(posixcc, "punc", 4)) /* punct */ 7704 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT; 7705 break; 7706 } 7707 break; 7708 case 6: 7709 if (memEQ(posixcc, "xdigit", 6)) 7710 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT; 7711 break; 7712 } 7713 7714 if (namedclass == OOB_NAMEDCLASS) 7715 Simple_vFAIL3("POSIX class [:%.*s:] unknown", 7716 t - s - 1, s + 1); 7717 assert (posixcc[skip] == ':'); 7718 assert (posixcc[skip+1] == ']'); 7719 } else if (!SIZE_ONLY) { 7720 /* [[=foo=]] and [[.foo.]] are still future. */ 7721 7722 /* adjust RExC_parse so the warning shows after 7723 the class closes */ 7724 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']') 7725 RExC_parse++; 7726 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c); 7727 } 7728 } else { 7729 /* Maternal grandfather: 7730 * "[:" ending in ":" but not in ":]" */ 7731 RExC_parse = s; 7732 } 7733 } 7734 } 7735 7736 return namedclass; 7737 } 7738 7739 STATIC void 7740 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state) 7741 { 7742 dVAR; 7743 7744 PERL_ARGS_ASSERT_CHECKPOSIXCC; 7745 7746 if (POSIXCC(UCHARAT(RExC_parse))) { 7747 const char *s = RExC_parse; 7748 const char c = *s++; 7749 7750 while (isALNUM(*s)) 7751 s++; 7752 if (*s && c == *s && s[1] == ']') { 7753 if (ckWARN(WARN_REGEXP)) 7754 vWARN3(s+2, 7755 "POSIX syntax [%c %c] belongs inside character classes", 7756 c, c); 7757 7758 /* [[=foo=]] and [[.foo.]] are still future. */ 7759 if (POSIXCC_NOTYET(c)) { 7760 /* adjust RExC_parse so the error shows after 7761 the class closes */ 7762 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']') 7763 NOOP; 7764 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c); 7765 } 7766 } 7767 } 7768 } 7769 7770 7771 #define _C_C_T_(NAME,TEST,WORD) \ 7772 ANYOF_##NAME: \ 7773 if (LOC) \ 7774 ANYOF_CLASS_SET(ret, ANYOF_##NAME); \ 7775 else { \ 7776 for (value = 0; value < 256; value++) \ 7777 if (TEST) \ 7778 ANYOF_BITMAP_SET(ret, value); \ 7779 } \ 7780 yesno = '+'; \ 7781 what = WORD; \ 7782 break; \ 7783 case ANYOF_N##NAME: \ 7784 if (LOC) \ 7785 ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \ 7786 else { \ 7787 for (value = 0; value < 256; value++) \ 7788 if (!TEST) \ 7789 ANYOF_BITMAP_SET(ret, value); \ 7790 } \ 7791 yesno = '!'; \ 7792 what = WORD; \ 7793 break 7794 7795 #define _C_C_T_NOLOC_(NAME,TEST,WORD) \ 7796 ANYOF_##NAME: \ 7797 for (value = 0; value < 256; value++) \ 7798 if (TEST) \ 7799 ANYOF_BITMAP_SET(ret, value); \ 7800 yesno = '+'; \ 7801 what = WORD; \ 7802 break; \ 7803 case ANYOF_N##NAME: \ 7804 for (value = 0; value < 256; value++) \ 7805 if (!TEST) \ 7806 ANYOF_BITMAP_SET(ret, value); \ 7807 yesno = '!'; \ 7808 what = WORD; \ 7809 break 7810 7811 /* 7812 We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test 7813 so that it is possible to override the option here without having to 7814 rebuild the entire core. as we are required to do if we change regcomp.h 7815 which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined. 7816 */ 7817 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS 7818 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS 7819 #endif 7820 7821 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS 7822 #define POSIX_CC_UNI_NAME(CCNAME) CCNAME 7823 #else 7824 #define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME 7825 #endif 7826 7827 /* 7828 parse a class specification and produce either an ANYOF node that 7829 matches the pattern or if the pattern matches a single char only and 7830 that char is < 256 and we are case insensitive then we produce an 7831 EXACT node instead. 7832 */ 7833 7834 STATIC regnode * 7835 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) 7836 { 7837 dVAR; 7838 register UV nextvalue; 7839 register IV prevvalue = OOB_UNICODE; 7840 register IV range = 0; 7841 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */ 7842 register regnode *ret; 7843 STRLEN numlen; 7844 IV namedclass; 7845 char *rangebegin = NULL; 7846 bool need_class = 0; 7847 SV *listsv = NULL; 7848 UV n; 7849 bool optimize_invert = TRUE; 7850 AV* unicode_alternate = NULL; 7851 #ifdef EBCDIC 7852 UV literal_endpoint = 0; 7853 #endif 7854 UV stored = 0; /* number of chars stored in the class */ 7855 7856 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in 7857 case we need to change the emitted regop to an EXACT. */ 7858 const char * orig_parse = RExC_parse; 7859 GET_RE_DEBUG_FLAGS_DECL; 7860 7861 PERL_ARGS_ASSERT_REGCLASS; 7862 #ifndef DEBUGGING 7863 PERL_UNUSED_ARG(depth); 7864 #endif 7865 7866 DEBUG_PARSE("clas"); 7867 7868 /* Assume we are going to generate an ANYOF node. */ 7869 ret = reganode(pRExC_state, ANYOF, 0); 7870 7871 if (!SIZE_ONLY) 7872 ANYOF_FLAGS(ret) = 0; 7873 7874 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */ 7875 RExC_naughty++; 7876 RExC_parse++; 7877 if (!SIZE_ONLY) 7878 ANYOF_FLAGS(ret) |= ANYOF_INVERT; 7879 } 7880 7881 if (SIZE_ONLY) { 7882 RExC_size += ANYOF_SKIP; 7883 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */ 7884 } 7885 else { 7886 RExC_emit += ANYOF_SKIP; 7887 if (FOLD) 7888 ANYOF_FLAGS(ret) |= ANYOF_FOLD; 7889 if (LOC) 7890 ANYOF_FLAGS(ret) |= ANYOF_LOCALE; 7891 ANYOF_BITMAP_ZERO(ret); 7892 listsv = newSVpvs("# comment\n"); 7893 } 7894 7895 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0; 7896 7897 if (!SIZE_ONLY && POSIXCC(nextvalue)) 7898 checkposixcc(pRExC_state); 7899 7900 /* allow 1st char to be ] (allowing it to be - is dealt with later) */ 7901 if (UCHARAT(RExC_parse) == ']') 7902 goto charclassloop; 7903 7904 parseit: 7905 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') { 7906 7907 charclassloop: 7908 7909 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */ 7910 7911 if (!range) 7912 rangebegin = RExC_parse; 7913 if (UTF) { 7914 value = utf8n_to_uvchr((U8*)RExC_parse, 7915 RExC_end - RExC_parse, 7916 &numlen, UTF8_ALLOW_DEFAULT); 7917 RExC_parse += numlen; 7918 } 7919 else 7920 value = UCHARAT(RExC_parse++); 7921 7922 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0; 7923 if (value == '[' && POSIXCC(nextvalue)) 7924 namedclass = regpposixcc(pRExC_state, value); 7925 else if (value == '\\') { 7926 if (UTF) { 7927 value = utf8n_to_uvchr((U8*)RExC_parse, 7928 RExC_end - RExC_parse, 7929 &numlen, UTF8_ALLOW_DEFAULT); 7930 RExC_parse += numlen; 7931 } 7932 else 7933 value = UCHARAT(RExC_parse++); 7934 /* Some compilers cannot handle switching on 64-bit integer 7935 * values, therefore value cannot be an UV. Yes, this will 7936 * be a problem later if we want switch on Unicode. 7937 * A similar issue a little bit later when switching on 7938 * namedclass. --jhi */ 7939 switch ((I32)value) { 7940 case 'w': namedclass = ANYOF_ALNUM; break; 7941 case 'W': namedclass = ANYOF_NALNUM; break; 7942 case 's': namedclass = ANYOF_SPACE; break; 7943 case 'S': namedclass = ANYOF_NSPACE; break; 7944 case 'd': namedclass = ANYOF_DIGIT; break; 7945 case 'D': namedclass = ANYOF_NDIGIT; break; 7946 case 'v': namedclass = ANYOF_VERTWS; break; 7947 case 'V': namedclass = ANYOF_NVERTWS; break; 7948 case 'h': namedclass = ANYOF_HORIZWS; break; 7949 case 'H': namedclass = ANYOF_NHORIZWS; break; 7950 case 'N': /* Handle \N{NAME} in class */ 7951 { 7952 /* We only pay attention to the first char of 7953 multichar strings being returned. I kinda wonder 7954 if this makes sense as it does change the behaviour 7955 from earlier versions, OTOH that behaviour was broken 7956 as well. */ 7957 UV v; /* value is register so we cant & it /grrr */ 7958 if (reg_namedseq(pRExC_state, &v)) { 7959 goto parseit; 7960 } 7961 value= v; 7962 } 7963 break; 7964 case 'p': 7965 case 'P': 7966 { 7967 char *e; 7968 if (RExC_parse >= RExC_end) 7969 vFAIL2("Empty \\%c{}", (U8)value); 7970 if (*RExC_parse == '{') { 7971 const U8 c = (U8)value; 7972 e = strchr(RExC_parse++, '}'); 7973 if (!e) 7974 vFAIL2("Missing right brace on \\%c{}", c); 7975 while (isSPACE(UCHARAT(RExC_parse))) 7976 RExC_parse++; 7977 if (e == RExC_parse) 7978 vFAIL2("Empty \\%c{}", c); 7979 n = e - RExC_parse; 7980 while (isSPACE(UCHARAT(RExC_parse + n - 1))) 7981 n--; 7982 } 7983 else { 7984 e = RExC_parse; 7985 n = 1; 7986 } 7987 if (!SIZE_ONLY) { 7988 if (UCHARAT(RExC_parse) == '^') { 7989 RExC_parse++; 7990 n--; 7991 value = value == 'p' ? 'P' : 'p'; /* toggle */ 7992 while (isSPACE(UCHARAT(RExC_parse))) { 7993 RExC_parse++; 7994 n--; 7995 } 7996 } 7997 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n", 7998 (value=='p' ? '+' : '!'), (int)n, RExC_parse); 7999 } 8000 RExC_parse = e + 1; 8001 ANYOF_FLAGS(ret) |= ANYOF_UNICODE; 8002 namedclass = ANYOF_MAX; /* no official name, but it's named */ 8003 } 8004 break; 8005 case 'n': value = '\n'; break; 8006 case 'r': value = '\r'; break; 8007 case 't': value = '\t'; break; 8008 case 'f': value = '\f'; break; 8009 case 'b': value = '\b'; break; 8010 case 'e': value = ASCII_TO_NATIVE('\033');break; 8011 case 'a': value = ASCII_TO_NATIVE('\007');break; 8012 case 'x': 8013 if (*RExC_parse == '{') { 8014 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES 8015 | PERL_SCAN_DISALLOW_PREFIX; 8016 char * const e = strchr(RExC_parse++, '}'); 8017 if (!e) 8018 vFAIL("Missing right brace on \\x{}"); 8019 8020 numlen = e - RExC_parse; 8021 value = grok_hex(RExC_parse, &numlen, &flags, NULL); 8022 RExC_parse = e + 1; 8023 } 8024 else { 8025 I32 flags = PERL_SCAN_DISALLOW_PREFIX; 8026 numlen = 2; 8027 value = grok_hex(RExC_parse, &numlen, &flags, NULL); 8028 RExC_parse += numlen; 8029 } 8030 if (PL_encoding && value < 0x100) 8031 goto recode_encoding; 8032 break; 8033 case 'c': 8034 value = UCHARAT(RExC_parse++); 8035 value = toCTRL(value); 8036 break; 8037 case '0': case '1': case '2': case '3': case '4': 8038 case '5': case '6': case '7': case '8': case '9': 8039 { 8040 I32 flags = 0; 8041 numlen = 3; 8042 value = grok_oct(--RExC_parse, &numlen, &flags, NULL); 8043 RExC_parse += numlen; 8044 if (PL_encoding && value < 0x100) 8045 goto recode_encoding; 8046 break; 8047 } 8048 recode_encoding: 8049 { 8050 SV* enc = PL_encoding; 8051 value = reg_recode((const char)(U8)value, &enc); 8052 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP)) 8053 vWARN(RExC_parse, 8054 "Invalid escape in the specified encoding"); 8055 break; 8056 } 8057 default: 8058 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP)) 8059 vWARN2(RExC_parse, 8060 "Unrecognized escape \\%c in character class passed through", 8061 (int)value); 8062 break; 8063 } 8064 } /* end of \blah */ 8065 #ifdef EBCDIC 8066 else 8067 literal_endpoint++; 8068 #endif 8069 8070 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */ 8071 8072 if (!SIZE_ONLY && !need_class) 8073 ANYOF_CLASS_ZERO(ret); 8074 8075 need_class = 1; 8076 8077 /* a bad range like a-\d, a-[:digit:] ? */ 8078 if (range) { 8079 if (!SIZE_ONLY) { 8080 if (ckWARN(WARN_REGEXP)) { 8081 const int w = 8082 RExC_parse >= rangebegin ? 8083 RExC_parse - rangebegin : 0; 8084 vWARN4(RExC_parse, 8085 "False [] range \"%*.*s\"", 8086 w, w, rangebegin); 8087 } 8088 if (prevvalue < 256) { 8089 ANYOF_BITMAP_SET(ret, prevvalue); 8090 ANYOF_BITMAP_SET(ret, '-'); 8091 } 8092 else { 8093 ANYOF_FLAGS(ret) |= ANYOF_UNICODE; 8094 Perl_sv_catpvf(aTHX_ listsv, 8095 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-'); 8096 } 8097 } 8098 8099 range = 0; /* this was not a true range */ 8100 } 8101 8102 8103 8104 if (!SIZE_ONLY) { 8105 const char *what = NULL; 8106 char yesno = 0; 8107 8108 if (namedclass > OOB_NAMEDCLASS) 8109 optimize_invert = FALSE; 8110 /* Possible truncation here but in some 64-bit environments 8111 * the compiler gets heartburn about switch on 64-bit values. 8112 * A similar issue a little earlier when switching on value. 8113 * --jhi */ 8114 switch ((I32)namedclass) { 8115 8116 case _C_C_T_(ALNUMC, isALNUMC(value), POSIX_CC_UNI_NAME("Alnum")); 8117 case _C_C_T_(ALPHA, isALPHA(value), POSIX_CC_UNI_NAME("Alpha")); 8118 case _C_C_T_(BLANK, isBLANK(value), POSIX_CC_UNI_NAME("Blank")); 8119 case _C_C_T_(CNTRL, isCNTRL(value), POSIX_CC_UNI_NAME("Cntrl")); 8120 case _C_C_T_(GRAPH, isGRAPH(value), POSIX_CC_UNI_NAME("Graph")); 8121 case _C_C_T_(LOWER, isLOWER(value), POSIX_CC_UNI_NAME("Lower")); 8122 case _C_C_T_(PRINT, isPRINT(value), POSIX_CC_UNI_NAME("Print")); 8123 case _C_C_T_(PSXSPC, isPSXSPC(value), POSIX_CC_UNI_NAME("Space")); 8124 case _C_C_T_(PUNCT, isPUNCT(value), POSIX_CC_UNI_NAME("Punct")); 8125 case _C_C_T_(UPPER, isUPPER(value), POSIX_CC_UNI_NAME("Upper")); 8126 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS 8127 case _C_C_T_(ALNUM, isALNUM(value), "Word"); 8128 case _C_C_T_(SPACE, isSPACE(value), "SpacePerl"); 8129 #else 8130 case _C_C_T_(SPACE, isSPACE(value), "PerlSpace"); 8131 case _C_C_T_(ALNUM, isALNUM(value), "PerlWord"); 8132 #endif 8133 case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit"); 8134 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace"); 8135 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace"); 8136 case ANYOF_ASCII: 8137 if (LOC) 8138 ANYOF_CLASS_SET(ret, ANYOF_ASCII); 8139 else { 8140 #ifndef EBCDIC 8141 for (value = 0; value < 128; value++) 8142 ANYOF_BITMAP_SET(ret, value); 8143 #else /* EBCDIC */ 8144 for (value = 0; value < 256; value++) { 8145 if (isASCII(value)) 8146 ANYOF_BITMAP_SET(ret, value); 8147 } 8148 #endif /* EBCDIC */ 8149 } 8150 yesno = '+'; 8151 what = "ASCII"; 8152 break; 8153 case ANYOF_NASCII: 8154 if (LOC) 8155 ANYOF_CLASS_SET(ret, ANYOF_NASCII); 8156 else { 8157 #ifndef EBCDIC 8158 for (value = 128; value < 256; value++) 8159 ANYOF_BITMAP_SET(ret, value); 8160 #else /* EBCDIC */ 8161 for (value = 0; value < 256; value++) { 8162 if (!isASCII(value)) 8163 ANYOF_BITMAP_SET(ret, value); 8164 } 8165 #endif /* EBCDIC */ 8166 } 8167 yesno = '!'; 8168 what = "ASCII"; 8169 break; 8170 case ANYOF_DIGIT: 8171 if (LOC) 8172 ANYOF_CLASS_SET(ret, ANYOF_DIGIT); 8173 else { 8174 /* consecutive digits assumed */ 8175 for (value = '0'; value <= '9'; value++) 8176 ANYOF_BITMAP_SET(ret, value); 8177 } 8178 yesno = '+'; 8179 what = POSIX_CC_UNI_NAME("Digit"); 8180 break; 8181 case ANYOF_NDIGIT: 8182 if (LOC) 8183 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT); 8184 else { 8185 /* consecutive digits assumed */ 8186 for (value = 0; value < '0'; value++) 8187 ANYOF_BITMAP_SET(ret, value); 8188 for (value = '9' + 1; value < 256; value++) 8189 ANYOF_BITMAP_SET(ret, value); 8190 } 8191 yesno = '!'; 8192 what = POSIX_CC_UNI_NAME("Digit"); 8193 break; 8194 case ANYOF_MAX: 8195 /* this is to handle \p and \P */ 8196 break; 8197 default: 8198 vFAIL("Invalid [::] class"); 8199 break; 8200 } 8201 if (what) { 8202 /* Strings such as "+utf8::isWord\n" */ 8203 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what); 8204 } 8205 if (LOC) 8206 ANYOF_FLAGS(ret) |= ANYOF_CLASS; 8207 continue; 8208 } 8209 } /* end of namedclass \blah */ 8210 8211 if (range) { 8212 if (prevvalue > (IV)value) /* b-a */ { 8213 const int w = RExC_parse - rangebegin; 8214 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin); 8215 range = 0; /* not a valid range */ 8216 } 8217 } 8218 else { 8219 prevvalue = value; /* save the beginning of the range */ 8220 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end && 8221 RExC_parse[1] != ']') { 8222 RExC_parse++; 8223 8224 /* a bad range like \w-, [:word:]- ? */ 8225 if (namedclass > OOB_NAMEDCLASS) { 8226 if (ckWARN(WARN_REGEXP)) { 8227 const int w = 8228 RExC_parse >= rangebegin ? 8229 RExC_parse - rangebegin : 0; 8230 vWARN4(RExC_parse, 8231 "False [] range \"%*.*s\"", 8232 w, w, rangebegin); 8233 } 8234 if (!SIZE_ONLY) 8235 ANYOF_BITMAP_SET(ret, '-'); 8236 } else 8237 range = 1; /* yeah, it's a range! */ 8238 continue; /* but do it the next time */ 8239 } 8240 } 8241 8242 /* now is the next time */ 8243 /*stored += (value - prevvalue + 1);*/ 8244 if (!SIZE_ONLY) { 8245 if (prevvalue < 256) { 8246 const IV ceilvalue = value < 256 ? value : 255; 8247 IV i; 8248 #ifdef EBCDIC 8249 /* In EBCDIC [\x89-\x91] should include 8250 * the \x8e but [i-j] should not. */ 8251 if (literal_endpoint == 2 && 8252 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) || 8253 (isUPPER(prevvalue) && isUPPER(ceilvalue)))) 8254 { 8255 if (isLOWER(prevvalue)) { 8256 for (i = prevvalue; i <= ceilvalue; i++) 8257 if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) { 8258 stored++; 8259 ANYOF_BITMAP_SET(ret, i); 8260 } 8261 } else { 8262 for (i = prevvalue; i <= ceilvalue; i++) 8263 if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) { 8264 stored++; 8265 ANYOF_BITMAP_SET(ret, i); 8266 } 8267 } 8268 } 8269 else 8270 #endif 8271 for (i = prevvalue; i <= ceilvalue; i++) { 8272 if (!ANYOF_BITMAP_TEST(ret,i)) { 8273 stored++; 8274 ANYOF_BITMAP_SET(ret, i); 8275 } 8276 } 8277 } 8278 if (value > 255 || UTF) { 8279 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue); 8280 const UV natvalue = NATIVE_TO_UNI(value); 8281 stored+=2; /* can't optimize this class */ 8282 ANYOF_FLAGS(ret) |= ANYOF_UNICODE; 8283 if (prevnatvalue < natvalue) { /* what about > ? */ 8284 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n", 8285 prevnatvalue, natvalue); 8286 } 8287 else if (prevnatvalue == natvalue) { 8288 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue); 8289 if (FOLD) { 8290 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; 8291 STRLEN foldlen; 8292 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen); 8293 8294 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */ 8295 if (RExC_precomp[0] == ':' && 8296 RExC_precomp[1] == '[' && 8297 (f == 0xDF || f == 0x92)) { 8298 f = NATIVE_TO_UNI(f); 8299 } 8300 #endif 8301 /* If folding and foldable and a single 8302 * character, insert also the folded version 8303 * to the charclass. */ 8304 if (f != value) { 8305 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */ 8306 if ((RExC_precomp[0] == ':' && 8307 RExC_precomp[1] == '[' && 8308 (f == 0xA2 && 8309 (value == 0xFB05 || value == 0xFB06))) ? 8310 foldlen == ((STRLEN)UNISKIP(f) - 1) : 8311 foldlen == (STRLEN)UNISKIP(f) ) 8312 #else 8313 if (foldlen == (STRLEN)UNISKIP(f)) 8314 #endif 8315 Perl_sv_catpvf(aTHX_ listsv, 8316 "%04"UVxf"\n", f); 8317 else { 8318 /* Any multicharacter foldings 8319 * require the following transform: 8320 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst) 8321 * where E folds into "pq" and F folds 8322 * into "rst", all other characters 8323 * fold to single characters. We save 8324 * away these multicharacter foldings, 8325 * to be later saved as part of the 8326 * additional "s" data. */ 8327 SV *sv; 8328 8329 if (!unicode_alternate) 8330 unicode_alternate = newAV(); 8331 sv = newSVpvn_utf8((char*)foldbuf, foldlen, 8332 TRUE); 8333 av_push(unicode_alternate, sv); 8334 } 8335 } 8336 8337 /* If folding and the value is one of the Greek 8338 * sigmas insert a few more sigmas to make the 8339 * folding rules of the sigmas to work right. 8340 * Note that not all the possible combinations 8341 * are handled here: some of them are handled 8342 * by the standard folding rules, and some of 8343 * them (literal or EXACTF cases) are handled 8344 * during runtime in regexec.c:S_find_byclass(). */ 8345 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) { 8346 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", 8347 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA); 8348 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", 8349 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA); 8350 } 8351 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA) 8352 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", 8353 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA); 8354 } 8355 } 8356 } 8357 #ifdef EBCDIC 8358 literal_endpoint = 0; 8359 #endif 8360 } 8361 8362 range = 0; /* this range (if it was one) is done now */ 8363 } 8364 8365 if (need_class) { 8366 ANYOF_FLAGS(ret) |= ANYOF_LARGE; 8367 if (SIZE_ONLY) 8368 RExC_size += ANYOF_CLASS_ADD_SKIP; 8369 else 8370 RExC_emit += ANYOF_CLASS_ADD_SKIP; 8371 } 8372 8373 8374 if (SIZE_ONLY) 8375 return ret; 8376 /****** !SIZE_ONLY AFTER HERE *********/ 8377 8378 if( stored == 1 && (value < 128 || (value < 256 && !UTF)) 8379 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) ) 8380 ) { 8381 /* optimize single char class to an EXACT node 8382 but *only* when its not a UTF/high char */ 8383 const char * cur_parse= RExC_parse; 8384 RExC_emit = (regnode *)orig_emit; 8385 RExC_parse = (char *)orig_parse; 8386 ret = reg_node(pRExC_state, 8387 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT)); 8388 RExC_parse = (char *)cur_parse; 8389 *STRING(ret)= (char)value; 8390 STR_LEN(ret)= 1; 8391 RExC_emit += STR_SZ(1); 8392 if (listsv) { 8393 SvREFCNT_dec(listsv); 8394 } 8395 return ret; 8396 } 8397 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */ 8398 if ( /* If the only flag is folding (plus possibly inversion). */ 8399 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD) 8400 ) { 8401 for (value = 0; value < 256; ++value) { 8402 if (ANYOF_BITMAP_TEST(ret, value)) { 8403 UV fold = PL_fold[value]; 8404 8405 if (fold != value) 8406 ANYOF_BITMAP_SET(ret, fold); 8407 } 8408 } 8409 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD; 8410 } 8411 8412 /* optimize inverted simple patterns (e.g. [^a-z]) */ 8413 if (optimize_invert && 8414 /* If the only flag is inversion. */ 8415 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) { 8416 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value) 8417 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL; 8418 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL; 8419 } 8420 { 8421 AV * const av = newAV(); 8422 SV *rv; 8423 /* The 0th element stores the character class description 8424 * in its textual form: used later (regexec.c:Perl_regclass_swash()) 8425 * to initialize the appropriate swash (which gets stored in 8426 * the 1st element), and also useful for dumping the regnode. 8427 * The 2nd element stores the multicharacter foldings, 8428 * used later (regexec.c:S_reginclass()). */ 8429 av_store(av, 0, listsv); 8430 av_store(av, 1, NULL); 8431 av_store(av, 2, MUTABLE_SV(unicode_alternate)); 8432 rv = newRV_noinc(MUTABLE_SV(av)); 8433 n = add_data(pRExC_state, 1, "s"); 8434 RExC_rxi->data->data[n] = (void*)rv; 8435 ARG_SET(ret, n); 8436 } 8437 return ret; 8438 } 8439 #undef _C_C_T_ 8440 8441 8442 /* reg_skipcomment() 8443 8444 Absorbs an /x style # comments from the input stream. 8445 Returns true if there is more text remaining in the stream. 8446 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment 8447 terminates the pattern without including a newline. 8448 8449 Note its the callers responsibility to ensure that we are 8450 actually in /x mode 8451 8452 */ 8453 8454 STATIC bool 8455 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state) 8456 { 8457 bool ended = 0; 8458 8459 PERL_ARGS_ASSERT_REG_SKIPCOMMENT; 8460 8461 while (RExC_parse < RExC_end) 8462 if (*RExC_parse++ == '\n') { 8463 ended = 1; 8464 break; 8465 } 8466 if (!ended) { 8467 /* we ran off the end of the pattern without ending 8468 the comment, so we have to add an \n when wrapping */ 8469 RExC_seen |= REG_SEEN_RUN_ON_COMMENT; 8470 return 0; 8471 } else 8472 return 1; 8473 } 8474 8475 /* nextchar() 8476 8477 Advance that parse position, and optionall absorbs 8478 "whitespace" from the inputstream. 8479 8480 Without /x "whitespace" means (?#...) style comments only, 8481 with /x this means (?#...) and # comments and whitespace proper. 8482 8483 Returns the RExC_parse point from BEFORE the scan occurs. 8484 8485 This is the /x friendly way of saying RExC_parse++. 8486 */ 8487 8488 STATIC char* 8489 S_nextchar(pTHX_ RExC_state_t *pRExC_state) 8490 { 8491 char* const retval = RExC_parse++; 8492 8493 PERL_ARGS_ASSERT_NEXTCHAR; 8494 8495 for (;;) { 8496 if (*RExC_parse == '(' && RExC_parse[1] == '?' && 8497 RExC_parse[2] == '#') { 8498 while (*RExC_parse != ')') { 8499 if (RExC_parse == RExC_end) 8500 FAIL("Sequence (?#... not terminated"); 8501 RExC_parse++; 8502 } 8503 RExC_parse++; 8504 continue; 8505 } 8506 if (RExC_flags & RXf_PMf_EXTENDED) { 8507 if (isSPACE(*RExC_parse)) { 8508 RExC_parse++; 8509 continue; 8510 } 8511 else if (*RExC_parse == '#') { 8512 if ( reg_skipcomment( pRExC_state ) ) 8513 continue; 8514 } 8515 } 8516 return retval; 8517 } 8518 } 8519 8520 /* 8521 - reg_node - emit a node 8522 */ 8523 STATIC regnode * /* Location. */ 8524 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) 8525 { 8526 dVAR; 8527 register regnode *ptr; 8528 regnode * const ret = RExC_emit; 8529 GET_RE_DEBUG_FLAGS_DECL; 8530 8531 PERL_ARGS_ASSERT_REG_NODE; 8532 8533 if (SIZE_ONLY) { 8534 SIZE_ALIGN(RExC_size); 8535 RExC_size += 1; 8536 return(ret); 8537 } 8538 if (RExC_emit >= RExC_emit_bound) 8539 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op); 8540 8541 NODE_ALIGN_FILL(ret); 8542 ptr = ret; 8543 FILL_ADVANCE_NODE(ptr, op); 8544 #ifdef RE_TRACK_PATTERN_OFFSETS 8545 if (RExC_offsets) { /* MJD */ 8546 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n", 8547 "reg_node", __LINE__, 8548 PL_reg_name[op], 8549 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] 8550 ? "Overwriting end of array!\n" : "OK", 8551 (UV)(RExC_emit - RExC_emit_start), 8552 (UV)(RExC_parse - RExC_start), 8553 (UV)RExC_offsets[0])); 8554 Set_Node_Offset(RExC_emit, RExC_parse + (op == END)); 8555 } 8556 #endif 8557 RExC_emit = ptr; 8558 return(ret); 8559 } 8560 8561 /* 8562 - reganode - emit a node with an argument 8563 */ 8564 STATIC regnode * /* Location. */ 8565 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) 8566 { 8567 dVAR; 8568 register regnode *ptr; 8569 regnode * const ret = RExC_emit; 8570 GET_RE_DEBUG_FLAGS_DECL; 8571 8572 PERL_ARGS_ASSERT_REGANODE; 8573 8574 if (SIZE_ONLY) { 8575 SIZE_ALIGN(RExC_size); 8576 RExC_size += 2; 8577 /* 8578 We can't do this: 8579 8580 assert(2==regarglen[op]+1); 8581 8582 Anything larger than this has to allocate the extra amount. 8583 If we changed this to be: 8584 8585 RExC_size += (1 + regarglen[op]); 8586 8587 then it wouldn't matter. Its not clear what side effect 8588 might come from that so its not done so far. 8589 -- dmq 8590 */ 8591 return(ret); 8592 } 8593 if (RExC_emit >= RExC_emit_bound) 8594 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op); 8595 8596 NODE_ALIGN_FILL(ret); 8597 ptr = ret; 8598 FILL_ADVANCE_NODE_ARG(ptr, op, arg); 8599 #ifdef RE_TRACK_PATTERN_OFFSETS 8600 if (RExC_offsets) { /* MJD */ 8601 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n", 8602 "reganode", 8603 __LINE__, 8604 PL_reg_name[op], 8605 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ? 8606 "Overwriting end of array!\n" : "OK", 8607 (UV)(RExC_emit - RExC_emit_start), 8608 (UV)(RExC_parse - RExC_start), 8609 (UV)RExC_offsets[0])); 8610 Set_Cur_Node_Offset; 8611 } 8612 #endif 8613 RExC_emit = ptr; 8614 return(ret); 8615 } 8616 8617 /* 8618 - reguni - emit (if appropriate) a Unicode character 8619 */ 8620 STATIC STRLEN 8621 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s) 8622 { 8623 dVAR; 8624 8625 PERL_ARGS_ASSERT_REGUNI; 8626 8627 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s); 8628 } 8629 8630 /* 8631 - reginsert - insert an operator in front of already-emitted operand 8632 * 8633 * Means relocating the operand. 8634 */ 8635 STATIC void 8636 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) 8637 { 8638 dVAR; 8639 register regnode *src; 8640 register regnode *dst; 8641 register regnode *place; 8642 const int offset = regarglen[(U8)op]; 8643 const int size = NODE_STEP_REGNODE + offset; 8644 GET_RE_DEBUG_FLAGS_DECL; 8645 8646 PERL_ARGS_ASSERT_REGINSERT; 8647 PERL_UNUSED_ARG(depth); 8648 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */ 8649 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]); 8650 if (SIZE_ONLY) { 8651 RExC_size += size; 8652 return; 8653 } 8654 8655 src = RExC_emit; 8656 RExC_emit += size; 8657 dst = RExC_emit; 8658 if (RExC_open_parens) { 8659 int paren; 8660 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/ 8661 for ( paren=0 ; paren < RExC_npar ; paren++ ) { 8662 if ( RExC_open_parens[paren] >= opnd ) { 8663 /*DEBUG_PARSE_FMT("open"," - %d",size);*/ 8664 RExC_open_parens[paren] += size; 8665 } else { 8666 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/ 8667 } 8668 if ( RExC_close_parens[paren] >= opnd ) { 8669 /*DEBUG_PARSE_FMT("close"," - %d",size);*/ 8670 RExC_close_parens[paren] += size; 8671 } else { 8672 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/ 8673 } 8674 } 8675 } 8676 8677 while (src > opnd) { 8678 StructCopy(--src, --dst, regnode); 8679 #ifdef RE_TRACK_PATTERN_OFFSETS 8680 if (RExC_offsets) { /* MJD 20010112 */ 8681 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n", 8682 "reg_insert", 8683 __LINE__, 8684 PL_reg_name[op], 8685 (UV)(dst - RExC_emit_start) > RExC_offsets[0] 8686 ? "Overwriting end of array!\n" : "OK", 8687 (UV)(src - RExC_emit_start), 8688 (UV)(dst - RExC_emit_start), 8689 (UV)RExC_offsets[0])); 8690 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src)); 8691 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src)); 8692 } 8693 #endif 8694 } 8695 8696 8697 place = opnd; /* Op node, where operand used to be. */ 8698 #ifdef RE_TRACK_PATTERN_OFFSETS 8699 if (RExC_offsets) { /* MJD */ 8700 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n", 8701 "reginsert", 8702 __LINE__, 8703 PL_reg_name[op], 8704 (UV)(place - RExC_emit_start) > RExC_offsets[0] 8705 ? "Overwriting end of array!\n" : "OK", 8706 (UV)(place - RExC_emit_start), 8707 (UV)(RExC_parse - RExC_start), 8708 (UV)RExC_offsets[0])); 8709 Set_Node_Offset(place, RExC_parse); 8710 Set_Node_Length(place, 1); 8711 } 8712 #endif 8713 src = NEXTOPER(place); 8714 FILL_ADVANCE_NODE(place, op); 8715 Zero(src, offset, regnode); 8716 } 8717 8718 /* 8719 - regtail - set the next-pointer at the end of a node chain of p to val. 8720 - SEE ALSO: regtail_study 8721 */ 8722 /* TODO: All three parms should be const */ 8723 STATIC void 8724 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth) 8725 { 8726 dVAR; 8727 register regnode *scan; 8728 GET_RE_DEBUG_FLAGS_DECL; 8729 8730 PERL_ARGS_ASSERT_REGTAIL; 8731 #ifndef DEBUGGING 8732 PERL_UNUSED_ARG(depth); 8733 #endif 8734 8735 if (SIZE_ONLY) 8736 return; 8737 8738 /* Find last node. */ 8739 scan = p; 8740 for (;;) { 8741 regnode * const temp = regnext(scan); 8742 DEBUG_PARSE_r({ 8743 SV * const mysv=sv_newmortal(); 8744 DEBUG_PARSE_MSG((scan==p ? "tail" : "")); 8745 regprop(RExC_rx, mysv, scan); 8746 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n", 8747 SvPV_nolen_const(mysv), REG_NODE_NUM(scan), 8748 (temp == NULL ? "->" : ""), 8749 (temp == NULL ? PL_reg_name[OP(val)] : "") 8750 ); 8751 }); 8752 if (temp == NULL) 8753 break; 8754 scan = temp; 8755 } 8756 8757 if (reg_off_by_arg[OP(scan)]) { 8758 ARG_SET(scan, val - scan); 8759 } 8760 else { 8761 NEXT_OFF(scan) = val - scan; 8762 } 8763 } 8764 8765 #ifdef DEBUGGING 8766 /* 8767 - regtail_study - set the next-pointer at the end of a node chain of p to val. 8768 - Look for optimizable sequences at the same time. 8769 - currently only looks for EXACT chains. 8770 8771 This is expermental code. The idea is to use this routine to perform 8772 in place optimizations on branches and groups as they are constructed, 8773 with the long term intention of removing optimization from study_chunk so 8774 that it is purely analytical. 8775 8776 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used 8777 to control which is which. 8778 8779 */ 8780 /* TODO: All four parms should be const */ 8781 8782 STATIC U8 8783 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth) 8784 { 8785 dVAR; 8786 register regnode *scan; 8787 U8 exact = PSEUDO; 8788 #ifdef EXPERIMENTAL_INPLACESCAN 8789 I32 min = 0; 8790 #endif 8791 GET_RE_DEBUG_FLAGS_DECL; 8792 8793 PERL_ARGS_ASSERT_REGTAIL_STUDY; 8794 8795 8796 if (SIZE_ONLY) 8797 return exact; 8798 8799 /* Find last node. */ 8800 8801 scan = p; 8802 for (;;) { 8803 regnode * const temp = regnext(scan); 8804 #ifdef EXPERIMENTAL_INPLACESCAN 8805 if (PL_regkind[OP(scan)] == EXACT) 8806 if (join_exact(pRExC_state,scan,&min,1,val,depth+1)) 8807 return EXACT; 8808 #endif 8809 if ( exact ) { 8810 switch (OP(scan)) { 8811 case EXACT: 8812 case EXACTF: 8813 case EXACTFL: 8814 if( exact == PSEUDO ) 8815 exact= OP(scan); 8816 else if ( exact != OP(scan) ) 8817 exact= 0; 8818 case NOTHING: 8819 break; 8820 default: 8821 exact= 0; 8822 } 8823 } 8824 DEBUG_PARSE_r({ 8825 SV * const mysv=sv_newmortal(); 8826 DEBUG_PARSE_MSG((scan==p ? "tsdy" : "")); 8827 regprop(RExC_rx, mysv, scan); 8828 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n", 8829 SvPV_nolen_const(mysv), 8830 REG_NODE_NUM(scan), 8831 PL_reg_name[exact]); 8832 }); 8833 if (temp == NULL) 8834 break; 8835 scan = temp; 8836 } 8837 DEBUG_PARSE_r({ 8838 SV * const mysv_val=sv_newmortal(); 8839 DEBUG_PARSE_MSG(""); 8840 regprop(RExC_rx, mysv_val, val); 8841 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n", 8842 SvPV_nolen_const(mysv_val), 8843 (IV)REG_NODE_NUM(val), 8844 (IV)(val - scan) 8845 ); 8846 }); 8847 if (reg_off_by_arg[OP(scan)]) { 8848 ARG_SET(scan, val - scan); 8849 } 8850 else { 8851 NEXT_OFF(scan) = val - scan; 8852 } 8853 8854 return exact; 8855 } 8856 #endif 8857 8858 /* 8859 - regcurly - a little FSA that accepts {\d+,?\d*} 8860 */ 8861 STATIC I32 8862 S_regcurly(register const char *s) 8863 { 8864 PERL_ARGS_ASSERT_REGCURLY; 8865 8866 if (*s++ != '{') 8867 return FALSE; 8868 if (!isDIGIT(*s)) 8869 return FALSE; 8870 while (isDIGIT(*s)) 8871 s++; 8872 if (*s == ',') 8873 s++; 8874 while (isDIGIT(*s)) 8875 s++; 8876 if (*s != '}') 8877 return FALSE; 8878 return TRUE; 8879 } 8880 8881 8882 /* 8883 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form 8884 */ 8885 #ifdef DEBUGGING 8886 void 8887 S_regdump_extflags(pTHX_ const char *lead, const U32 flags) 8888 { 8889 int bit; 8890 int set=0; 8891 8892 for (bit=0; bit<32; bit++) { 8893 if (flags & (1<<bit)) { 8894 if (!set++ && lead) 8895 PerlIO_printf(Perl_debug_log, "%s",lead); 8896 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]); 8897 } 8898 } 8899 if (lead) { 8900 if (set) 8901 PerlIO_printf(Perl_debug_log, "\n"); 8902 else 8903 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead); 8904 } 8905 } 8906 #endif 8907 8908 void 8909 Perl_regdump(pTHX_ const regexp *r) 8910 { 8911 #ifdef DEBUGGING 8912 dVAR; 8913 SV * const sv = sv_newmortal(); 8914 SV *dsv= sv_newmortal(); 8915 RXi_GET_DECL(r,ri); 8916 GET_RE_DEBUG_FLAGS_DECL; 8917 8918 PERL_ARGS_ASSERT_REGDUMP; 8919 8920 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0); 8921 8922 /* Header fields of interest. */ 8923 if (r->anchored_substr) { 8924 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr), 8925 RE_SV_DUMPLEN(r->anchored_substr), 30); 8926 PerlIO_printf(Perl_debug_log, 8927 "anchored %s%s at %"IVdf" ", 8928 s, RE_SV_TAIL(r->anchored_substr), 8929 (IV)r->anchored_offset); 8930 } else if (r->anchored_utf8) { 8931 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8), 8932 RE_SV_DUMPLEN(r->anchored_utf8), 30); 8933 PerlIO_printf(Perl_debug_log, 8934 "anchored utf8 %s%s at %"IVdf" ", 8935 s, RE_SV_TAIL(r->anchored_utf8), 8936 (IV)r->anchored_offset); 8937 } 8938 if (r->float_substr) { 8939 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr), 8940 RE_SV_DUMPLEN(r->float_substr), 30); 8941 PerlIO_printf(Perl_debug_log, 8942 "floating %s%s at %"IVdf"..%"UVuf" ", 8943 s, RE_SV_TAIL(r->float_substr), 8944 (IV)r->float_min_offset, (UV)r->float_max_offset); 8945 } else if (r->float_utf8) { 8946 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8), 8947 RE_SV_DUMPLEN(r->float_utf8), 30); 8948 PerlIO_printf(Perl_debug_log, 8949 "floating utf8 %s%s at %"IVdf"..%"UVuf" ", 8950 s, RE_SV_TAIL(r->float_utf8), 8951 (IV)r->float_min_offset, (UV)r->float_max_offset); 8952 } 8953 if (r->check_substr || r->check_utf8) 8954 PerlIO_printf(Perl_debug_log, 8955 (const char *) 8956 (r->check_substr == r->float_substr 8957 && r->check_utf8 == r->float_utf8 8958 ? "(checking floating" : "(checking anchored")); 8959 if (r->extflags & RXf_NOSCAN) 8960 PerlIO_printf(Perl_debug_log, " noscan"); 8961 if (r->extflags & RXf_CHECK_ALL) 8962 PerlIO_printf(Perl_debug_log, " isall"); 8963 if (r->check_substr || r->check_utf8) 8964 PerlIO_printf(Perl_debug_log, ") "); 8965 8966 if (ri->regstclass) { 8967 regprop(r, sv, ri->regstclass); 8968 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv)); 8969 } 8970 if (r->extflags & RXf_ANCH) { 8971 PerlIO_printf(Perl_debug_log, "anchored"); 8972 if (r->extflags & RXf_ANCH_BOL) 8973 PerlIO_printf(Perl_debug_log, "(BOL)"); 8974 if (r->extflags & RXf_ANCH_MBOL) 8975 PerlIO_printf(Perl_debug_log, "(MBOL)"); 8976 if (r->extflags & RXf_ANCH_SBOL) 8977 PerlIO_printf(Perl_debug_log, "(SBOL)"); 8978 if (r->extflags & RXf_ANCH_GPOS) 8979 PerlIO_printf(Perl_debug_log, "(GPOS)"); 8980 PerlIO_putc(Perl_debug_log, ' '); 8981 } 8982 if (r->extflags & RXf_GPOS_SEEN) 8983 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs); 8984 if (r->intflags & PREGf_SKIP) 8985 PerlIO_printf(Perl_debug_log, "plus "); 8986 if (r->intflags & PREGf_IMPLICIT) 8987 PerlIO_printf(Perl_debug_log, "implicit "); 8988 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen); 8989 if (r->extflags & RXf_EVAL_SEEN) 8990 PerlIO_printf(Perl_debug_log, "with eval "); 8991 PerlIO_printf(Perl_debug_log, "\n"); 8992 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags)); 8993 #else 8994 PERL_ARGS_ASSERT_REGDUMP; 8995 PERL_UNUSED_CONTEXT; 8996 PERL_UNUSED_ARG(r); 8997 #endif /* DEBUGGING */ 8998 } 8999 9000 /* 9001 - regprop - printable representation of opcode 9002 */ 9003 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \ 9004 STMT_START { \ 9005 if (do_sep) { \ 9006 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \ 9007 if (flags & ANYOF_INVERT) \ 9008 /*make sure the invert info is in each */ \ 9009 sv_catpvs(sv, "^"); \ 9010 do_sep = 0; \ 9011 } \ 9012 } STMT_END 9013 9014 void 9015 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) 9016 { 9017 #ifdef DEBUGGING 9018 dVAR; 9019 register int k; 9020 RXi_GET_DECL(prog,progi); 9021 GET_RE_DEBUG_FLAGS_DECL; 9022 9023 PERL_ARGS_ASSERT_REGPROP; 9024 9025 sv_setpvs(sv, ""); 9026 9027 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */ 9028 /* It would be nice to FAIL() here, but this may be called from 9029 regexec.c, and it would be hard to supply pRExC_state. */ 9030 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX); 9031 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */ 9032 9033 k = PL_regkind[OP(o)]; 9034 9035 if (k == EXACT) { 9036 sv_catpvs(sv, " "); 9037 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT) 9038 * is a crude hack but it may be the best for now since 9039 * we have no flag "this EXACTish node was UTF-8" 9040 * --jhi */ 9041 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1], 9042 PERL_PV_ESCAPE_UNI_DETECT | 9043 PERL_PV_PRETTY_ELLIPSES | 9044 PERL_PV_PRETTY_LTGT | 9045 PERL_PV_PRETTY_NOCLEAR 9046 ); 9047 } else if (k == TRIE) { 9048 /* print the details of the trie in dumpuntil instead, as 9049 * progi->data isn't available here */ 9050 const char op = OP(o); 9051 const U32 n = ARG(o); 9052 const reg_ac_data * const ac = IS_TRIE_AC(op) ? 9053 (reg_ac_data *)progi->data->data[n] : 9054 NULL; 9055 const reg_trie_data * const trie 9056 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie]; 9057 9058 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]); 9059 DEBUG_TRIE_COMPILE_r( 9060 Perl_sv_catpvf(aTHX_ sv, 9061 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">", 9062 (UV)trie->startstate, 9063 (IV)trie->statecount-1, /* -1 because of the unused 0 element */ 9064 (UV)trie->wordcount, 9065 (UV)trie->minlen, 9066 (UV)trie->maxlen, 9067 (UV)TRIE_CHARCOUNT(trie), 9068 (UV)trie->uniquecharcount 9069 ) 9070 ); 9071 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) { 9072 int i; 9073 int rangestart = -1; 9074 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie); 9075 sv_catpvs(sv, "["); 9076 for (i = 0; i <= 256; i++) { 9077 if (i < 256 && BITMAP_TEST(bitmap,i)) { 9078 if (rangestart == -1) 9079 rangestart = i; 9080 } else if (rangestart != -1) { 9081 if (i <= rangestart + 3) 9082 for (; rangestart < i; rangestart++) 9083 put_byte(sv, rangestart); 9084 else { 9085 put_byte(sv, rangestart); 9086 sv_catpvs(sv, "-"); 9087 put_byte(sv, i - 1); 9088 } 9089 rangestart = -1; 9090 } 9091 } 9092 sv_catpvs(sv, "]"); 9093 } 9094 9095 } else if (k == CURLY) { 9096 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX) 9097 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */ 9098 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o)); 9099 } 9100 else if (k == WHILEM && o->flags) /* Ordinal/of */ 9101 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4); 9102 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) { 9103 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */ 9104 if ( RXp_PAREN_NAMES(prog) ) { 9105 if ( k != REF || OP(o) < NREF) { 9106 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]); 9107 SV **name= av_fetch(list, ARG(o), 0 ); 9108 if (name) 9109 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name)); 9110 } 9111 else { 9112 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]); 9113 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]); 9114 I32 *nums=(I32*)SvPVX(sv_dat); 9115 SV **name= av_fetch(list, nums[0], 0 ); 9116 I32 n; 9117 if (name) { 9118 for ( n=0; n<SvIVX(sv_dat); n++ ) { 9119 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf, 9120 (n ? "," : ""), (IV)nums[n]); 9121 } 9122 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name)); 9123 } 9124 } 9125 } 9126 } else if (k == GOSUB) 9127 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */ 9128 else if (k == VERB) { 9129 if (!o->flags) 9130 Perl_sv_catpvf(aTHX_ sv, ":%"SVf, 9131 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ])))); 9132 } else if (k == LOGICAL) 9133 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */ 9134 else if (k == FOLDCHAR) 9135 Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) ); 9136 else if (k == ANYOF) { 9137 int i, rangestart = -1; 9138 const U8 flags = ANYOF_FLAGS(o); 9139 int do_sep = 0; 9140 9141 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */ 9142 static const char * const anyofs[] = { 9143 "\\w", 9144 "\\W", 9145 "\\s", 9146 "\\S", 9147 "\\d", 9148 "\\D", 9149 "[:alnum:]", 9150 "[:^alnum:]", 9151 "[:alpha:]", 9152 "[:^alpha:]", 9153 "[:ascii:]", 9154 "[:^ascii:]", 9155 "[:cntrl:]", 9156 "[:^cntrl:]", 9157 "[:graph:]", 9158 "[:^graph:]", 9159 "[:lower:]", 9160 "[:^lower:]", 9161 "[:print:]", 9162 "[:^print:]", 9163 "[:punct:]", 9164 "[:^punct:]", 9165 "[:upper:]", 9166 "[:^upper:]", 9167 "[:xdigit:]", 9168 "[:^xdigit:]", 9169 "[:space:]", 9170 "[:^space:]", 9171 "[:blank:]", 9172 "[:^blank:]" 9173 }; 9174 9175 if (flags & ANYOF_LOCALE) 9176 sv_catpvs(sv, "{loc}"); 9177 if (flags & ANYOF_FOLD) 9178 sv_catpvs(sv, "{i}"); 9179 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); 9180 if (flags & ANYOF_INVERT) 9181 sv_catpvs(sv, "^"); 9182 9183 /* output what the standard cp 0-255 bitmap matches */ 9184 for (i = 0; i <= 256; i++) { 9185 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) { 9186 if (rangestart == -1) 9187 rangestart = i; 9188 } else if (rangestart != -1) { 9189 if (i <= rangestart + 3) 9190 for (; rangestart < i; rangestart++) 9191 put_byte(sv, rangestart); 9192 else { 9193 put_byte(sv, rangestart); 9194 sv_catpvs(sv, "-"); 9195 put_byte(sv, i - 1); 9196 } 9197 do_sep = 1; 9198 rangestart = -1; 9199 } 9200 } 9201 9202 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags); 9203 /* output any special charclass tests (used mostly under use locale) */ 9204 if (o->flags & ANYOF_CLASS) 9205 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++) 9206 if (ANYOF_CLASS_TEST(o,i)) { 9207 sv_catpv(sv, anyofs[i]); 9208 do_sep = 1; 9209 } 9210 9211 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags); 9212 9213 /* output information about the unicode matching */ 9214 if (flags & ANYOF_UNICODE) 9215 sv_catpvs(sv, "{unicode}"); 9216 else if (flags & ANYOF_UNICODE_ALL) 9217 sv_catpvs(sv, "{unicode_all}"); 9218 9219 { 9220 SV *lv; 9221 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0); 9222 9223 if (lv) { 9224 if (sw) { 9225 U8 s[UTF8_MAXBYTES_CASE+1]; 9226 9227 for (i = 0; i <= 256; i++) { /* just the first 256 */ 9228 uvchr_to_utf8(s, i); 9229 9230 if (i < 256 && swash_fetch(sw, s, TRUE)) { 9231 if (rangestart == -1) 9232 rangestart = i; 9233 } else if (rangestart != -1) { 9234 if (i <= rangestart + 3) 9235 for (; rangestart < i; rangestart++) { 9236 const U8 * const e = uvchr_to_utf8(s,rangestart); 9237 U8 *p; 9238 for(p = s; p < e; p++) 9239 put_byte(sv, *p); 9240 } 9241 else { 9242 const U8 *e = uvchr_to_utf8(s,rangestart); 9243 U8 *p; 9244 for (p = s; p < e; p++) 9245 put_byte(sv, *p); 9246 sv_catpvs(sv, "-"); 9247 e = uvchr_to_utf8(s, i-1); 9248 for (p = s; p < e; p++) 9249 put_byte(sv, *p); 9250 } 9251 rangestart = -1; 9252 } 9253 } 9254 9255 sv_catpvs(sv, "..."); /* et cetera */ 9256 } 9257 9258 { 9259 char *s = savesvpv(lv); 9260 char * const origs = s; 9261 9262 while (*s && *s != '\n') 9263 s++; 9264 9265 if (*s == '\n') { 9266 const char * const t = ++s; 9267 9268 while (*s) { 9269 if (*s == '\n') 9270 *s = ' '; 9271 s++; 9272 } 9273 if (s[-1] == ' ') 9274 s[-1] = 0; 9275 9276 sv_catpv(sv, t); 9277 } 9278 9279 Safefree(origs); 9280 } 9281 } 9282 } 9283 9284 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]); 9285 } 9286 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) 9287 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags)); 9288 #else 9289 PERL_UNUSED_CONTEXT; 9290 PERL_UNUSED_ARG(sv); 9291 PERL_UNUSED_ARG(o); 9292 PERL_UNUSED_ARG(prog); 9293 #endif /* DEBUGGING */ 9294 } 9295 9296 SV * 9297 Perl_re_intuit_string(pTHX_ REGEXP * const prog) 9298 { /* Assume that RE_INTUIT is set */ 9299 dVAR; 9300 GET_RE_DEBUG_FLAGS_DECL; 9301 9302 PERL_ARGS_ASSERT_RE_INTUIT_STRING; 9303 PERL_UNUSED_CONTEXT; 9304 9305 DEBUG_COMPILE_r( 9306 { 9307 const char * const s = SvPV_nolen_const(prog->check_substr 9308 ? prog->check_substr : prog->check_utf8); 9309 9310 if (!PL_colorset) reginitcolors(); 9311 PerlIO_printf(Perl_debug_log, 9312 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n", 9313 PL_colors[4], 9314 prog->check_substr ? "" : "utf8 ", 9315 PL_colors[5],PL_colors[0], 9316 s, 9317 PL_colors[1], 9318 (strlen(s) > 60 ? "..." : "")); 9319 } ); 9320 9321 return prog->check_substr ? prog->check_substr : prog->check_utf8; 9322 } 9323 9324 /* 9325 pregfree() 9326 9327 handles refcounting and freeing the perl core regexp structure. When 9328 it is necessary to actually free the structure the first thing it 9329 does is call the 'free' method of the regexp_engine associated to to 9330 the regexp, allowing the handling of the void *pprivate; member 9331 first. (This routine is not overridable by extensions, which is why 9332 the extensions free is called first.) 9333 9334 See regdupe and regdupe_internal if you change anything here. 9335 */ 9336 #ifndef PERL_IN_XSUB_RE 9337 void 9338 Perl_pregfree(pTHX_ REGEXP *r) 9339 { 9340 dVAR; 9341 GET_RE_DEBUG_FLAGS_DECL; 9342 9343 if (!r || (--r->refcnt > 0)) 9344 return; 9345 if (r->mother_re) { 9346 ReREFCNT_dec(r->mother_re); 9347 } else { 9348 CALLREGFREE_PVT(r); /* free the private data */ 9349 if (RXp_PAREN_NAMES(r)) 9350 SvREFCNT_dec(RXp_PAREN_NAMES(r)); 9351 Safefree(RX_WRAPPED(r)); 9352 } 9353 if (r->substrs) { 9354 if (r->anchored_substr) 9355 SvREFCNT_dec(r->anchored_substr); 9356 if (r->anchored_utf8) 9357 SvREFCNT_dec(r->anchored_utf8); 9358 if (r->float_substr) 9359 SvREFCNT_dec(r->float_substr); 9360 if (r->float_utf8) 9361 SvREFCNT_dec(r->float_utf8); 9362 Safefree(r->substrs); 9363 } 9364 RX_MATCH_COPY_FREE(r); 9365 #ifdef PERL_OLD_COPY_ON_WRITE 9366 if (r->saved_copy) 9367 SvREFCNT_dec(r->saved_copy); 9368 #endif 9369 Safefree(r->swap); 9370 Safefree(r->offs); 9371 Safefree(r); 9372 } 9373 9374 /* reg_temp_copy() 9375 9376 This is a hacky workaround to the structural issue of match results 9377 being stored in the regexp structure which is in turn stored in 9378 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern 9379 could be PL_curpm in multiple contexts, and could require multiple 9380 result sets being associated with the pattern simultaneously, such 9381 as when doing a recursive match with (??{$qr}) 9382 9383 The solution is to make a lightweight copy of the regexp structure 9384 when a qr// is returned from the code executed by (??{$qr}) this 9385 lightweight copy doesnt actually own any of its data except for 9386 the starp/end and the actual regexp structure itself. 9387 9388 */ 9389 9390 9391 REGEXP * 9392 Perl_reg_temp_copy (pTHX_ REGEXP *r) { 9393 regexp *ret; 9394 register const I32 npar = r->nparens+1; 9395 9396 PERL_ARGS_ASSERT_REG_TEMP_COPY; 9397 9398 (void)ReREFCNT_inc(r); 9399 Newx(ret, 1, regexp); 9400 StructCopy(r, ret, regexp); 9401 Newx(ret->offs, npar, regexp_paren_pair); 9402 Copy(r->offs, ret->offs, npar, regexp_paren_pair); 9403 ret->refcnt = 1; 9404 if (r->substrs) { 9405 Newx(ret->substrs, 1, struct reg_substr_data); 9406 StructCopy(r->substrs, ret->substrs, struct reg_substr_data); 9407 9408 SvREFCNT_inc_void(ret->anchored_substr); 9409 SvREFCNT_inc_void(ret->anchored_utf8); 9410 SvREFCNT_inc_void(ret->float_substr); 9411 SvREFCNT_inc_void(ret->float_utf8); 9412 9413 /* check_substr and check_utf8, if non-NULL, point to either their 9414 anchored or float namesakes, and don't hold a second reference. */ 9415 } 9416 RX_MATCH_COPIED_off(ret); 9417 #ifdef PERL_OLD_COPY_ON_WRITE 9418 ret->saved_copy = NULL; 9419 #endif 9420 ret->mother_re = r; 9421 ret->swap = NULL; 9422 9423 return ret; 9424 } 9425 #endif 9426 9427 /* regfree_internal() 9428 9429 Free the private data in a regexp. This is overloadable by 9430 extensions. Perl takes care of the regexp structure in pregfree(), 9431 this covers the *pprivate pointer which technically perldoesnt 9432 know about, however of course we have to handle the 9433 regexp_internal structure when no extension is in use. 9434 9435 Note this is called before freeing anything in the regexp 9436 structure. 9437 */ 9438 9439 void 9440 Perl_regfree_internal(pTHX_ REGEXP * const r) 9441 { 9442 dVAR; 9443 RXi_GET_DECL(r,ri); 9444 GET_RE_DEBUG_FLAGS_DECL; 9445 9446 PERL_ARGS_ASSERT_REGFREE_INTERNAL; 9447 9448 DEBUG_COMPILE_r({ 9449 if (!PL_colorset) 9450 reginitcolors(); 9451 { 9452 SV *dsv= sv_newmortal(); 9453 RE_PV_QUOTED_DECL(s, RX_UTF8(r), 9454 dsv, RX_PRECOMP(r), RX_PRELEN(r), 60); 9455 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n", 9456 PL_colors[4],PL_colors[5],s); 9457 } 9458 }); 9459 #ifdef RE_TRACK_PATTERN_OFFSETS 9460 if (ri->u.offsets) 9461 Safefree(ri->u.offsets); /* 20010421 MJD */ 9462 #endif 9463 if (ri->data) { 9464 int n = ri->data->count; 9465 PAD* new_comppad = NULL; 9466 PAD* old_comppad; 9467 PADOFFSET refcnt; 9468 9469 while (--n >= 0) { 9470 /* If you add a ->what type here, update the comment in regcomp.h */ 9471 switch (ri->data->what[n]) { 9472 case 's': 9473 case 'S': 9474 case 'u': 9475 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n])); 9476 break; 9477 case 'f': 9478 Safefree(ri->data->data[n]); 9479 break; 9480 case 'p': 9481 new_comppad = MUTABLE_AV(ri->data->data[n]); 9482 break; 9483 case 'o': 9484 if (new_comppad == NULL) 9485 Perl_croak(aTHX_ "panic: pregfree comppad"); 9486 PAD_SAVE_LOCAL(old_comppad, 9487 /* Watch out for global destruction's random ordering. */ 9488 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL 9489 ); 9490 OP_REFCNT_LOCK; 9491 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]); 9492 OP_REFCNT_UNLOCK; 9493 if (!refcnt) 9494 op_free((OP_4tree*)ri->data->data[n]); 9495 9496 PAD_RESTORE_LOCAL(old_comppad); 9497 SvREFCNT_dec(MUTABLE_SV(new_comppad)); 9498 new_comppad = NULL; 9499 break; 9500 case 'n': 9501 break; 9502 case 'T': 9503 { /* Aho Corasick add-on structure for a trie node. 9504 Used in stclass optimization only */ 9505 U32 refcount; 9506 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n]; 9507 OP_REFCNT_LOCK; 9508 refcount = --aho->refcount; 9509 OP_REFCNT_UNLOCK; 9510 if ( !refcount ) { 9511 PerlMemShared_free(aho->states); 9512 PerlMemShared_free(aho->fail); 9513 /* do this last!!!! */ 9514 PerlMemShared_free(ri->data->data[n]); 9515 PerlMemShared_free(ri->regstclass); 9516 } 9517 } 9518 break; 9519 case 't': 9520 { 9521 /* trie structure. */ 9522 U32 refcount; 9523 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n]; 9524 OP_REFCNT_LOCK; 9525 refcount = --trie->refcount; 9526 OP_REFCNT_UNLOCK; 9527 if ( !refcount ) { 9528 PerlMemShared_free(trie->charmap); 9529 PerlMemShared_free(trie->states); 9530 PerlMemShared_free(trie->trans); 9531 if (trie->bitmap) 9532 PerlMemShared_free(trie->bitmap); 9533 if (trie->wordlen) 9534 PerlMemShared_free(trie->wordlen); 9535 if (trie->jump) 9536 PerlMemShared_free(trie->jump); 9537 if (trie->nextword) 9538 PerlMemShared_free(trie->nextword); 9539 /* do this last!!!! */ 9540 PerlMemShared_free(ri->data->data[n]); 9541 } 9542 } 9543 break; 9544 default: 9545 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]); 9546 } 9547 } 9548 Safefree(ri->data->what); 9549 Safefree(ri->data); 9550 } 9551 9552 Safefree(ri); 9553 } 9554 9555 #define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t)) 9556 #define av_dup_inc(s,t) MUTABLE_AV(SvREFCNT_inc(sv_dup((const SV *)s,t))) 9557 #define hv_dup_inc(s,t) MUTABLE_HV(SvREFCNT_inc(sv_dup((const SV *)s,t))) 9558 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL) 9559 9560 /* 9561 re_dup - duplicate a regexp. 9562 9563 This routine is expected to clone a given regexp structure. It is only 9564 compiled under USE_ITHREADS. 9565 9566 After all of the core data stored in struct regexp is duplicated 9567 the regexp_engine.dupe method is used to copy any private data 9568 stored in the *pprivate pointer. This allows extensions to handle 9569 any duplication it needs to do. 9570 9571 See pregfree() and regfree_internal() if you change anything here. 9572 */ 9573 #if defined(USE_ITHREADS) 9574 #ifndef PERL_IN_XSUB_RE 9575 regexp * 9576 Perl_re_dup(pTHX_ const regexp *r, CLONE_PARAMS *param) 9577 { 9578 dVAR; 9579 regexp *ret; 9580 I32 npar; 9581 U32 precomp_offset; 9582 9583 PERL_ARGS_ASSERT_RE_DUP; 9584 9585 if (!r) 9586 return (REGEXP *)NULL; 9587 9588 if ((ret = (REGEXP *)ptr_table_fetch(PL_ptr_table, r))) 9589 return ret; 9590 9591 npar = r->nparens+1; 9592 Newx(ret, 1, regexp); 9593 StructCopy(r, ret, regexp); 9594 Newx(ret->offs, npar, regexp_paren_pair); 9595 Copy(r->offs, ret->offs, npar, regexp_paren_pair); 9596 if(ret->swap) { 9597 /* no need to copy these */ 9598 Newx(ret->swap, npar, regexp_paren_pair); 9599 } 9600 9601 if (ret->substrs) { 9602 /* Do it this way to avoid reading from *r after the StructCopy(). 9603 That way, if any of the sv_dup_inc()s dislodge *r from the L1 9604 cache, it doesn't matter. */ 9605 const bool anchored = r->check_substr 9606 ? r->check_substr == r->anchored_substr 9607 : r->check_utf8 == r->anchored_utf8; 9608 Newx(ret->substrs, 1, struct reg_substr_data); 9609 StructCopy(r->substrs, ret->substrs, struct reg_substr_data); 9610 9611 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param); 9612 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param); 9613 ret->float_substr = sv_dup_inc(ret->float_substr, param); 9614 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param); 9615 9616 /* check_substr and check_utf8, if non-NULL, point to either their 9617 anchored or float namesakes, and don't hold a second reference. */ 9618 9619 if (ret->check_substr) { 9620 if (anchored) { 9621 assert(r->check_utf8 == r->anchored_utf8); 9622 ret->check_substr = ret->anchored_substr; 9623 ret->check_utf8 = ret->anchored_utf8; 9624 } else { 9625 assert(r->check_substr == r->float_substr); 9626 assert(r->check_utf8 == r->float_utf8); 9627 ret->check_substr = ret->float_substr; 9628 ret->check_utf8 = ret->float_utf8; 9629 } 9630 } else if (ret->check_utf8) { 9631 if (anchored) { 9632 ret->check_utf8 = ret->anchored_utf8; 9633 } else { 9634 ret->check_utf8 = ret->float_utf8; 9635 } 9636 } 9637 } 9638 9639 precomp_offset = RX_PRECOMP(ret) - ret->wrapped; 9640 9641 RX_WRAPPED(ret) = SAVEPVN(RX_WRAPPED(ret), RX_WRAPLEN(ret)+1); 9642 RX_PRECOMP(ret) = ret->wrapped + precomp_offset; 9643 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param); 9644 9645 if (ret->pprivate) 9646 RXi_SET(ret,CALLREGDUPE_PVT(ret,param)); 9647 9648 if (RX_MATCH_COPIED(ret)) 9649 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen); 9650 else 9651 ret->subbeg = NULL; 9652 #ifdef PERL_OLD_COPY_ON_WRITE 9653 ret->saved_copy = NULL; 9654 #endif 9655 9656 ret->mother_re = NULL; 9657 ret->gofs = 0; 9658 9659 ptr_table_store(PL_ptr_table, r, ret); 9660 return ret; 9661 } 9662 #endif /* PERL_IN_XSUB_RE */ 9663 9664 /* 9665 regdupe_internal() 9666 9667 This is the internal complement to regdupe() which is used to copy 9668 the structure pointed to by the *pprivate pointer in the regexp. 9669 This is the core version of the extension overridable cloning hook. 9670 The regexp structure being duplicated will be copied by perl prior 9671 to this and will be provided as the regexp *r argument, however 9672 with the /old/ structures pprivate pointer value. Thus this routine 9673 may override any copying normally done by perl. 9674 9675 It returns a pointer to the new regexp_internal structure. 9676 */ 9677 9678 void * 9679 Perl_regdupe_internal(pTHX_ REGEXP * const r, CLONE_PARAMS *param) 9680 { 9681 dVAR; 9682 regexp_internal *reti; 9683 int len, npar; 9684 RXi_GET_DECL(r,ri); 9685 9686 PERL_ARGS_ASSERT_REGDUPE_INTERNAL; 9687 9688 npar = r->nparens+1; 9689 len = ProgLen(ri); 9690 9691 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal); 9692 Copy(ri->program, reti->program, len+1, regnode); 9693 9694 9695 reti->regstclass = NULL; 9696 9697 if (ri->data) { 9698 struct reg_data *d; 9699 const int count = ri->data->count; 9700 int i; 9701 9702 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *), 9703 char, struct reg_data); 9704 Newx(d->what, count, U8); 9705 9706 d->count = count; 9707 for (i = 0; i < count; i++) { 9708 d->what[i] = ri->data->what[i]; 9709 switch (d->what[i]) { 9710 /* legal options are one of: sSfpontTu 9711 see also regcomp.h and pregfree() */ 9712 case 's': 9713 case 'S': 9714 case 'p': /* actually an AV, but the dup function is identical. */ 9715 case 'u': /* actually an HV, but the dup function is identical. */ 9716 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param); 9717 break; 9718 case 'f': 9719 /* This is cheating. */ 9720 Newx(d->data[i], 1, struct regnode_charclass_class); 9721 StructCopy(ri->data->data[i], d->data[i], 9722 struct regnode_charclass_class); 9723 reti->regstclass = (regnode*)d->data[i]; 9724 break; 9725 case 'o': 9726 /* Compiled op trees are readonly and in shared memory, 9727 and can thus be shared without duplication. */ 9728 OP_REFCNT_LOCK; 9729 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]); 9730 OP_REFCNT_UNLOCK; 9731 break; 9732 case 'T': 9733 /* Trie stclasses are readonly and can thus be shared 9734 * without duplication. We free the stclass in pregfree 9735 * when the corresponding reg_ac_data struct is freed. 9736 */ 9737 reti->regstclass= ri->regstclass; 9738 /* Fall through */ 9739 case 't': 9740 OP_REFCNT_LOCK; 9741 ((reg_trie_data*)ri->data->data[i])->refcount++; 9742 OP_REFCNT_UNLOCK; 9743 /* Fall through */ 9744 case 'n': 9745 d->data[i] = ri->data->data[i]; 9746 break; 9747 default: 9748 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]); 9749 } 9750 } 9751 9752 reti->data = d; 9753 } 9754 else 9755 reti->data = NULL; 9756 9757 reti->name_list_idx = ri->name_list_idx; 9758 9759 #ifdef RE_TRACK_PATTERN_OFFSETS 9760 if (ri->u.offsets) { 9761 Newx(reti->u.offsets, 2*len+1, U32); 9762 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32); 9763 } 9764 #else 9765 SetProgLen(reti,len); 9766 #endif 9767 9768 return (void*)reti; 9769 } 9770 9771 #endif /* USE_ITHREADS */ 9772 9773 /* 9774 reg_stringify() 9775 9776 converts a regexp embedded in a MAGIC struct to its stringified form, 9777 caching the converted form in the struct and returns the cached 9778 string. 9779 9780 If lp is nonnull then it is used to return the length of the 9781 resulting string 9782 9783 If flags is nonnull and the returned string contains UTF8 then 9784 (*flags & 1) will be true. 9785 9786 If haseval is nonnull then it is used to return whether the pattern 9787 contains evals. 9788 9789 Normally called via macro: 9790 9791 CALLREG_STRINGIFY(mg,&len,&utf8); 9792 9793 And internally with 9794 9795 CALLREG_AS_STR(mg,&lp,&flags,&haseval) 9796 9797 See sv_2pv_flags() in sv.c for an example of internal usage. 9798 9799 */ 9800 #ifndef PERL_IN_XSUB_RE 9801 9802 char * 9803 Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) { 9804 dVAR; 9805 const REGEXP * const re = (REGEXP *)mg->mg_obj; 9806 if (haseval) 9807 *haseval = RX_SEEN_EVALS(re); 9808 if (flags) 9809 *flags = RX_UTF8(re) ? 1 : 0; 9810 if (lp) 9811 *lp = RX_WRAPLEN(re); 9812 return RX_WRAPPED(re); 9813 } 9814 9815 /* 9816 - regnext - dig the "next" pointer out of a node 9817 */ 9818 regnode * 9819 Perl_regnext(pTHX_ register regnode *p) 9820 { 9821 dVAR; 9822 register I32 offset; 9823 9824 if (!p) 9825 return(NULL); 9826 9827 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p)); 9828 if (offset == 0) 9829 return(NULL); 9830 9831 return(p+offset); 9832 } 9833 #endif 9834 9835 STATIC void 9836 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...) 9837 { 9838 va_list args; 9839 STRLEN l1 = strlen(pat1); 9840 STRLEN l2 = strlen(pat2); 9841 char buf[512]; 9842 SV *msv; 9843 const char *message; 9844 9845 PERL_ARGS_ASSERT_RE_CROAK2; 9846 9847 if (l1 > 510) 9848 l1 = 510; 9849 if (l1 + l2 > 510) 9850 l2 = 510 - l1; 9851 Copy(pat1, buf, l1 , char); 9852 Copy(pat2, buf + l1, l2 , char); 9853 buf[l1 + l2] = '\n'; 9854 buf[l1 + l2 + 1] = '\0'; 9855 #ifdef I_STDARG 9856 /* ANSI variant takes additional second argument */ 9857 va_start(args, pat2); 9858 #else 9859 va_start(args); 9860 #endif 9861 msv = vmess(buf, &args); 9862 va_end(args); 9863 message = SvPV_const(msv,l1); 9864 if (l1 > 512) 9865 l1 = 512; 9866 Copy(message, buf, l1 , char); 9867 buf[l1-1] = '\0'; /* Overwrite \n */ 9868 Perl_croak(aTHX_ "%s", buf); 9869 } 9870 9871 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */ 9872 9873 #ifndef PERL_IN_XSUB_RE 9874 void 9875 Perl_save_re_context(pTHX) 9876 { 9877 dVAR; 9878 9879 struct re_save_state *state; 9880 9881 SAVEVPTR(PL_curcop); 9882 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1); 9883 9884 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix); 9885 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE; 9886 SSPUSHINT(SAVEt_RE_STATE); 9887 9888 Copy(&PL_reg_state, state, 1, struct re_save_state); 9889 9890 PL_reg_start_tmp = 0; 9891 PL_reg_start_tmpl = 0; 9892 PL_reg_oldsaved = NULL; 9893 PL_reg_oldsavedlen = 0; 9894 PL_reg_maxiter = 0; 9895 PL_reg_leftiter = 0; 9896 PL_reg_poscache = NULL; 9897 PL_reg_poscache_size = 0; 9898 #ifdef PERL_OLD_COPY_ON_WRITE 9899 PL_nrs = NULL; 9900 #endif 9901 9902 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */ 9903 if (PL_curpm) { 9904 const REGEXP * const rx = PM_GETRE(PL_curpm); 9905 if (rx) { 9906 U32 i; 9907 for (i = 1; i <= RX_NPARENS(rx); i++) { 9908 char digits[TYPE_CHARS(long)]; 9909 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i); 9910 GV *const *const gvp 9911 = (GV**)hv_fetch(PL_defstash, digits, len, 0); 9912 9913 if (gvp) { 9914 GV * const gv = *gvp; 9915 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv)) 9916 save_scalar(gv); 9917 } 9918 } 9919 } 9920 } 9921 } 9922 #endif 9923 9924 static void 9925 clear_re(pTHX_ void *r) 9926 { 9927 dVAR; 9928 ReREFCNT_dec((REGEXP *)r); 9929 } 9930 9931 #ifdef DEBUGGING 9932 9933 STATIC void 9934 S_put_byte(pTHX_ SV *sv, int c) 9935 { 9936 PERL_ARGS_ASSERT_PUT_BYTE; 9937 9938 /* Our definition of isPRINT() ignores locales, so only bytes that are 9939 not part of UTF-8 are considered printable. I assume that the same 9940 holds for UTF-EBCDIC. 9941 Also, code point 255 is not printable in either (it's E0 in EBCDIC, 9942 which Wikipedia says: 9943 9944 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all 9945 ones (binary 1111 1111, hexadecimal FF). It is similar, but not 9946 identical, to the ASCII delete (DEL) or rubout control character. 9947 ) So the old condition can be simplified to !isPRINT(c) */ 9948 if (!isPRINT(c)) 9949 Perl_sv_catpvf(aTHX_ sv, "\\%o", c); 9950 else { 9951 const char string = c; 9952 if (c == '-' || c == ']' || c == '\\' || c == '^') 9953 sv_catpvs(sv, "\\"); 9954 sv_catpvn(sv, &string, 1); 9955 } 9956 } 9957 9958 9959 #define CLEAR_OPTSTART \ 9960 if (optstart) STMT_START { \ 9961 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \ 9962 optstart=NULL; \ 9963 } STMT_END 9964 9965 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1); 9966 9967 STATIC const regnode * 9968 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, 9969 const regnode *last, const regnode *plast, 9970 SV* sv, I32 indent, U32 depth) 9971 { 9972 dVAR; 9973 register U8 op = PSEUDO; /* Arbitrary non-END op. */ 9974 register const regnode *next; 9975 const regnode *optstart= NULL; 9976 9977 RXi_GET_DECL(r,ri); 9978 GET_RE_DEBUG_FLAGS_DECL; 9979 9980 PERL_ARGS_ASSERT_DUMPUNTIL; 9981 9982 #ifdef DEBUG_DUMPUNTIL 9983 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start, 9984 last ? last-start : 0,plast ? plast-start : 0); 9985 #endif 9986 9987 if (plast && plast < last) 9988 last= plast; 9989 9990 while (PL_regkind[op] != END && (!last || node < last)) { 9991 /* While that wasn't END last time... */ 9992 NODE_ALIGN(node); 9993 op = OP(node); 9994 if (op == CLOSE || op == WHILEM) 9995 indent--; 9996 next = regnext((regnode *)node); 9997 9998 /* Where, what. */ 9999 if (OP(node) == OPTIMIZED) { 10000 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE)) 10001 optstart = node; 10002 else 10003 goto after_print; 10004 } else 10005 CLEAR_OPTSTART; 10006 10007 regprop(r, sv, node); 10008 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start), 10009 (int)(2*indent + 1), "", SvPVX_const(sv)); 10010 10011 if (OP(node) != OPTIMIZED) { 10012 if (next == NULL) /* Next ptr. */ 10013 PerlIO_printf(Perl_debug_log, " (0)"); 10014 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH ) 10015 PerlIO_printf(Perl_debug_log, " (FAIL)"); 10016 else 10017 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start)); 10018 (void)PerlIO_putc(Perl_debug_log, '\n'); 10019 } 10020 10021 after_print: 10022 if (PL_regkind[(U8)op] == BRANCHJ) { 10023 assert(next); 10024 { 10025 register const regnode *nnode = (OP(next) == LONGJMP 10026 ? regnext((regnode *)next) 10027 : next); 10028 if (last && nnode > last) 10029 nnode = last; 10030 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode); 10031 } 10032 } 10033 else if (PL_regkind[(U8)op] == BRANCH) { 10034 assert(next); 10035 DUMPUNTIL(NEXTOPER(node), next); 10036 } 10037 else if ( PL_regkind[(U8)op] == TRIE ) { 10038 const regnode *this_trie = node; 10039 const char op = OP(node); 10040 const U32 n = ARG(node); 10041 const reg_ac_data * const ac = op>=AHOCORASICK ? 10042 (reg_ac_data *)ri->data->data[n] : 10043 NULL; 10044 const reg_trie_data * const trie = 10045 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie]; 10046 #ifdef DEBUGGING 10047 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]); 10048 #endif 10049 const regnode *nextbranch= NULL; 10050 I32 word_idx; 10051 sv_setpvs(sv, ""); 10052 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) { 10053 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0); 10054 10055 PerlIO_printf(Perl_debug_log, "%*s%s ", 10056 (int)(2*(indent+3)), "", 10057 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60, 10058 PL_colors[0], PL_colors[1], 10059 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) | 10060 PERL_PV_PRETTY_ELLIPSES | 10061 PERL_PV_PRETTY_LTGT 10062 ) 10063 : "???" 10064 ); 10065 if (trie->jump) { 10066 U16 dist= trie->jump[word_idx+1]; 10067 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n", 10068 (UV)((dist ? this_trie + dist : next) - start)); 10069 if (dist) { 10070 if (!nextbranch) 10071 nextbranch= this_trie + trie->jump[0]; 10072 DUMPUNTIL(this_trie + dist, nextbranch); 10073 } 10074 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH) 10075 nextbranch= regnext((regnode *)nextbranch); 10076 } else { 10077 PerlIO_printf(Perl_debug_log, "\n"); 10078 } 10079 } 10080 if (last && next > last) 10081 node= last; 10082 else 10083 node= next; 10084 } 10085 else if ( op == CURLY ) { /* "next" might be very big: optimizer */ 10086 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, 10087 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1); 10088 } 10089 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) { 10090 assert(next); 10091 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next); 10092 } 10093 else if ( op == PLUS || op == STAR) { 10094 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1); 10095 } 10096 else if (op == ANYOF) { 10097 /* arglen 1 + class block */ 10098 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE) 10099 ? ANYOF_CLASS_SKIP : ANYOF_SKIP); 10100 node = NEXTOPER(node); 10101 } 10102 else if (PL_regkind[(U8)op] == EXACT) { 10103 /* Literal string, where present. */ 10104 node += NODE_SZ_STR(node) - 1; 10105 node = NEXTOPER(node); 10106 } 10107 else { 10108 node = NEXTOPER(node); 10109 node += regarglen[(U8)op]; 10110 } 10111 if (op == CURLYX || op == OPEN) 10112 indent++; 10113 } 10114 CLEAR_OPTSTART; 10115 #ifdef DEBUG_DUMPUNTIL 10116 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent); 10117 #endif 10118 return node; 10119 } 10120 10121 #endif /* DEBUGGING */ 10122 10123 /* 10124 * Local variables: 10125 * c-indentation-style: bsd 10126 * c-basic-offset: 4 10127 * indent-tabs-mode: t 10128 * End: 10129 * 10130 * ex: set ts=8 sts=4 sw=4 noet: 10131 */ 10132