1 /* regcomp.c 2 */ 3 4 /* 5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee 6 */ 7 8 /* This file contains functions for compiling a regular expression. See 9 * also regexec.c which funnily enough, contains functions for executing 10 * a regular expression. 11 * 12 * This file is also copied at build time to ext/re/re_comp.c, where 13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT. 14 * This causes the main functions to be compiled under new names and with 15 * debugging support added, which makes "use re 'debug'" work. 16 */ 17 18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not 19 * confused with the original package (see point 3 below). Thanks, Henry! 20 */ 21 22 /* Additional note: this code is very heavily munged from Henry's version 23 * in places. In some spots I've traded clarity for efficiency, so don't 24 * blame Henry for some of the lack of readability. 25 */ 26 27 /* The names of the functions have been changed from regcomp and 28 * regexec to pregcomp and pregexec in order to avoid conflicts 29 * with the POSIX routines of the same names. 30 */ 31 32 #ifdef PERL_EXT_RE_BUILD 33 #include "re_top.h" 34 #endif 35 36 /* 37 * pregcomp and pregexec -- regsub and regerror are not used in perl 38 * 39 * Copyright (c) 1986 by University of Toronto. 40 * Written by Henry Spencer. Not derived from licensed software. 41 * 42 * Permission is granted to anyone to use this software for any 43 * purpose on any computer system, and to redistribute it freely, 44 * subject to the following restrictions: 45 * 46 * 1. The author is not responsible for the consequences of use of 47 * this software, no matter how awful, even if they arise 48 * from defects in it. 49 * 50 * 2. The origin of this software must not be misrepresented, either 51 * by explicit claim or by omission. 52 * 53 * 3. Altered versions must be plainly marked as such, and must not 54 * be misrepresented as being the original software. 55 * 56 * 57 **** Alterations to Henry's code are... 58 **** 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007 by Larry Wall and others 61 **** 62 **** You may distribute under the terms of either the GNU General Public 63 **** License or the Artistic License, as specified in the README file. 64 65 * 66 * Beware that some of this code is subtly aware of the way operator 67 * precedence is structured in regular expressions. Serious changes in 68 * regular-expression syntax might require a total rethink. 69 */ 70 #include "EXTERN.h" 71 #define PERL_IN_REGCOMP_C 72 #include "perl.h" 73 74 #ifndef PERL_IN_XSUB_RE 75 # include "INTERN.h" 76 #endif 77 78 #define REG_COMP_C 79 #ifdef PERL_IN_XSUB_RE 80 # include "re_comp.h" 81 #else 82 # include "regcomp.h" 83 #endif 84 85 #ifdef op 86 #undef op 87 #endif /* op */ 88 89 #ifdef MSDOS 90 # if defined(BUGGY_MSC6) 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */ 92 # pragma optimize("a",off) 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/ 94 # pragma optimize("w",on ) 95 # endif /* BUGGY_MSC6 */ 96 #endif /* MSDOS */ 97 98 #ifndef STATIC 99 #define STATIC static 100 #endif 101 102 typedef struct RExC_state_t { 103 U32 flags; /* are we folding, multilining? */ 104 char *precomp; /* uncompiled string. */ 105 regexp *rx; /* perl core regexp structure */ 106 regexp_internal *rxi; /* internal data for regexp object pprivate field */ 107 char *start; /* Start of input for compile */ 108 char *end; /* End of input for compile */ 109 char *parse; /* Input-scan pointer. */ 110 I32 whilem_seen; /* number of WHILEM in this expr */ 111 regnode *emit_start; /* Start of emitted-code area */ 112 regnode *emit_bound; /* First regnode outside of the allocated space */ 113 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */ 114 I32 naughty; /* How bad is this pattern? */ 115 I32 sawback; /* Did we see \1, ...? */ 116 U32 seen; 117 I32 size; /* Code size. */ 118 I32 npar; /* Capture buffer count, (OPEN). */ 119 I32 cpar; /* Capture buffer count, (CLOSE). */ 120 I32 nestroot; /* root parens we are in - used by accept */ 121 I32 extralen; 122 I32 seen_zerolen; 123 I32 seen_evals; 124 regnode **open_parens; /* pointers to open parens */ 125 regnode **close_parens; /* pointers to close parens */ 126 regnode *opend; /* END node in program */ 127 I32 utf8; /* whether the pattern is utf8 or not */ 128 I32 orig_utf8; /* whether the pattern was originally in utf8 */ 129 /* XXX use this for future optimisation of case 130 * where pattern must be upgraded to utf8. */ 131 HV *charnames; /* cache of named sequences */ 132 HV *paren_names; /* Paren names */ 133 134 regnode **recurse; /* Recurse regops */ 135 I32 recurse_count; /* Number of recurse regops */ 136 #if ADD_TO_REGEXEC 137 char *starttry; /* -Dr: where regtry was called. */ 138 #define RExC_starttry (pRExC_state->starttry) 139 #endif 140 #ifdef DEBUGGING 141 const char *lastparse; 142 I32 lastnum; 143 AV *paren_name_list; /* idx -> name */ 144 #define RExC_lastparse (pRExC_state->lastparse) 145 #define RExC_lastnum (pRExC_state->lastnum) 146 #define RExC_paren_name_list (pRExC_state->paren_name_list) 147 #endif 148 } RExC_state_t; 149 150 #define RExC_flags (pRExC_state->flags) 151 #define RExC_precomp (pRExC_state->precomp) 152 #define RExC_rx (pRExC_state->rx) 153 #define RExC_rxi (pRExC_state->rxi) 154 #define RExC_start (pRExC_state->start) 155 #define RExC_end (pRExC_state->end) 156 #define RExC_parse (pRExC_state->parse) 157 #define RExC_whilem_seen (pRExC_state->whilem_seen) 158 #ifdef RE_TRACK_PATTERN_OFFSETS 159 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */ 160 #endif 161 #define RExC_emit (pRExC_state->emit) 162 #define RExC_emit_start (pRExC_state->emit_start) 163 #define RExC_emit_bound (pRExC_state->emit_bound) 164 #define RExC_naughty (pRExC_state->naughty) 165 #define RExC_sawback (pRExC_state->sawback) 166 #define RExC_seen (pRExC_state->seen) 167 #define RExC_size (pRExC_state->size) 168 #define RExC_npar (pRExC_state->npar) 169 #define RExC_nestroot (pRExC_state->nestroot) 170 #define RExC_extralen (pRExC_state->extralen) 171 #define RExC_seen_zerolen (pRExC_state->seen_zerolen) 172 #define RExC_seen_evals (pRExC_state->seen_evals) 173 #define RExC_utf8 (pRExC_state->utf8) 174 #define RExC_orig_utf8 (pRExC_state->orig_utf8) 175 #define RExC_charnames (pRExC_state->charnames) 176 #define RExC_open_parens (pRExC_state->open_parens) 177 #define RExC_close_parens (pRExC_state->close_parens) 178 #define RExC_opend (pRExC_state->opend) 179 #define RExC_paren_names (pRExC_state->paren_names) 180 #define RExC_recurse (pRExC_state->recurse) 181 #define RExC_recurse_count (pRExC_state->recurse_count) 182 183 184 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?') 185 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \ 186 ((*s) == '{' && regcurly(s))) 187 188 #ifdef SPSTART 189 #undef SPSTART /* dratted cpp namespace... */ 190 #endif 191 /* 192 * Flags to be passed up and down. 193 */ 194 #define WORST 0 /* Worst case. */ 195 #define HASWIDTH 0x01 /* Known to match non-null strings. */ 196 #define SIMPLE 0x02 /* Simple enough to be STAR/PLUS operand. */ 197 #define SPSTART 0x04 /* Starts with * or +. */ 198 #define TRYAGAIN 0x08 /* Weeded out a declaration. */ 199 #define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */ 200 201 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1) 202 203 /* whether trie related optimizations are enabled */ 204 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION 205 #define TRIE_STUDY_OPT 206 #define FULL_TRIE_STUDY 207 #define TRIE_STCLASS 208 #endif 209 210 211 212 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3] 213 #define PBITVAL(paren) (1 << ((paren) & 7)) 214 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren)) 215 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren) 216 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren)) 217 218 219 /* About scan_data_t. 220 221 During optimisation we recurse through the regexp program performing 222 various inplace (keyhole style) optimisations. In addition study_chunk 223 and scan_commit populate this data structure with information about 224 what strings MUST appear in the pattern. We look for the longest 225 string that must appear for at a fixed location, and we look for the 226 longest string that may appear at a floating location. So for instance 227 in the pattern: 228 229 /FOO[xX]A.*B[xX]BAR/ 230 231 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating 232 strings (because they follow a .* construct). study_chunk will identify 233 both FOO and BAR as being the longest fixed and floating strings respectively. 234 235 The strings can be composites, for instance 236 237 /(f)(o)(o)/ 238 239 will result in a composite fixed substring 'foo'. 240 241 For each string some basic information is maintained: 242 243 - offset or min_offset 244 This is the position the string must appear at, or not before. 245 It also implicitly (when combined with minlenp) tells us how many 246 character must match before the string we are searching. 247 Likewise when combined with minlenp and the length of the string 248 tells us how many characters must appear after the string we have 249 found. 250 251 - max_offset 252 Only used for floating strings. This is the rightmost point that 253 the string can appear at. Ifset to I32 max it indicates that the 254 string can occur infinitely far to the right. 255 256 - minlenp 257 A pointer to the minimum length of the pattern that the string 258 was found inside. This is important as in the case of positive 259 lookahead or positive lookbehind we can have multiple patterns 260 involved. Consider 261 262 /(?=FOO).*F/ 263 264 The minimum length of the pattern overall is 3, the minimum length 265 of the lookahead part is 3, but the minimum length of the part that 266 will actually match is 1. So 'FOO's minimum length is 3, but the 267 minimum length for the F is 1. This is important as the minimum length 268 is used to determine offsets in front of and behind the string being 269 looked for. Since strings can be composites this is the length of the 270 pattern at the time it was commited with a scan_commit. Note that 271 the length is calculated by study_chunk, so that the minimum lengths 272 are not known until the full pattern has been compiled, thus the 273 pointer to the value. 274 275 - lookbehind 276 277 In the case of lookbehind the string being searched for can be 278 offset past the start point of the final matching string. 279 If this value was just blithely removed from the min_offset it would 280 invalidate some of the calculations for how many chars must match 281 before or after (as they are derived from min_offset and minlen and 282 the length of the string being searched for). 283 When the final pattern is compiled and the data is moved from the 284 scan_data_t structure into the regexp structure the information 285 about lookbehind is factored in, with the information that would 286 have been lost precalculated in the end_shift field for the 287 associated string. 288 289 The fields pos_min and pos_delta are used to store the minimum offset 290 and the delta to the maximum offset at the current point in the pattern. 291 292 */ 293 294 typedef struct scan_data_t { 295 /*I32 len_min; unused */ 296 /*I32 len_delta; unused */ 297 I32 pos_min; 298 I32 pos_delta; 299 SV *last_found; 300 I32 last_end; /* min value, <0 unless valid. */ 301 I32 last_start_min; 302 I32 last_start_max; 303 SV **longest; /* Either &l_fixed, or &l_float. */ 304 SV *longest_fixed; /* longest fixed string found in pattern */ 305 I32 offset_fixed; /* offset where it starts */ 306 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */ 307 I32 lookbehind_fixed; /* is the position of the string modfied by LB */ 308 SV *longest_float; /* longest floating string found in pattern */ 309 I32 offset_float_min; /* earliest point in string it can appear */ 310 I32 offset_float_max; /* latest point in string it can appear */ 311 I32 *minlen_float; /* pointer to the minlen relevent to the string */ 312 I32 lookbehind_float; /* is the position of the string modified by LB */ 313 I32 flags; 314 I32 whilem_c; 315 I32 *last_closep; 316 struct regnode_charclass_class *start_class; 317 } scan_data_t; 318 319 /* 320 * Forward declarations for pregcomp()'s friends. 321 */ 322 323 static const scan_data_t zero_scan_data = 324 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0}; 325 326 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL) 327 #define SF_BEFORE_SEOL 0x0001 328 #define SF_BEFORE_MEOL 0x0002 329 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL) 330 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL) 331 332 #ifdef NO_UNARY_PLUS 333 # define SF_FIX_SHIFT_EOL (0+2) 334 # define SF_FL_SHIFT_EOL (0+4) 335 #else 336 # define SF_FIX_SHIFT_EOL (+2) 337 # define SF_FL_SHIFT_EOL (+4) 338 #endif 339 340 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL) 341 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL) 342 343 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL) 344 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */ 345 #define SF_IS_INF 0x0040 346 #define SF_HAS_PAR 0x0080 347 #define SF_IN_PAR 0x0100 348 #define SF_HAS_EVAL 0x0200 349 #define SCF_DO_SUBSTR 0x0400 350 #define SCF_DO_STCLASS_AND 0x0800 351 #define SCF_DO_STCLASS_OR 0x1000 352 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR) 353 #define SCF_WHILEM_VISITED_POS 0x2000 354 355 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */ 356 #define SCF_SEEN_ACCEPT 0x8000 357 358 #define UTF (RExC_utf8 != 0) 359 #define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0) 360 #define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0) 361 362 #define OOB_UNICODE 12345678 363 #define OOB_NAMEDCLASS -1 364 365 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv)) 366 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b) 367 368 369 /* length of regex to show in messages that don't mark a position within */ 370 #define RegexLengthToShowInErrorMessages 127 371 372 /* 373 * If MARKER[12] are adjusted, be sure to adjust the constants at the top 374 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in 375 * op/pragma/warn/regcomp. 376 */ 377 #define MARKER1 "<-- HERE" /* marker as it appears in the description */ 378 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */ 379 380 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/" 381 382 /* 383 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given 384 * arg. Show regex, up to a maximum length. If it's too long, chop and add 385 * "...". 386 */ 387 #define _FAIL(code) STMT_START { \ 388 const char *ellipses = ""; \ 389 IV len = RExC_end - RExC_precomp; \ 390 \ 391 if (!SIZE_ONLY) \ 392 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \ 393 if (len > RegexLengthToShowInErrorMessages) { \ 394 /* chop 10 shorter than the max, to ensure meaning of "..." */ \ 395 len = RegexLengthToShowInErrorMessages - 10; \ 396 ellipses = "..."; \ 397 } \ 398 code; \ 399 } STMT_END 400 401 #define FAIL(msg) _FAIL( \ 402 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \ 403 msg, (int)len, RExC_precomp, ellipses)) 404 405 #define FAIL2(msg,arg) _FAIL( \ 406 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \ 407 arg, (int)len, RExC_precomp, ellipses)) 408 409 /* 410 * Simple_vFAIL -- like FAIL, but marks the current location in the scan 411 */ 412 #define Simple_vFAIL(m) STMT_START { \ 413 const IV offset = RExC_parse - RExC_precomp; \ 414 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \ 415 m, (int)offset, RExC_precomp, RExC_precomp + offset); \ 416 } STMT_END 417 418 /* 419 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL() 420 */ 421 #define vFAIL(m) STMT_START { \ 422 if (!SIZE_ONLY) \ 423 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \ 424 Simple_vFAIL(m); \ 425 } STMT_END 426 427 /* 428 * Like Simple_vFAIL(), but accepts two arguments. 429 */ 430 #define Simple_vFAIL2(m,a1) STMT_START { \ 431 const IV offset = RExC_parse - RExC_precomp; \ 432 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \ 433 (int)offset, RExC_precomp, RExC_precomp + offset); \ 434 } STMT_END 435 436 /* 437 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2(). 438 */ 439 #define vFAIL2(m,a1) STMT_START { \ 440 if (!SIZE_ONLY) \ 441 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \ 442 Simple_vFAIL2(m, a1); \ 443 } STMT_END 444 445 446 /* 447 * Like Simple_vFAIL(), but accepts three arguments. 448 */ 449 #define Simple_vFAIL3(m, a1, a2) STMT_START { \ 450 const IV offset = RExC_parse - RExC_precomp; \ 451 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \ 452 (int)offset, RExC_precomp, RExC_precomp + offset); \ 453 } STMT_END 454 455 /* 456 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3(). 457 */ 458 #define vFAIL3(m,a1,a2) STMT_START { \ 459 if (!SIZE_ONLY) \ 460 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \ 461 Simple_vFAIL3(m, a1, a2); \ 462 } STMT_END 463 464 /* 465 * Like Simple_vFAIL(), but accepts four arguments. 466 */ 467 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \ 468 const IV offset = RExC_parse - RExC_precomp; \ 469 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \ 470 (int)offset, RExC_precomp, RExC_precomp + offset); \ 471 } STMT_END 472 473 #define vWARN(loc,m) STMT_START { \ 474 const IV offset = loc - RExC_precomp; \ 475 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \ 476 m, (int)offset, RExC_precomp, RExC_precomp + offset); \ 477 } STMT_END 478 479 #define vWARNdep(loc,m) STMT_START { \ 480 const IV offset = loc - RExC_precomp; \ 481 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \ 482 "%s" REPORT_LOCATION, \ 483 m, (int)offset, RExC_precomp, RExC_precomp + offset); \ 484 } STMT_END 485 486 487 #define vWARN2(loc, m, a1) STMT_START { \ 488 const IV offset = loc - RExC_precomp; \ 489 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ 490 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \ 491 } STMT_END 492 493 #define vWARN3(loc, m, a1, a2) STMT_START { \ 494 const IV offset = loc - RExC_precomp; \ 495 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ 496 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \ 497 } STMT_END 498 499 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \ 500 const IV offset = loc - RExC_precomp; \ 501 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ 502 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \ 503 } STMT_END 504 505 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \ 506 const IV offset = loc - RExC_precomp; \ 507 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ 508 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \ 509 } STMT_END 510 511 512 /* Allow for side effects in s */ 513 #define REGC(c,s) STMT_START { \ 514 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \ 515 } STMT_END 516 517 /* Macros for recording node offsets. 20001227 mjd@plover.com 518 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in 519 * element 2*n-1 of the array. Element #2n holds the byte length node #n. 520 * Element 0 holds the number n. 521 * Position is 1 indexed. 522 */ 523 #ifndef RE_TRACK_PATTERN_OFFSETS 524 #define Set_Node_Offset_To_R(node,byte) 525 #define Set_Node_Offset(node,byte) 526 #define Set_Cur_Node_Offset 527 #define Set_Node_Length_To_R(node,len) 528 #define Set_Node_Length(node,len) 529 #define Set_Node_Cur_Length(node) 530 #define Node_Offset(n) 531 #define Node_Length(n) 532 #define Set_Node_Offset_Length(node,offset,len) 533 #define ProgLen(ri) ri->u.proglen 534 #define SetProgLen(ri,x) ri->u.proglen = x 535 #else 536 #define ProgLen(ri) ri->u.offsets[0] 537 #define SetProgLen(ri,x) ri->u.offsets[0] = x 538 #define Set_Node_Offset_To_R(node,byte) STMT_START { \ 539 if (! SIZE_ONLY) { \ 540 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \ 541 __LINE__, (int)(node), (int)(byte))); \ 542 if((node) < 0) { \ 543 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \ 544 } else { \ 545 RExC_offsets[2*(node)-1] = (byte); \ 546 } \ 547 } \ 548 } STMT_END 549 550 #define Set_Node_Offset(node,byte) \ 551 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start) 552 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse) 553 554 #define Set_Node_Length_To_R(node,len) STMT_START { \ 555 if (! SIZE_ONLY) { \ 556 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \ 557 __LINE__, (int)(node), (int)(len))); \ 558 if((node) < 0) { \ 559 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \ 560 } else { \ 561 RExC_offsets[2*(node)] = (len); \ 562 } \ 563 } \ 564 } STMT_END 565 566 #define Set_Node_Length(node,len) \ 567 Set_Node_Length_To_R((node)-RExC_emit_start, len) 568 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len) 569 #define Set_Node_Cur_Length(node) \ 570 Set_Node_Length(node, RExC_parse - parse_start) 571 572 /* Get offsets and lengths */ 573 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1]) 574 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)]) 575 576 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \ 577 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \ 578 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \ 579 } STMT_END 580 #endif 581 582 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS 583 #define EXPERIMENTAL_INPLACESCAN 584 #endif /*RE_TRACK_PATTERN_OFFSETS*/ 585 586 #define DEBUG_STUDYDATA(str,data,depth) \ 587 DEBUG_OPTIMISE_MORE_r(if(data){ \ 588 PerlIO_printf(Perl_debug_log, \ 589 "%*s" str "Pos:%"IVdf"/%"IVdf \ 590 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \ 591 (int)(depth)*2, "", \ 592 (IV)((data)->pos_min), \ 593 (IV)((data)->pos_delta), \ 594 (UV)((data)->flags), \ 595 (IV)((data)->whilem_c), \ 596 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \ 597 is_inf ? "INF " : "" \ 598 ); \ 599 if ((data)->last_found) \ 600 PerlIO_printf(Perl_debug_log, \ 601 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \ 602 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \ 603 SvPVX_const((data)->last_found), \ 604 (IV)((data)->last_end), \ 605 (IV)((data)->last_start_min), \ 606 (IV)((data)->last_start_max), \ 607 ((data)->longest && \ 608 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \ 609 SvPVX_const((data)->longest_fixed), \ 610 (IV)((data)->offset_fixed), \ 611 ((data)->longest && \ 612 (data)->longest==&((data)->longest_float)) ? "*" : "", \ 613 SvPVX_const((data)->longest_float), \ 614 (IV)((data)->offset_float_min), \ 615 (IV)((data)->offset_float_max) \ 616 ); \ 617 PerlIO_printf(Perl_debug_log,"\n"); \ 618 }); 619 620 static void clear_re(pTHX_ void *r); 621 622 /* Mark that we cannot extend a found fixed substring at this point. 623 Update the longest found anchored substring and the longest found 624 floating substrings if needed. */ 625 626 STATIC void 627 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf) 628 { 629 const STRLEN l = CHR_SVLEN(data->last_found); 630 const STRLEN old_l = CHR_SVLEN(*data->longest); 631 GET_RE_DEBUG_FLAGS_DECL; 632 633 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) { 634 SvSetMagicSV(*data->longest, data->last_found); 635 if (*data->longest == data->longest_fixed) { 636 data->offset_fixed = l ? data->last_start_min : data->pos_min; 637 if (data->flags & SF_BEFORE_EOL) 638 data->flags 639 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL); 640 else 641 data->flags &= ~SF_FIX_BEFORE_EOL; 642 data->minlen_fixed=minlenp; 643 data->lookbehind_fixed=0; 644 } 645 else { /* *data->longest == data->longest_float */ 646 data->offset_float_min = l ? data->last_start_min : data->pos_min; 647 data->offset_float_max = (l 648 ? data->last_start_max 649 : data->pos_min + data->pos_delta); 650 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX) 651 data->offset_float_max = I32_MAX; 652 if (data->flags & SF_BEFORE_EOL) 653 data->flags 654 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL); 655 else 656 data->flags &= ~SF_FL_BEFORE_EOL; 657 data->minlen_float=minlenp; 658 data->lookbehind_float=0; 659 } 660 } 661 SvCUR_set(data->last_found, 0); 662 { 663 SV * const sv = data->last_found; 664 if (SvUTF8(sv) && SvMAGICAL(sv)) { 665 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8); 666 if (mg) 667 mg->mg_len = 0; 668 } 669 } 670 data->last_end = -1; 671 data->flags &= ~SF_BEFORE_EOL; 672 DEBUG_STUDYDATA("commit: ",data,0); 673 } 674 675 /* Can match anything (initialization) */ 676 STATIC void 677 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) 678 { 679 ANYOF_CLASS_ZERO(cl); 680 ANYOF_BITMAP_SETALL(cl); 681 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL; 682 if (LOC) 683 cl->flags |= ANYOF_LOCALE; 684 } 685 686 /* Can match anything (initialization) */ 687 STATIC int 688 S_cl_is_anything(const struct regnode_charclass_class *cl) 689 { 690 int value; 691 692 for (value = 0; value <= ANYOF_MAX; value += 2) 693 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1)) 694 return 1; 695 if (!(cl->flags & ANYOF_UNICODE_ALL)) 696 return 0; 697 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl)) 698 return 0; 699 return 1; 700 } 701 702 /* Can match anything (initialization) */ 703 STATIC void 704 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) 705 { 706 Zero(cl, 1, struct regnode_charclass_class); 707 cl->type = ANYOF; 708 cl_anything(pRExC_state, cl); 709 } 710 711 STATIC void 712 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) 713 { 714 Zero(cl, 1, struct regnode_charclass_class); 715 cl->type = ANYOF; 716 cl_anything(pRExC_state, cl); 717 if (LOC) 718 cl->flags |= ANYOF_LOCALE; 719 } 720 721 /* 'And' a given class with another one. Can create false positives */ 722 /* We assume that cl is not inverted */ 723 STATIC void 724 S_cl_and(struct regnode_charclass_class *cl, 725 const struct regnode_charclass_class *and_with) 726 { 727 728 assert(and_with->type == ANYOF); 729 if (!(and_with->flags & ANYOF_CLASS) 730 && !(cl->flags & ANYOF_CLASS) 731 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) 732 && !(and_with->flags & ANYOF_FOLD) 733 && !(cl->flags & ANYOF_FOLD)) { 734 int i; 735 736 if (and_with->flags & ANYOF_INVERT) 737 for (i = 0; i < ANYOF_BITMAP_SIZE; i++) 738 cl->bitmap[i] &= ~and_with->bitmap[i]; 739 else 740 for (i = 0; i < ANYOF_BITMAP_SIZE; i++) 741 cl->bitmap[i] &= and_with->bitmap[i]; 742 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */ 743 if (!(and_with->flags & ANYOF_EOS)) 744 cl->flags &= ~ANYOF_EOS; 745 746 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE && 747 !(and_with->flags & ANYOF_INVERT)) { 748 cl->flags &= ~ANYOF_UNICODE_ALL; 749 cl->flags |= ANYOF_UNICODE; 750 ARG_SET(cl, ARG(and_with)); 751 } 752 if (!(and_with->flags & ANYOF_UNICODE_ALL) && 753 !(and_with->flags & ANYOF_INVERT)) 754 cl->flags &= ~ANYOF_UNICODE_ALL; 755 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) && 756 !(and_with->flags & ANYOF_INVERT)) 757 cl->flags &= ~ANYOF_UNICODE; 758 } 759 760 /* 'OR' a given class with another one. Can create false positives */ 761 /* We assume that cl is not inverted */ 762 STATIC void 763 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with) 764 { 765 if (or_with->flags & ANYOF_INVERT) { 766 /* We do not use 767 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2)) 768 * <= (B1 | !B2) | (CL1 | !CL2) 769 * which is wasteful if CL2 is small, but we ignore CL2: 770 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1 771 * XXXX Can we handle case-fold? Unclear: 772 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) = 773 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i')) 774 */ 775 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) 776 && !(or_with->flags & ANYOF_FOLD) 777 && !(cl->flags & ANYOF_FOLD) ) { 778 int i; 779 780 for (i = 0; i < ANYOF_BITMAP_SIZE; i++) 781 cl->bitmap[i] |= ~or_with->bitmap[i]; 782 } /* XXXX: logic is complicated otherwise */ 783 else { 784 cl_anything(pRExC_state, cl); 785 } 786 } else { 787 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */ 788 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) 789 && (!(or_with->flags & ANYOF_FOLD) 790 || (cl->flags & ANYOF_FOLD)) ) { 791 int i; 792 793 /* OR char bitmap and class bitmap separately */ 794 for (i = 0; i < ANYOF_BITMAP_SIZE; i++) 795 cl->bitmap[i] |= or_with->bitmap[i]; 796 if (or_with->flags & ANYOF_CLASS) { 797 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++) 798 cl->classflags[i] |= or_with->classflags[i]; 799 cl->flags |= ANYOF_CLASS; 800 } 801 } 802 else { /* XXXX: logic is complicated, leave it along for a moment. */ 803 cl_anything(pRExC_state, cl); 804 } 805 } 806 if (or_with->flags & ANYOF_EOS) 807 cl->flags |= ANYOF_EOS; 808 809 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE && 810 ARG(cl) != ARG(or_with)) { 811 cl->flags |= ANYOF_UNICODE_ALL; 812 cl->flags &= ~ANYOF_UNICODE; 813 } 814 if (or_with->flags & ANYOF_UNICODE_ALL) { 815 cl->flags |= ANYOF_UNICODE_ALL; 816 cl->flags &= ~ANYOF_UNICODE; 817 } 818 } 819 820 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ] 821 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid ) 822 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate ) 823 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 ) 824 825 826 #ifdef DEBUGGING 827 /* 828 dump_trie(trie,widecharmap,revcharmap) 829 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc) 830 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc) 831 832 These routines dump out a trie in a somewhat readable format. 833 The _interim_ variants are used for debugging the interim 834 tables that are used to generate the final compressed 835 representation which is what dump_trie expects. 836 837 Part of the reason for their existance is to provide a form 838 of documentation as to how the different representations function. 839 840 */ 841 842 /* 843 Dumps the final compressed table form of the trie to Perl_debug_log. 844 Used for debugging make_trie(). 845 */ 846 847 STATIC void 848 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, 849 AV *revcharmap, U32 depth) 850 { 851 U32 state; 852 SV *sv=sv_newmortal(); 853 int colwidth= widecharmap ? 6 : 4; 854 GET_RE_DEBUG_FLAGS_DECL; 855 856 857 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ", 858 (int)depth * 2 + 2,"", 859 "Match","Base","Ofs" ); 860 861 for( state = 0 ; state < trie->uniquecharcount ; state++ ) { 862 SV ** const tmp = av_fetch( revcharmap, state, 0); 863 if ( tmp ) { 864 PerlIO_printf( Perl_debug_log, "%*s", 865 colwidth, 866 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, 867 PL_colors[0], PL_colors[1], 868 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | 869 PERL_PV_ESCAPE_FIRSTCHAR 870 ) 871 ); 872 } 873 } 874 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------", 875 (int)depth * 2 + 2,""); 876 877 for( state = 0 ; state < trie->uniquecharcount ; state++ ) 878 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------"); 879 PerlIO_printf( Perl_debug_log, "\n"); 880 881 for( state = 1 ; state < trie->statecount ; state++ ) { 882 const U32 base = trie->states[ state ].trans.base; 883 884 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state); 885 886 if ( trie->states[ state ].wordnum ) { 887 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum ); 888 } else { 889 PerlIO_printf( Perl_debug_log, "%6s", "" ); 890 } 891 892 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base ); 893 894 if ( base ) { 895 U32 ofs = 0; 896 897 while( ( base + ofs < trie->uniquecharcount ) || 898 ( base + ofs - trie->uniquecharcount < trie->lasttrans 899 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state)) 900 ofs++; 901 902 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs); 903 904 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) { 905 if ( ( base + ofs >= trie->uniquecharcount ) && 906 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) && 907 trie->trans[ base + ofs - trie->uniquecharcount ].check == state ) 908 { 909 PerlIO_printf( Perl_debug_log, "%*"UVXf, 910 colwidth, 911 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next ); 912 } else { 913 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." ); 914 } 915 } 916 917 PerlIO_printf( Perl_debug_log, "]"); 918 919 } 920 PerlIO_printf( Perl_debug_log, "\n" ); 921 } 922 } 923 /* 924 Dumps a fully constructed but uncompressed trie in list form. 925 List tries normally only are used for construction when the number of 926 possible chars (trie->uniquecharcount) is very high. 927 Used for debugging make_trie(). 928 */ 929 STATIC void 930 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, 931 HV *widecharmap, AV *revcharmap, U32 next_alloc, 932 U32 depth) 933 { 934 U32 state; 935 SV *sv=sv_newmortal(); 936 int colwidth= widecharmap ? 6 : 4; 937 GET_RE_DEBUG_FLAGS_DECL; 938 /* print out the table precompression. */ 939 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s", 940 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"", 941 "------:-----+-----------------\n" ); 942 943 for( state=1 ; state < next_alloc ; state ++ ) { 944 U16 charid; 945 946 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :", 947 (int)depth * 2 + 2,"", (UV)state ); 948 if ( ! trie->states[ state ].wordnum ) { 949 PerlIO_printf( Perl_debug_log, "%5s| ",""); 950 } else { 951 PerlIO_printf( Perl_debug_log, "W%4x| ", 952 trie->states[ state ].wordnum 953 ); 954 } 955 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) { 956 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0); 957 if ( tmp ) { 958 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ", 959 colwidth, 960 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, 961 PL_colors[0], PL_colors[1], 962 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | 963 PERL_PV_ESCAPE_FIRSTCHAR 964 ) , 965 TRIE_LIST_ITEM(state,charid).forid, 966 (UV)TRIE_LIST_ITEM(state,charid).newstate 967 ); 968 if (!(charid % 10)) 969 PerlIO_printf(Perl_debug_log, "\n%*s| ", 970 (int)((depth * 2) + 14), ""); 971 } 972 } 973 PerlIO_printf( Perl_debug_log, "\n"); 974 } 975 } 976 977 /* 978 Dumps a fully constructed but uncompressed trie in table form. 979 This is the normal DFA style state transition table, with a few 980 twists to facilitate compression later. 981 Used for debugging make_trie(). 982 */ 983 STATIC void 984 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, 985 HV *widecharmap, AV *revcharmap, U32 next_alloc, 986 U32 depth) 987 { 988 U32 state; 989 U16 charid; 990 SV *sv=sv_newmortal(); 991 int colwidth= widecharmap ? 6 : 4; 992 GET_RE_DEBUG_FLAGS_DECL; 993 994 /* 995 print out the table precompression so that we can do a visual check 996 that they are identical. 997 */ 998 999 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" ); 1000 1001 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) { 1002 SV ** const tmp = av_fetch( revcharmap, charid, 0); 1003 if ( tmp ) { 1004 PerlIO_printf( Perl_debug_log, "%*s", 1005 colwidth, 1006 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, 1007 PL_colors[0], PL_colors[1], 1008 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | 1009 PERL_PV_ESCAPE_FIRSTCHAR 1010 ) 1011 ); 1012 } 1013 } 1014 1015 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" ); 1016 1017 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) { 1018 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------"); 1019 } 1020 1021 PerlIO_printf( Perl_debug_log, "\n" ); 1022 1023 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) { 1024 1025 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ", 1026 (int)depth * 2 + 2,"", 1027 (UV)TRIE_NODENUM( state ) ); 1028 1029 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) { 1030 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ); 1031 if (v) 1032 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v ); 1033 else 1034 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." ); 1035 } 1036 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) { 1037 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check ); 1038 } else { 1039 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check, 1040 trie->states[ TRIE_NODENUM( state ) ].wordnum ); 1041 } 1042 } 1043 } 1044 1045 #endif 1046 1047 /* make_trie(startbranch,first,last,tail,word_count,flags,depth) 1048 startbranch: the first branch in the whole branch sequence 1049 first : start branch of sequence of branch-exact nodes. 1050 May be the same as startbranch 1051 last : Thing following the last branch. 1052 May be the same as tail. 1053 tail : item following the branch sequence 1054 count : words in the sequence 1055 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/ 1056 depth : indent depth 1057 1058 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node. 1059 1060 A trie is an N'ary tree where the branches are determined by digital 1061 decomposition of the key. IE, at the root node you look up the 1st character and 1062 follow that branch repeat until you find the end of the branches. Nodes can be 1063 marked as "accepting" meaning they represent a complete word. Eg: 1064 1065 /he|she|his|hers/ 1066 1067 would convert into the following structure. Numbers represent states, letters 1068 following numbers represent valid transitions on the letter from that state, if 1069 the number is in square brackets it represents an accepting state, otherwise it 1070 will be in parenthesis. 1071 1072 +-h->+-e->[3]-+-r->(8)-+-s->[9] 1073 | | 1074 | (2) 1075 | | 1076 (1) +-i->(6)-+-s->[7] 1077 | 1078 +-s->(3)-+-h->(4)-+-e->[5] 1079 1080 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers) 1081 1082 This shows that when matching against the string 'hers' we will begin at state 1 1083 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting, 1084 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which 1085 is also accepting. Thus we know that we can match both 'he' and 'hers' with a 1086 single traverse. We store a mapping from accepting to state to which word was 1087 matched, and then when we have multiple possibilities we try to complete the 1088 rest of the regex in the order in which they occured in the alternation. 1089 1090 The only prior NFA like behaviour that would be changed by the TRIE support is 1091 the silent ignoring of duplicate alternations which are of the form: 1092 1093 / (DUPE|DUPE) X? (?{ ... }) Y /x 1094 1095 Thus EVAL blocks follwing a trie may be called a different number of times with 1096 and without the optimisation. With the optimisations dupes will be silently 1097 ignored. This inconsistant behaviour of EVAL type nodes is well established as 1098 the following demonstrates: 1099 1100 'words'=~/(word|word|word)(?{ print $1 })[xyz]/ 1101 1102 which prints out 'word' three times, but 1103 1104 'words'=~/(word|word|word)(?{ print $1 })S/ 1105 1106 which doesnt print it out at all. This is due to other optimisations kicking in. 1107 1108 Example of what happens on a structural level: 1109 1110 The regexp /(ac|ad|ab)+/ will produce the folowing debug output: 1111 1112 1: CURLYM[1] {1,32767}(18) 1113 5: BRANCH(8) 1114 6: EXACT <ac>(16) 1115 8: BRANCH(11) 1116 9: EXACT <ad>(16) 1117 11: BRANCH(14) 1118 12: EXACT <ab>(16) 1119 16: SUCCEED(0) 1120 17: NOTHING(18) 1121 18: END(0) 1122 1123 This would be optimizable with startbranch=5, first=5, last=16, tail=16 1124 and should turn into: 1125 1126 1: CURLYM[1] {1,32767}(18) 1127 5: TRIE(16) 1128 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1] 1129 <ac> 1130 <ad> 1131 <ab> 1132 16: SUCCEED(0) 1133 17: NOTHING(18) 1134 18: END(0) 1135 1136 Cases where tail != last would be like /(?foo|bar)baz/: 1137 1138 1: BRANCH(4) 1139 2: EXACT <foo>(8) 1140 4: BRANCH(7) 1141 5: EXACT <bar>(8) 1142 7: TAIL(8) 1143 8: EXACT <baz>(10) 1144 10: END(0) 1145 1146 which would be optimizable with startbranch=1, first=1, last=7, tail=8 1147 and would end up looking like: 1148 1149 1: TRIE(8) 1150 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1] 1151 <foo> 1152 <bar> 1153 7: TAIL(8) 1154 8: EXACT <baz>(10) 1155 10: END(0) 1156 1157 d = uvuni_to_utf8_flags(d, uv, 0); 1158 1159 is the recommended Unicode-aware way of saying 1160 1161 *(d++) = uv; 1162 */ 1163 1164 #define TRIE_STORE_REVCHAR \ 1165 STMT_START { \ 1166 if (UTF) { \ 1167 SV *zlopp = newSV(2); \ 1168 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \ 1169 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \ 1170 SvCUR_set(zlopp, kapow - flrbbbbb); \ 1171 SvPOK_on(zlopp); \ 1172 SvUTF8_on(zlopp); \ 1173 av_push(revcharmap, zlopp); \ 1174 } else { \ 1175 char ooooff = (char)uvc; \ 1176 av_push(revcharmap, newSVpvn(&ooooff, 1)); \ 1177 } \ 1178 } STMT_END 1179 1180 #define TRIE_READ_CHAR STMT_START { \ 1181 wordlen++; \ 1182 if ( UTF ) { \ 1183 if ( folder ) { \ 1184 if ( foldlen > 0 ) { \ 1185 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \ 1186 foldlen -= len; \ 1187 scan += len; \ 1188 len = 0; \ 1189 } else { \ 1190 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\ 1191 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \ 1192 foldlen -= UNISKIP( uvc ); \ 1193 scan = foldbuf + UNISKIP( uvc ); \ 1194 } \ 1195 } else { \ 1196 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\ 1197 } \ 1198 } else { \ 1199 uvc = (U32)*uc; \ 1200 len = 1; \ 1201 } \ 1202 } STMT_END 1203 1204 1205 1206 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \ 1207 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \ 1208 U32 ging = TRIE_LIST_LEN( state ) *= 2; \ 1209 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \ 1210 } \ 1211 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \ 1212 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \ 1213 TRIE_LIST_CUR( state )++; \ 1214 } STMT_END 1215 1216 #define TRIE_LIST_NEW(state) STMT_START { \ 1217 Newxz( trie->states[ state ].trans.list, \ 1218 4, reg_trie_trans_le ); \ 1219 TRIE_LIST_CUR( state ) = 1; \ 1220 TRIE_LIST_LEN( state ) = 4; \ 1221 } STMT_END 1222 1223 #define TRIE_HANDLE_WORD(state) STMT_START { \ 1224 U16 dupe= trie->states[ state ].wordnum; \ 1225 regnode * const noper_next = regnext( noper ); \ 1226 \ 1227 if (trie->wordlen) \ 1228 trie->wordlen[ curword ] = wordlen; \ 1229 DEBUG_r({ \ 1230 /* store the word for dumping */ \ 1231 SV* tmp; \ 1232 if (OP(noper) != NOTHING) \ 1233 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \ 1234 else \ 1235 tmp = newSVpvn( "", 0 ); \ 1236 if ( UTF ) SvUTF8_on( tmp ); \ 1237 av_push( trie_words, tmp ); \ 1238 }); \ 1239 \ 1240 curword++; \ 1241 \ 1242 if ( noper_next < tail ) { \ 1243 if (!trie->jump) \ 1244 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \ 1245 trie->jump[curword] = (U16)(noper_next - convert); \ 1246 if (!jumper) \ 1247 jumper = noper_next; \ 1248 if (!nextbranch) \ 1249 nextbranch= regnext(cur); \ 1250 } \ 1251 \ 1252 if ( dupe ) { \ 1253 /* So it's a dupe. This means we need to maintain a */\ 1254 /* linked-list from the first to the next. */\ 1255 /* we only allocate the nextword buffer when there */\ 1256 /* a dupe, so first time we have to do the allocation */\ 1257 if (!trie->nextword) \ 1258 trie->nextword = (U16 *) \ 1259 PerlMemShared_calloc( word_count + 1, sizeof(U16)); \ 1260 while ( trie->nextword[dupe] ) \ 1261 dupe= trie->nextword[dupe]; \ 1262 trie->nextword[dupe]= curword; \ 1263 } else { \ 1264 /* we haven't inserted this word yet. */ \ 1265 trie->states[ state ].wordnum = curword; \ 1266 } \ 1267 } STMT_END 1268 1269 1270 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \ 1271 ( ( base + charid >= ucharcount \ 1272 && base + charid < ubound \ 1273 && state == trie->trans[ base - ucharcount + charid ].check \ 1274 && trie->trans[ base - ucharcount + charid ].next ) \ 1275 ? trie->trans[ base - ucharcount + charid ].next \ 1276 : ( state==1 ? special : 0 ) \ 1277 ) 1278 1279 #define MADE_TRIE 1 1280 #define MADE_JUMP_TRIE 2 1281 #define MADE_EXACT_TRIE 4 1282 1283 STATIC I32 1284 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth) 1285 { 1286 dVAR; 1287 /* first pass, loop through and scan words */ 1288 reg_trie_data *trie; 1289 HV *widecharmap = NULL; 1290 AV *revcharmap = newAV(); 1291 regnode *cur; 1292 const U32 uniflags = UTF8_ALLOW_DEFAULT; 1293 STRLEN len = 0; 1294 UV uvc = 0; 1295 U16 curword = 0; 1296 U32 next_alloc = 0; 1297 regnode *jumper = NULL; 1298 regnode *nextbranch = NULL; 1299 regnode *convert = NULL; 1300 /* we just use folder as a flag in utf8 */ 1301 const U8 * const folder = ( flags == EXACTF 1302 ? PL_fold 1303 : ( flags == EXACTFL 1304 ? PL_fold_locale 1305 : NULL 1306 ) 1307 ); 1308 1309 #ifdef DEBUGGING 1310 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" ); 1311 AV *trie_words = NULL; 1312 /* along with revcharmap, this only used during construction but both are 1313 * useful during debugging so we store them in the struct when debugging. 1314 */ 1315 #else 1316 const U32 data_slot = add_data( pRExC_state, 2, "tu" ); 1317 STRLEN trie_charcount=0; 1318 #endif 1319 SV *re_trie_maxbuff; 1320 GET_RE_DEBUG_FLAGS_DECL; 1321 #ifndef DEBUGGING 1322 PERL_UNUSED_ARG(depth); 1323 #endif 1324 1325 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) ); 1326 trie->refcount = 1; 1327 trie->startstate = 1; 1328 trie->wordcount = word_count; 1329 RExC_rxi->data->data[ data_slot ] = (void*)trie; 1330 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) ); 1331 if (!(UTF && folder)) 1332 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 ); 1333 DEBUG_r({ 1334 trie_words = newAV(); 1335 }); 1336 1337 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1); 1338 if (!SvIOK(re_trie_maxbuff)) { 1339 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT); 1340 } 1341 DEBUG_OPTIMISE_r({ 1342 PerlIO_printf( Perl_debug_log, 1343 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n", 1344 (int)depth * 2 + 2, "", 1345 REG_NODE_NUM(startbranch),REG_NODE_NUM(first), 1346 REG_NODE_NUM(last), REG_NODE_NUM(tail), 1347 (int)depth); 1348 }); 1349 1350 /* Find the node we are going to overwrite */ 1351 if ( first == startbranch && OP( last ) != BRANCH ) { 1352 /* whole branch chain */ 1353 convert = first; 1354 } else { 1355 /* branch sub-chain */ 1356 convert = NEXTOPER( first ); 1357 } 1358 1359 /* -- First loop and Setup -- 1360 1361 We first traverse the branches and scan each word to determine if it 1362 contains widechars, and how many unique chars there are, this is 1363 important as we have to build a table with at least as many columns as we 1364 have unique chars. 1365 1366 We use an array of integers to represent the character codes 0..255 1367 (trie->charmap) and we use a an HV* to store Unicode characters. We use the 1368 native representation of the character value as the key and IV's for the 1369 coded index. 1370 1371 *TODO* If we keep track of how many times each character is used we can 1372 remap the columns so that the table compression later on is more 1373 efficient in terms of memory by ensuring most common value is in the 1374 middle and the least common are on the outside. IMO this would be better 1375 than a most to least common mapping as theres a decent chance the most 1376 common letter will share a node with the least common, meaning the node 1377 will not be compressable. With a middle is most common approach the worst 1378 case is when we have the least common nodes twice. 1379 1380 */ 1381 1382 for ( cur = first ; cur < last ; cur = regnext( cur ) ) { 1383 regnode * const noper = NEXTOPER( cur ); 1384 const U8 *uc = (U8*)STRING( noper ); 1385 const U8 * const e = uc + STR_LEN( noper ); 1386 STRLEN foldlen = 0; 1387 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; 1388 const U8 *scan = (U8*)NULL; 1389 U32 wordlen = 0; /* required init */ 1390 STRLEN chars = 0; 1391 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/ 1392 1393 if (OP(noper) == NOTHING) { 1394 trie->minlen= 0; 1395 continue; 1396 } 1397 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */ 1398 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte 1399 regardless of encoding */ 1400 1401 for ( ; uc < e ; uc += len ) { 1402 TRIE_CHARCOUNT(trie)++; 1403 TRIE_READ_CHAR; 1404 chars++; 1405 if ( uvc < 256 ) { 1406 if ( !trie->charmap[ uvc ] ) { 1407 trie->charmap[ uvc ]=( ++trie->uniquecharcount ); 1408 if ( folder ) 1409 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ]; 1410 TRIE_STORE_REVCHAR; 1411 } 1412 if ( set_bit ) { 1413 /* store the codepoint in the bitmap, and if its ascii 1414 also store its folded equivelent. */ 1415 TRIE_BITMAP_SET(trie,uvc); 1416 1417 /* store the folded codepoint */ 1418 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]); 1419 1420 if ( !UTF ) { 1421 /* store first byte of utf8 representation of 1422 codepoints in the 127 < uvc < 256 range */ 1423 if (127 < uvc && uvc < 192) { 1424 TRIE_BITMAP_SET(trie,194); 1425 } else if (191 < uvc ) { 1426 TRIE_BITMAP_SET(trie,195); 1427 /* && uvc < 256 -- we know uvc is < 256 already */ 1428 } 1429 } 1430 set_bit = 0; /* We've done our bit :-) */ 1431 } 1432 } else { 1433 SV** svpp; 1434 if ( !widecharmap ) 1435 widecharmap = newHV(); 1436 1437 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 ); 1438 1439 if ( !svpp ) 1440 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc ); 1441 1442 if ( !SvTRUE( *svpp ) ) { 1443 sv_setiv( *svpp, ++trie->uniquecharcount ); 1444 TRIE_STORE_REVCHAR; 1445 } 1446 } 1447 } 1448 if( cur == first ) { 1449 trie->minlen=chars; 1450 trie->maxlen=chars; 1451 } else if (chars < trie->minlen) { 1452 trie->minlen=chars; 1453 } else if (chars > trie->maxlen) { 1454 trie->maxlen=chars; 1455 } 1456 1457 } /* end first pass */ 1458 DEBUG_TRIE_COMPILE_r( 1459 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n", 1460 (int)depth * 2 + 2,"", 1461 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count, 1462 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount, 1463 (int)trie->minlen, (int)trie->maxlen ) 1464 ); 1465 trie->wordlen = (U32 *) PerlMemShared_calloc( word_count, sizeof(U32) ); 1466 1467 /* 1468 We now know what we are dealing with in terms of unique chars and 1469 string sizes so we can calculate how much memory a naive 1470 representation using a flat table will take. If it's over a reasonable 1471 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory 1472 conservative but potentially much slower representation using an array 1473 of lists. 1474 1475 At the end we convert both representations into the same compressed 1476 form that will be used in regexec.c for matching with. The latter 1477 is a form that cannot be used to construct with but has memory 1478 properties similar to the list form and access properties similar 1479 to the table form making it both suitable for fast searches and 1480 small enough that its feasable to store for the duration of a program. 1481 1482 See the comment in the code where the compressed table is produced 1483 inplace from the flat tabe representation for an explanation of how 1484 the compression works. 1485 1486 */ 1487 1488 1489 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) { 1490 /* 1491 Second Pass -- Array Of Lists Representation 1492 1493 Each state will be represented by a list of charid:state records 1494 (reg_trie_trans_le) the first such element holds the CUR and LEN 1495 points of the allocated array. (See defines above). 1496 1497 We build the initial structure using the lists, and then convert 1498 it into the compressed table form which allows faster lookups 1499 (but cant be modified once converted). 1500 */ 1501 1502 STRLEN transcount = 1; 1503 1504 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log, 1505 "%*sCompiling trie using list compiler\n", 1506 (int)depth * 2 + 2, "")); 1507 1508 trie->states = (reg_trie_state *) 1509 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2, 1510 sizeof(reg_trie_state) ); 1511 TRIE_LIST_NEW(1); 1512 next_alloc = 2; 1513 1514 for ( cur = first ; cur < last ; cur = regnext( cur ) ) { 1515 1516 regnode * const noper = NEXTOPER( cur ); 1517 U8 *uc = (U8*)STRING( noper ); 1518 const U8 * const e = uc + STR_LEN( noper ); 1519 U32 state = 1; /* required init */ 1520 U16 charid = 0; /* sanity init */ 1521 U8 *scan = (U8*)NULL; /* sanity init */ 1522 STRLEN foldlen = 0; /* required init */ 1523 U32 wordlen = 0; /* required init */ 1524 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; 1525 1526 if (OP(noper) != NOTHING) { 1527 for ( ; uc < e ; uc += len ) { 1528 1529 TRIE_READ_CHAR; 1530 1531 if ( uvc < 256 ) { 1532 charid = trie->charmap[ uvc ]; 1533 } else { 1534 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0); 1535 if ( !svpp ) { 1536 charid = 0; 1537 } else { 1538 charid=(U16)SvIV( *svpp ); 1539 } 1540 } 1541 /* charid is now 0 if we dont know the char read, or nonzero if we do */ 1542 if ( charid ) { 1543 1544 U16 check; 1545 U32 newstate = 0; 1546 1547 charid--; 1548 if ( !trie->states[ state ].trans.list ) { 1549 TRIE_LIST_NEW( state ); 1550 } 1551 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) { 1552 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) { 1553 newstate = TRIE_LIST_ITEM( state, check ).newstate; 1554 break; 1555 } 1556 } 1557 if ( ! newstate ) { 1558 newstate = next_alloc++; 1559 TRIE_LIST_PUSH( state, charid, newstate ); 1560 transcount++; 1561 } 1562 state = newstate; 1563 } else { 1564 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc ); 1565 } 1566 } 1567 } 1568 TRIE_HANDLE_WORD(state); 1569 1570 } /* end second pass */ 1571 1572 /* next alloc is the NEXT state to be allocated */ 1573 trie->statecount = next_alloc; 1574 trie->states = (reg_trie_state *) 1575 PerlMemShared_realloc( trie->states, 1576 next_alloc 1577 * sizeof(reg_trie_state) ); 1578 1579 /* and now dump it out before we compress it */ 1580 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap, 1581 revcharmap, next_alloc, 1582 depth+1) 1583 ); 1584 1585 trie->trans = (reg_trie_trans *) 1586 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) ); 1587 { 1588 U32 state; 1589 U32 tp = 0; 1590 U32 zp = 0; 1591 1592 1593 for( state=1 ; state < next_alloc ; state ++ ) { 1594 U32 base=0; 1595 1596 /* 1597 DEBUG_TRIE_COMPILE_MORE_r( 1598 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp) 1599 ); 1600 */ 1601 1602 if (trie->states[state].trans.list) { 1603 U16 minid=TRIE_LIST_ITEM( state, 1).forid; 1604 U16 maxid=minid; 1605 U16 idx; 1606 1607 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) { 1608 const U16 forid = TRIE_LIST_ITEM( state, idx).forid; 1609 if ( forid < minid ) { 1610 minid=forid; 1611 } else if ( forid > maxid ) { 1612 maxid=forid; 1613 } 1614 } 1615 if ( transcount < tp + maxid - minid + 1) { 1616 transcount *= 2; 1617 trie->trans = (reg_trie_trans *) 1618 PerlMemShared_realloc( trie->trans, 1619 transcount 1620 * sizeof(reg_trie_trans) ); 1621 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans ); 1622 } 1623 base = trie->uniquecharcount + tp - minid; 1624 if ( maxid == minid ) { 1625 U32 set = 0; 1626 for ( ; zp < tp ; zp++ ) { 1627 if ( ! trie->trans[ zp ].next ) { 1628 base = trie->uniquecharcount + zp - minid; 1629 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate; 1630 trie->trans[ zp ].check = state; 1631 set = 1; 1632 break; 1633 } 1634 } 1635 if ( !set ) { 1636 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate; 1637 trie->trans[ tp ].check = state; 1638 tp++; 1639 zp = tp; 1640 } 1641 } else { 1642 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) { 1643 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid; 1644 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate; 1645 trie->trans[ tid ].check = state; 1646 } 1647 tp += ( maxid - minid + 1 ); 1648 } 1649 Safefree(trie->states[ state ].trans.list); 1650 } 1651 /* 1652 DEBUG_TRIE_COMPILE_MORE_r( 1653 PerlIO_printf( Perl_debug_log, " base: %d\n",base); 1654 ); 1655 */ 1656 trie->states[ state ].trans.base=base; 1657 } 1658 trie->lasttrans = tp + 1; 1659 } 1660 } else { 1661 /* 1662 Second Pass -- Flat Table Representation. 1663 1664 we dont use the 0 slot of either trans[] or states[] so we add 1 to each. 1665 We know that we will need Charcount+1 trans at most to store the data 1666 (one row per char at worst case) So we preallocate both structures 1667 assuming worst case. 1668 1669 We then construct the trie using only the .next slots of the entry 1670 structs. 1671 1672 We use the .check field of the first entry of the node temporarily to 1673 make compression both faster and easier by keeping track of how many non 1674 zero fields are in the node. 1675 1676 Since trans are numbered from 1 any 0 pointer in the table is a FAIL 1677 transition. 1678 1679 There are two terms at use here: state as a TRIE_NODEIDX() which is a 1680 number representing the first entry of the node, and state as a 1681 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and 1682 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there 1683 are 2 entrys per node. eg: 1684 1685 A B A B 1686 1. 2 4 1. 3 7 1687 2. 0 3 3. 0 5 1688 3. 0 0 5. 0 0 1689 4. 0 0 7. 0 0 1690 1691 The table is internally in the right hand, idx form. However as we also 1692 have to deal with the states array which is indexed by nodenum we have to 1693 use TRIE_NODENUM() to convert. 1694 1695 */ 1696 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log, 1697 "%*sCompiling trie using table compiler\n", 1698 (int)depth * 2 + 2, "")); 1699 1700 trie->trans = (reg_trie_trans *) 1701 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 ) 1702 * trie->uniquecharcount + 1, 1703 sizeof(reg_trie_trans) ); 1704 trie->states = (reg_trie_state *) 1705 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2, 1706 sizeof(reg_trie_state) ); 1707 next_alloc = trie->uniquecharcount + 1; 1708 1709 1710 for ( cur = first ; cur < last ; cur = regnext( cur ) ) { 1711 1712 regnode * const noper = NEXTOPER( cur ); 1713 const U8 *uc = (U8*)STRING( noper ); 1714 const U8 * const e = uc + STR_LEN( noper ); 1715 1716 U32 state = 1; /* required init */ 1717 1718 U16 charid = 0; /* sanity init */ 1719 U32 accept_state = 0; /* sanity init */ 1720 U8 *scan = (U8*)NULL; /* sanity init */ 1721 1722 STRLEN foldlen = 0; /* required init */ 1723 U32 wordlen = 0; /* required init */ 1724 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; 1725 1726 if ( OP(noper) != NOTHING ) { 1727 for ( ; uc < e ; uc += len ) { 1728 1729 TRIE_READ_CHAR; 1730 1731 if ( uvc < 256 ) { 1732 charid = trie->charmap[ uvc ]; 1733 } else { 1734 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0); 1735 charid = svpp ? (U16)SvIV(*svpp) : 0; 1736 } 1737 if ( charid ) { 1738 charid--; 1739 if ( !trie->trans[ state + charid ].next ) { 1740 trie->trans[ state + charid ].next = next_alloc; 1741 trie->trans[ state ].check++; 1742 next_alloc += trie->uniquecharcount; 1743 } 1744 state = trie->trans[ state + charid ].next; 1745 } else { 1746 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc ); 1747 } 1748 /* charid is now 0 if we dont know the char read, or nonzero if we do */ 1749 } 1750 } 1751 accept_state = TRIE_NODENUM( state ); 1752 TRIE_HANDLE_WORD(accept_state); 1753 1754 } /* end second pass */ 1755 1756 /* and now dump it out before we compress it */ 1757 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap, 1758 revcharmap, 1759 next_alloc, depth+1)); 1760 1761 { 1762 /* 1763 * Inplace compress the table.* 1764 1765 For sparse data sets the table constructed by the trie algorithm will 1766 be mostly 0/FAIL transitions or to put it another way mostly empty. 1767 (Note that leaf nodes will not contain any transitions.) 1768 1769 This algorithm compresses the tables by eliminating most such 1770 transitions, at the cost of a modest bit of extra work during lookup: 1771 1772 - Each states[] entry contains a .base field which indicates the 1773 index in the state[] array wheres its transition data is stored. 1774 1775 - If .base is 0 there are no valid transitions from that node. 1776 1777 - If .base is nonzero then charid is added to it to find an entry in 1778 the trans array. 1779 1780 -If trans[states[state].base+charid].check!=state then the 1781 transition is taken to be a 0/Fail transition. Thus if there are fail 1782 transitions at the front of the node then the .base offset will point 1783 somewhere inside the previous nodes data (or maybe even into a node 1784 even earlier), but the .check field determines if the transition is 1785 valid. 1786 1787 XXX - wrong maybe? 1788 The following process inplace converts the table to the compressed 1789 table: We first do not compress the root node 1,and mark its all its 1790 .check pointers as 1 and set its .base pointer as 1 as well. This 1791 allows to do a DFA construction from the compressed table later, and 1792 ensures that any .base pointers we calculate later are greater than 1793 0. 1794 1795 - We set 'pos' to indicate the first entry of the second node. 1796 1797 - We then iterate over the columns of the node, finding the first and 1798 last used entry at l and m. We then copy l..m into pos..(pos+m-l), 1799 and set the .check pointers accordingly, and advance pos 1800 appropriately and repreat for the next node. Note that when we copy 1801 the next pointers we have to convert them from the original 1802 NODEIDX form to NODENUM form as the former is not valid post 1803 compression. 1804 1805 - If a node has no transitions used we mark its base as 0 and do not 1806 advance the pos pointer. 1807 1808 - If a node only has one transition we use a second pointer into the 1809 structure to fill in allocated fail transitions from other states. 1810 This pointer is independent of the main pointer and scans forward 1811 looking for null transitions that are allocated to a state. When it 1812 finds one it writes the single transition into the "hole". If the 1813 pointer doesnt find one the single transition is appended as normal. 1814 1815 - Once compressed we can Renew/realloc the structures to release the 1816 excess space. 1817 1818 See "Table-Compression Methods" in sec 3.9 of the Red Dragon, 1819 specifically Fig 3.47 and the associated pseudocode. 1820 1821 demq 1822 */ 1823 const U32 laststate = TRIE_NODENUM( next_alloc ); 1824 U32 state, charid; 1825 U32 pos = 0, zp=0; 1826 trie->statecount = laststate; 1827 1828 for ( state = 1 ; state < laststate ; state++ ) { 1829 U8 flag = 0; 1830 const U32 stateidx = TRIE_NODEIDX( state ); 1831 const U32 o_used = trie->trans[ stateidx ].check; 1832 U32 used = trie->trans[ stateidx ].check; 1833 trie->trans[ stateidx ].check = 0; 1834 1835 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) { 1836 if ( flag || trie->trans[ stateidx + charid ].next ) { 1837 if ( trie->trans[ stateidx + charid ].next ) { 1838 if (o_used == 1) { 1839 for ( ; zp < pos ; zp++ ) { 1840 if ( ! trie->trans[ zp ].next ) { 1841 break; 1842 } 1843 } 1844 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ; 1845 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next ); 1846 trie->trans[ zp ].check = state; 1847 if ( ++zp > pos ) pos = zp; 1848 break; 1849 } 1850 used--; 1851 } 1852 if ( !flag ) { 1853 flag = 1; 1854 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ; 1855 } 1856 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next ); 1857 trie->trans[ pos ].check = state; 1858 pos++; 1859 } 1860 } 1861 } 1862 trie->lasttrans = pos + 1; 1863 trie->states = (reg_trie_state *) 1864 PerlMemShared_realloc( trie->states, laststate 1865 * sizeof(reg_trie_state) ); 1866 DEBUG_TRIE_COMPILE_MORE_r( 1867 PerlIO_printf( Perl_debug_log, 1868 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n", 1869 (int)depth * 2 + 2,"", 1870 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ), 1871 (IV)next_alloc, 1872 (IV)pos, 1873 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc ); 1874 ); 1875 1876 } /* end table compress */ 1877 } 1878 DEBUG_TRIE_COMPILE_MORE_r( 1879 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n", 1880 (int)depth * 2 + 2, "", 1881 (UV)trie->statecount, 1882 (UV)trie->lasttrans) 1883 ); 1884 /* resize the trans array to remove unused space */ 1885 trie->trans = (reg_trie_trans *) 1886 PerlMemShared_realloc( trie->trans, trie->lasttrans 1887 * sizeof(reg_trie_trans) ); 1888 1889 /* and now dump out the compressed format */ 1890 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1)); 1891 1892 { /* Modify the program and insert the new TRIE node*/ 1893 U8 nodetype =(U8)(flags & 0xFF); 1894 char *str=NULL; 1895 1896 #ifdef DEBUGGING 1897 regnode *optimize = NULL; 1898 #ifdef RE_TRACK_PATTERN_OFFSETS 1899 1900 U32 mjd_offset = 0; 1901 U32 mjd_nodelen = 0; 1902 #endif /* RE_TRACK_PATTERN_OFFSETS */ 1903 #endif /* DEBUGGING */ 1904 /* 1905 This means we convert either the first branch or the first Exact, 1906 depending on whether the thing following (in 'last') is a branch 1907 or not and whther first is the startbranch (ie is it a sub part of 1908 the alternation or is it the whole thing.) 1909 Assuming its a sub part we conver the EXACT otherwise we convert 1910 the whole branch sequence, including the first. 1911 */ 1912 /* Find the node we are going to overwrite */ 1913 if ( first != startbranch || OP( last ) == BRANCH ) { 1914 /* branch sub-chain */ 1915 NEXT_OFF( first ) = (U16)(last - first); 1916 #ifdef RE_TRACK_PATTERN_OFFSETS 1917 DEBUG_r({ 1918 mjd_offset= Node_Offset((convert)); 1919 mjd_nodelen= Node_Length((convert)); 1920 }); 1921 #endif 1922 /* whole branch chain */ 1923 } 1924 #ifdef RE_TRACK_PATTERN_OFFSETS 1925 else { 1926 DEBUG_r({ 1927 const regnode *nop = NEXTOPER( convert ); 1928 mjd_offset= Node_Offset((nop)); 1929 mjd_nodelen= Node_Length((nop)); 1930 }); 1931 } 1932 DEBUG_OPTIMISE_r( 1933 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n", 1934 (int)depth * 2 + 2, "", 1935 (UV)mjd_offset, (UV)mjd_nodelen) 1936 ); 1937 #endif 1938 /* But first we check to see if there is a common prefix we can 1939 split out as an EXACT and put in front of the TRIE node. */ 1940 trie->startstate= 1; 1941 if ( trie->bitmap && !widecharmap && !trie->jump ) { 1942 U32 state; 1943 for ( state = 1 ; state < trie->statecount-1 ; state++ ) { 1944 U32 ofs = 0; 1945 I32 idx = -1; 1946 U32 count = 0; 1947 const U32 base = trie->states[ state ].trans.base; 1948 1949 if ( trie->states[state].wordnum ) 1950 count = 1; 1951 1952 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) { 1953 if ( ( base + ofs >= trie->uniquecharcount ) && 1954 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) && 1955 trie->trans[ base + ofs - trie->uniquecharcount ].check == state ) 1956 { 1957 if ( ++count > 1 ) { 1958 SV **tmp = av_fetch( revcharmap, ofs, 0); 1959 const U8 *ch = (U8*)SvPV_nolen_const( *tmp ); 1960 if ( state == 1 ) break; 1961 if ( count == 2 ) { 1962 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char); 1963 DEBUG_OPTIMISE_r( 1964 PerlIO_printf(Perl_debug_log, 1965 "%*sNew Start State=%"UVuf" Class: [", 1966 (int)depth * 2 + 2, "", 1967 (UV)state)); 1968 if (idx >= 0) { 1969 SV ** const tmp = av_fetch( revcharmap, idx, 0); 1970 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp ); 1971 1972 TRIE_BITMAP_SET(trie,*ch); 1973 if ( folder ) 1974 TRIE_BITMAP_SET(trie, folder[ *ch ]); 1975 DEBUG_OPTIMISE_r( 1976 PerlIO_printf(Perl_debug_log, (char*)ch) 1977 ); 1978 } 1979 } 1980 TRIE_BITMAP_SET(trie,*ch); 1981 if ( folder ) 1982 TRIE_BITMAP_SET(trie,folder[ *ch ]); 1983 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch)); 1984 } 1985 idx = ofs; 1986 } 1987 } 1988 if ( count == 1 ) { 1989 SV **tmp = av_fetch( revcharmap, idx, 0); 1990 STRLEN len; 1991 char *ch = SvPV( *tmp, len ); 1992 DEBUG_OPTIMISE_r({ 1993 SV *sv=sv_newmortal(); 1994 PerlIO_printf( Perl_debug_log, 1995 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n", 1996 (int)depth * 2 + 2, "", 1997 (UV)state, (UV)idx, 1998 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6, 1999 PL_colors[0], PL_colors[1], 2000 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | 2001 PERL_PV_ESCAPE_FIRSTCHAR 2002 ) 2003 ); 2004 }); 2005 if ( state==1 ) { 2006 OP( convert ) = nodetype; 2007 str=STRING(convert); 2008 STR_LEN(convert)=0; 2009 } 2010 STR_LEN(convert) += len; 2011 while (len--) 2012 *str++ = *ch++; 2013 } else { 2014 #ifdef DEBUGGING 2015 if (state>1) 2016 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n")); 2017 #endif 2018 break; 2019 } 2020 } 2021 if (str) { 2022 regnode *n = convert+NODE_SZ_STR(convert); 2023 NEXT_OFF(convert) = NODE_SZ_STR(convert); 2024 trie->startstate = state; 2025 trie->minlen -= (state - 1); 2026 trie->maxlen -= (state - 1); 2027 #ifdef DEBUGGING 2028 /* At least the UNICOS C compiler choked on this 2029 * being argument to DEBUG_r(), so let's just have 2030 * it right here. */ 2031 if ( 2032 #ifdef PERL_EXT_RE_BUILD 2033 1 2034 #else 2035 DEBUG_r_TEST 2036 #endif 2037 ) { 2038 regnode *fix = convert; 2039 U32 word = trie->wordcount; 2040 mjd_nodelen++; 2041 Set_Node_Offset_Length(convert, mjd_offset, state - 1); 2042 while( ++fix < n ) { 2043 Set_Node_Offset_Length(fix, 0, 0); 2044 } 2045 while (word--) { 2046 SV ** const tmp = av_fetch( trie_words, word, 0 ); 2047 if (tmp) { 2048 if ( STR_LEN(convert) <= SvCUR(*tmp) ) 2049 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert)); 2050 else 2051 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp)); 2052 } 2053 } 2054 } 2055 #endif 2056 if (trie->maxlen) { 2057 convert = n; 2058 } else { 2059 NEXT_OFF(convert) = (U16)(tail - convert); 2060 DEBUG_r(optimize= n); 2061 } 2062 } 2063 } 2064 if (!jumper) 2065 jumper = last; 2066 if ( trie->maxlen ) { 2067 NEXT_OFF( convert ) = (U16)(tail - convert); 2068 ARG_SET( convert, data_slot ); 2069 /* Store the offset to the first unabsorbed branch in 2070 jump[0], which is otherwise unused by the jump logic. 2071 We use this when dumping a trie and during optimisation. */ 2072 if (trie->jump) 2073 trie->jump[0] = (U16)(nextbranch - convert); 2074 2075 /* XXXX */ 2076 if ( !trie->states[trie->startstate].wordnum && trie->bitmap && 2077 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) ) 2078 { 2079 OP( convert ) = TRIEC; 2080 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char); 2081 PerlMemShared_free(trie->bitmap); 2082 trie->bitmap= NULL; 2083 } else 2084 OP( convert ) = TRIE; 2085 2086 /* store the type in the flags */ 2087 convert->flags = nodetype; 2088 DEBUG_r({ 2089 optimize = convert 2090 + NODE_STEP_REGNODE 2091 + regarglen[ OP( convert ) ]; 2092 }); 2093 /* XXX We really should free up the resource in trie now, 2094 as we won't use them - (which resources?) dmq */ 2095 } 2096 /* needed for dumping*/ 2097 DEBUG_r(if (optimize) { 2098 regnode *opt = convert; 2099 2100 while ( ++opt < optimize) { 2101 Set_Node_Offset_Length(opt,0,0); 2102 } 2103 /* 2104 Try to clean up some of the debris left after the 2105 optimisation. 2106 */ 2107 while( optimize < jumper ) { 2108 mjd_nodelen += Node_Length((optimize)); 2109 OP( optimize ) = OPTIMIZED; 2110 Set_Node_Offset_Length(optimize,0,0); 2111 optimize++; 2112 } 2113 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen); 2114 }); 2115 } /* end node insert */ 2116 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap; 2117 #ifdef DEBUGGING 2118 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words; 2119 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap; 2120 #else 2121 SvREFCNT_dec(revcharmap); 2122 #endif 2123 return trie->jump 2124 ? MADE_JUMP_TRIE 2125 : trie->startstate>1 2126 ? MADE_EXACT_TRIE 2127 : MADE_TRIE; 2128 } 2129 2130 STATIC void 2131 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth) 2132 { 2133 /* The Trie is constructed and compressed now so we can build a fail array now if its needed 2134 2135 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the 2136 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88 2137 ISBN 0-201-10088-6 2138 2139 We find the fail state for each state in the trie, this state is the longest proper 2140 suffix of the current states 'word' that is also a proper prefix of another word in our 2141 trie. State 1 represents the word '' and is the thus the default fail state. This allows 2142 the DFA not to have to restart after its tried and failed a word at a given point, it 2143 simply continues as though it had been matching the other word in the first place. 2144 Consider 2145 'abcdgu'=~/abcdefg|cdgu/ 2146 When we get to 'd' we are still matching the first word, we would encounter 'g' which would 2147 fail, which would bring use to the state representing 'd' in the second word where we would 2148 try 'g' and succeed, prodceding to match 'cdgu'. 2149 */ 2150 /* add a fail transition */ 2151 const U32 trie_offset = ARG(source); 2152 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset]; 2153 U32 *q; 2154 const U32 ucharcount = trie->uniquecharcount; 2155 const U32 numstates = trie->statecount; 2156 const U32 ubound = trie->lasttrans + ucharcount; 2157 U32 q_read = 0; 2158 U32 q_write = 0; 2159 U32 charid; 2160 U32 base = trie->states[ 1 ].trans.base; 2161 U32 *fail; 2162 reg_ac_data *aho; 2163 const U32 data_slot = add_data( pRExC_state, 1, "T" ); 2164 GET_RE_DEBUG_FLAGS_DECL; 2165 #ifndef DEBUGGING 2166 PERL_UNUSED_ARG(depth); 2167 #endif 2168 2169 2170 ARG_SET( stclass, data_slot ); 2171 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) ); 2172 RExC_rxi->data->data[ data_slot ] = (void*)aho; 2173 aho->trie=trie_offset; 2174 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) ); 2175 Copy( trie->states, aho->states, numstates, reg_trie_state ); 2176 Newxz( q, numstates, U32); 2177 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) ); 2178 aho->refcount = 1; 2179 fail = aho->fail; 2180 /* initialize fail[0..1] to be 1 so that we always have 2181 a valid final fail state */ 2182 fail[ 0 ] = fail[ 1 ] = 1; 2183 2184 for ( charid = 0; charid < ucharcount ; charid++ ) { 2185 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 ); 2186 if ( newstate ) { 2187 q[ q_write ] = newstate; 2188 /* set to point at the root */ 2189 fail[ q[ q_write++ ] ]=1; 2190 } 2191 } 2192 while ( q_read < q_write) { 2193 const U32 cur = q[ q_read++ % numstates ]; 2194 base = trie->states[ cur ].trans.base; 2195 2196 for ( charid = 0 ; charid < ucharcount ; charid++ ) { 2197 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 ); 2198 if (ch_state) { 2199 U32 fail_state = cur; 2200 U32 fail_base; 2201 do { 2202 fail_state = fail[ fail_state ]; 2203 fail_base = aho->states[ fail_state ].trans.base; 2204 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) ); 2205 2206 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ); 2207 fail[ ch_state ] = fail_state; 2208 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum ) 2209 { 2210 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum; 2211 } 2212 q[ q_write++ % numstates] = ch_state; 2213 } 2214 } 2215 } 2216 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop 2217 when we fail in state 1, this allows us to use the 2218 charclass scan to find a valid start char. This is based on the principle 2219 that theres a good chance the string being searched contains lots of stuff 2220 that cant be a start char. 2221 */ 2222 fail[ 0 ] = fail[ 1 ] = 0; 2223 DEBUG_TRIE_COMPILE_r({ 2224 PerlIO_printf(Perl_debug_log, 2225 "%*sStclass Failtable (%"UVuf" states): 0", 2226 (int)(depth * 2), "", (UV)numstates 2227 ); 2228 for( q_read=1; q_read<numstates; q_read++ ) { 2229 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]); 2230 } 2231 PerlIO_printf(Perl_debug_log, "\n"); 2232 }); 2233 Safefree(q); 2234 /*RExC_seen |= REG_SEEN_TRIEDFA;*/ 2235 } 2236 2237 2238 /* 2239 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2. 2240 * These need to be revisited when a newer toolchain becomes available. 2241 */ 2242 #if defined(__sparc64__) && defined(__GNUC__) 2243 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96) 2244 # undef SPARC64_GCC_WORKAROUND 2245 # define SPARC64_GCC_WORKAROUND 1 2246 # endif 2247 #endif 2248 2249 #define DEBUG_PEEP(str,scan,depth) \ 2250 DEBUG_OPTIMISE_r({if (scan){ \ 2251 SV * const mysv=sv_newmortal(); \ 2252 regnode *Next = regnext(scan); \ 2253 regprop(RExC_rx, mysv, scan); \ 2254 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \ 2255 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\ 2256 Next ? (REG_NODE_NUM(Next)) : 0 ); \ 2257 }}); 2258 2259 2260 2261 2262 2263 #define JOIN_EXACT(scan,min,flags) \ 2264 if (PL_regkind[OP(scan)] == EXACT) \ 2265 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1) 2266 2267 STATIC U32 2268 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) { 2269 /* Merge several consecutive EXACTish nodes into one. */ 2270 regnode *n = regnext(scan); 2271 U32 stringok = 1; 2272 regnode *next = scan + NODE_SZ_STR(scan); 2273 U32 merged = 0; 2274 U32 stopnow = 0; 2275 #ifdef DEBUGGING 2276 regnode *stop = scan; 2277 GET_RE_DEBUG_FLAGS_DECL; 2278 #else 2279 PERL_UNUSED_ARG(depth); 2280 #endif 2281 #ifndef EXPERIMENTAL_INPLACESCAN 2282 PERL_UNUSED_ARG(flags); 2283 PERL_UNUSED_ARG(val); 2284 #endif 2285 DEBUG_PEEP("join",scan,depth); 2286 2287 /* Skip NOTHING, merge EXACT*. */ 2288 while (n && 2289 ( PL_regkind[OP(n)] == NOTHING || 2290 (stringok && (OP(n) == OP(scan)))) 2291 && NEXT_OFF(n) 2292 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) { 2293 2294 if (OP(n) == TAIL || n > next) 2295 stringok = 0; 2296 if (PL_regkind[OP(n)] == NOTHING) { 2297 DEBUG_PEEP("skip:",n,depth); 2298 NEXT_OFF(scan) += NEXT_OFF(n); 2299 next = n + NODE_STEP_REGNODE; 2300 #ifdef DEBUGGING 2301 if (stringok) 2302 stop = n; 2303 #endif 2304 n = regnext(n); 2305 } 2306 else if (stringok) { 2307 const unsigned int oldl = STR_LEN(scan); 2308 regnode * const nnext = regnext(n); 2309 2310 DEBUG_PEEP("merg",n,depth); 2311 2312 merged++; 2313 if (oldl + STR_LEN(n) > U8_MAX) 2314 break; 2315 NEXT_OFF(scan) += NEXT_OFF(n); 2316 STR_LEN(scan) += STR_LEN(n); 2317 next = n + NODE_SZ_STR(n); 2318 /* Now we can overwrite *n : */ 2319 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char); 2320 #ifdef DEBUGGING 2321 stop = next - 1; 2322 #endif 2323 n = nnext; 2324 if (stopnow) break; 2325 } 2326 2327 #ifdef EXPERIMENTAL_INPLACESCAN 2328 if (flags && !NEXT_OFF(n)) { 2329 DEBUG_PEEP("atch", val, depth); 2330 if (reg_off_by_arg[OP(n)]) { 2331 ARG_SET(n, val - n); 2332 } 2333 else { 2334 NEXT_OFF(n) = val - n; 2335 } 2336 stopnow = 1; 2337 } 2338 #endif 2339 } 2340 2341 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) { 2342 /* 2343 Two problematic code points in Unicode casefolding of EXACT nodes: 2344 2345 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS 2346 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS 2347 2348 which casefold to 2349 2350 Unicode UTF-8 2351 2352 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81 2353 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81 2354 2355 This means that in case-insensitive matching (or "loose matching", 2356 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte 2357 length of the above casefolded versions) can match a target string 2358 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0). 2359 This would rather mess up the minimum length computation. 2360 2361 What we'll do is to look for the tail four bytes, and then peek 2362 at the preceding two bytes to see whether we need to decrease 2363 the minimum length by four (six minus two). 2364 2365 Thanks to the design of UTF-8, there cannot be false matches: 2366 A sequence of valid UTF-8 bytes cannot be a subsequence of 2367 another valid sequence of UTF-8 bytes. 2368 2369 */ 2370 char * const s0 = STRING(scan), *s, *t; 2371 char * const s1 = s0 + STR_LEN(scan) - 1; 2372 char * const s2 = s1 - 4; 2373 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */ 2374 const char t0[] = "\xaf\x49\xaf\x42"; 2375 #else 2376 const char t0[] = "\xcc\x88\xcc\x81"; 2377 #endif 2378 const char * const t1 = t0 + 3; 2379 2380 for (s = s0 + 2; 2381 s < s2 && (t = ninstr(s, s1, t0, t1)); 2382 s = t + 4) { 2383 #ifdef EBCDIC 2384 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) || 2385 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5)) 2386 #else 2387 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) || 2388 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF)) 2389 #endif 2390 *min -= 4; 2391 } 2392 } 2393 2394 #ifdef DEBUGGING 2395 /* Allow dumping */ 2396 n = scan + NODE_SZ_STR(scan); 2397 while (n <= stop) { 2398 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) { 2399 OP(n) = OPTIMIZED; 2400 NEXT_OFF(n) = 0; 2401 } 2402 n++; 2403 } 2404 #endif 2405 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)}); 2406 return stopnow; 2407 } 2408 2409 /* REx optimizer. Converts nodes into quickier variants "in place". 2410 Finds fixed substrings. */ 2411 2412 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set 2413 to the position after last scanned or to NULL. */ 2414 2415 #define INIT_AND_WITHP \ 2416 assert(!and_withp); \ 2417 Newx(and_withp,1,struct regnode_charclass_class); \ 2418 SAVEFREEPV(and_withp) 2419 2420 /* this is a chain of data about sub patterns we are processing that 2421 need to be handled seperately/specially in study_chunk. Its so 2422 we can simulate recursion without losing state. */ 2423 struct scan_frame; 2424 typedef struct scan_frame { 2425 regnode *last; /* last node to process in this frame */ 2426 regnode *next; /* next node to process when last is reached */ 2427 struct scan_frame *prev; /*previous frame*/ 2428 I32 stop; /* what stopparen do we use */ 2429 } scan_frame; 2430 2431 2432 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf) 2433 2434 #define CASE_SYNST_FNC(nAmE) \ 2435 case nAmE: \ 2436 if (flags & SCF_DO_STCLASS_AND) { \ 2437 for (value = 0; value < 256; value++) \ 2438 if (!is_ ## nAmE ## _cp(value)) \ 2439 ANYOF_BITMAP_CLEAR(data->start_class, value); \ 2440 } \ 2441 else { \ 2442 for (value = 0; value < 256; value++) \ 2443 if (is_ ## nAmE ## _cp(value)) \ 2444 ANYOF_BITMAP_SET(data->start_class, value); \ 2445 } \ 2446 break; \ 2447 case N ## nAmE: \ 2448 if (flags & SCF_DO_STCLASS_AND) { \ 2449 for (value = 0; value < 256; value++) \ 2450 if (is_ ## nAmE ## _cp(value)) \ 2451 ANYOF_BITMAP_CLEAR(data->start_class, value); \ 2452 } \ 2453 else { \ 2454 for (value = 0; value < 256; value++) \ 2455 if (!is_ ## nAmE ## _cp(value)) \ 2456 ANYOF_BITMAP_SET(data->start_class, value); \ 2457 } \ 2458 break 2459 2460 2461 2462 STATIC I32 2463 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, 2464 I32 *minlenp, I32 *deltap, 2465 regnode *last, 2466 scan_data_t *data, 2467 I32 stopparen, 2468 U8* recursed, 2469 struct regnode_charclass_class *and_withp, 2470 U32 flags, U32 depth) 2471 /* scanp: Start here (read-write). */ 2472 /* deltap: Write maxlen-minlen here. */ 2473 /* last: Stop before this one. */ 2474 /* data: string data about the pattern */ 2475 /* stopparen: treat close N as END */ 2476 /* recursed: which subroutines have we recursed into */ 2477 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */ 2478 { 2479 dVAR; 2480 I32 min = 0, pars = 0, code; 2481 regnode *scan = *scanp, *next; 2482 I32 delta = 0; 2483 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF); 2484 int is_inf_internal = 0; /* The studied chunk is infinite */ 2485 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0; 2486 scan_data_t data_fake; 2487 SV *re_trie_maxbuff = NULL; 2488 regnode *first_non_open = scan; 2489 I32 stopmin = I32_MAX; 2490 scan_frame *frame = NULL; 2491 2492 GET_RE_DEBUG_FLAGS_DECL; 2493 2494 #ifdef DEBUGGING 2495 StructCopy(&zero_scan_data, &data_fake, scan_data_t); 2496 #endif 2497 2498 if ( depth == 0 ) { 2499 while (first_non_open && OP(first_non_open) == OPEN) 2500 first_non_open=regnext(first_non_open); 2501 } 2502 2503 2504 fake_study_recurse: 2505 while ( scan && OP(scan) != END && scan < last ){ 2506 /* Peephole optimizer: */ 2507 DEBUG_STUDYDATA("Peep:", data,depth); 2508 DEBUG_PEEP("Peep",scan,depth); 2509 JOIN_EXACT(scan,&min,0); 2510 2511 /* Follow the next-chain of the current node and optimize 2512 away all the NOTHINGs from it. */ 2513 if (OP(scan) != CURLYX) { 2514 const int max = (reg_off_by_arg[OP(scan)] 2515 ? I32_MAX 2516 /* I32 may be smaller than U16 on CRAYs! */ 2517 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX)); 2518 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan)); 2519 int noff; 2520 regnode *n = scan; 2521 2522 /* Skip NOTHING and LONGJMP. */ 2523 while ((n = regnext(n)) 2524 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n))) 2525 || ((OP(n) == LONGJMP) && (noff = ARG(n)))) 2526 && off + noff < max) 2527 off += noff; 2528 if (reg_off_by_arg[OP(scan)]) 2529 ARG(scan) = off; 2530 else 2531 NEXT_OFF(scan) = off; 2532 } 2533 2534 2535 2536 /* The principal pseudo-switch. Cannot be a switch, since we 2537 look into several different things. */ 2538 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ 2539 || OP(scan) == IFTHEN) { 2540 next = regnext(scan); 2541 code = OP(scan); 2542 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */ 2543 2544 if (OP(next) == code || code == IFTHEN) { 2545 /* NOTE - There is similar code to this block below for handling 2546 TRIE nodes on a re-study. If you change stuff here check there 2547 too. */ 2548 I32 max1 = 0, min1 = I32_MAX, num = 0; 2549 struct regnode_charclass_class accum; 2550 regnode * const startbranch=scan; 2551 2552 if (flags & SCF_DO_SUBSTR) 2553 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */ 2554 if (flags & SCF_DO_STCLASS) 2555 cl_init_zero(pRExC_state, &accum); 2556 2557 while (OP(scan) == code) { 2558 I32 deltanext, minnext, f = 0, fake; 2559 struct regnode_charclass_class this_class; 2560 2561 num++; 2562 data_fake.flags = 0; 2563 if (data) { 2564 data_fake.whilem_c = data->whilem_c; 2565 data_fake.last_closep = data->last_closep; 2566 } 2567 else 2568 data_fake.last_closep = &fake; 2569 2570 data_fake.pos_delta = delta; 2571 next = regnext(scan); 2572 scan = NEXTOPER(scan); 2573 if (code != BRANCH) 2574 scan = NEXTOPER(scan); 2575 if (flags & SCF_DO_STCLASS) { 2576 cl_init(pRExC_state, &this_class); 2577 data_fake.start_class = &this_class; 2578 f = SCF_DO_STCLASS_AND; 2579 } 2580 if (flags & SCF_WHILEM_VISITED_POS) 2581 f |= SCF_WHILEM_VISITED_POS; 2582 2583 /* we suppose the run is continuous, last=next...*/ 2584 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, 2585 next, &data_fake, 2586 stopparen, recursed, NULL, f,depth+1); 2587 if (min1 > minnext) 2588 min1 = minnext; 2589 if (max1 < minnext + deltanext) 2590 max1 = minnext + deltanext; 2591 if (deltanext == I32_MAX) 2592 is_inf = is_inf_internal = 1; 2593 scan = next; 2594 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) 2595 pars++; 2596 if (data_fake.flags & SCF_SEEN_ACCEPT) { 2597 if ( stopmin > minnext) 2598 stopmin = min + min1; 2599 flags &= ~SCF_DO_SUBSTR; 2600 if (data) 2601 data->flags |= SCF_SEEN_ACCEPT; 2602 } 2603 if (data) { 2604 if (data_fake.flags & SF_HAS_EVAL) 2605 data->flags |= SF_HAS_EVAL; 2606 data->whilem_c = data_fake.whilem_c; 2607 } 2608 if (flags & SCF_DO_STCLASS) 2609 cl_or(pRExC_state, &accum, &this_class); 2610 } 2611 if (code == IFTHEN && num < 2) /* Empty ELSE branch */ 2612 min1 = 0; 2613 if (flags & SCF_DO_SUBSTR) { 2614 data->pos_min += min1; 2615 data->pos_delta += max1 - min1; 2616 if (max1 != min1 || is_inf) 2617 data->longest = &(data->longest_float); 2618 } 2619 min += min1; 2620 delta += max1 - min1; 2621 if (flags & SCF_DO_STCLASS_OR) { 2622 cl_or(pRExC_state, data->start_class, &accum); 2623 if (min1) { 2624 cl_and(data->start_class, and_withp); 2625 flags &= ~SCF_DO_STCLASS; 2626 } 2627 } 2628 else if (flags & SCF_DO_STCLASS_AND) { 2629 if (min1) { 2630 cl_and(data->start_class, &accum); 2631 flags &= ~SCF_DO_STCLASS; 2632 } 2633 else { 2634 /* Switch to OR mode: cache the old value of 2635 * data->start_class */ 2636 INIT_AND_WITHP; 2637 StructCopy(data->start_class, and_withp, 2638 struct regnode_charclass_class); 2639 flags &= ~SCF_DO_STCLASS_AND; 2640 StructCopy(&accum, data->start_class, 2641 struct regnode_charclass_class); 2642 flags |= SCF_DO_STCLASS_OR; 2643 data->start_class->flags |= ANYOF_EOS; 2644 } 2645 } 2646 2647 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) { 2648 /* demq. 2649 2650 Assuming this was/is a branch we are dealing with: 'scan' now 2651 points at the item that follows the branch sequence, whatever 2652 it is. We now start at the beginning of the sequence and look 2653 for subsequences of 2654 2655 BRANCH->EXACT=>x1 2656 BRANCH->EXACT=>x2 2657 tail 2658 2659 which would be constructed from a pattern like /A|LIST|OF|WORDS/ 2660 2661 If we can find such a subseqence we need to turn the first 2662 element into a trie and then add the subsequent branch exact 2663 strings to the trie. 2664 2665 We have two cases 2666 2667 1. patterns where the whole set of branch can be converted. 2668 2669 2. patterns where only a subset can be converted. 2670 2671 In case 1 we can replace the whole set with a single regop 2672 for the trie. In case 2 we need to keep the start and end 2673 branchs so 2674 2675 'BRANCH EXACT; BRANCH EXACT; BRANCH X' 2676 becomes BRANCH TRIE; BRANCH X; 2677 2678 There is an additional case, that being where there is a 2679 common prefix, which gets split out into an EXACT like node 2680 preceding the TRIE node. 2681 2682 If x(1..n)==tail then we can do a simple trie, if not we make 2683 a "jump" trie, such that when we match the appropriate word 2684 we "jump" to the appopriate tail node. Essentailly we turn 2685 a nested if into a case structure of sorts. 2686 2687 */ 2688 2689 int made=0; 2690 if (!re_trie_maxbuff) { 2691 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1); 2692 if (!SvIOK(re_trie_maxbuff)) 2693 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT); 2694 } 2695 if ( SvIV(re_trie_maxbuff)>=0 ) { 2696 regnode *cur; 2697 regnode *first = (regnode *)NULL; 2698 regnode *last = (regnode *)NULL; 2699 regnode *tail = scan; 2700 U8 optype = 0; 2701 U32 count=0; 2702 2703 #ifdef DEBUGGING 2704 SV * const mysv = sv_newmortal(); /* for dumping */ 2705 #endif 2706 /* var tail is used because there may be a TAIL 2707 regop in the way. Ie, the exacts will point to the 2708 thing following the TAIL, but the last branch will 2709 point at the TAIL. So we advance tail. If we 2710 have nested (?:) we may have to move through several 2711 tails. 2712 */ 2713 2714 while ( OP( tail ) == TAIL ) { 2715 /* this is the TAIL generated by (?:) */ 2716 tail = regnext( tail ); 2717 } 2718 2719 2720 DEBUG_OPTIMISE_r({ 2721 regprop(RExC_rx, mysv, tail ); 2722 PerlIO_printf( Perl_debug_log, "%*s%s%s\n", 2723 (int)depth * 2 + 2, "", 2724 "Looking for TRIE'able sequences. Tail node is: ", 2725 SvPV_nolen_const( mysv ) 2726 ); 2727 }); 2728 2729 /* 2730 2731 step through the branches, cur represents each 2732 branch, noper is the first thing to be matched 2733 as part of that branch and noper_next is the 2734 regnext() of that node. if noper is an EXACT 2735 and noper_next is the same as scan (our current 2736 position in the regex) then the EXACT branch is 2737 a possible optimization target. Once we have 2738 two or more consequetive such branches we can 2739 create a trie of the EXACT's contents and stich 2740 it in place. If the sequence represents all of 2741 the branches we eliminate the whole thing and 2742 replace it with a single TRIE. If it is a 2743 subsequence then we need to stitch it in. This 2744 means the first branch has to remain, and needs 2745 to be repointed at the item on the branch chain 2746 following the last branch optimized. This could 2747 be either a BRANCH, in which case the 2748 subsequence is internal, or it could be the 2749 item following the branch sequence in which 2750 case the subsequence is at the end. 2751 2752 */ 2753 2754 /* dont use tail as the end marker for this traverse */ 2755 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) { 2756 regnode * const noper = NEXTOPER( cur ); 2757 #if defined(DEBUGGING) || defined(NOJUMPTRIE) 2758 regnode * const noper_next = regnext( noper ); 2759 #endif 2760 2761 DEBUG_OPTIMISE_r({ 2762 regprop(RExC_rx, mysv, cur); 2763 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)", 2764 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) ); 2765 2766 regprop(RExC_rx, mysv, noper); 2767 PerlIO_printf( Perl_debug_log, " -> %s", 2768 SvPV_nolen_const(mysv)); 2769 2770 if ( noper_next ) { 2771 regprop(RExC_rx, mysv, noper_next ); 2772 PerlIO_printf( Perl_debug_log,"\t=> %s\t", 2773 SvPV_nolen_const(mysv)); 2774 } 2775 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n", 2776 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) ); 2777 }); 2778 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype 2779 : PL_regkind[ OP( noper ) ] == EXACT ) 2780 || OP(noper) == NOTHING ) 2781 #ifdef NOJUMPTRIE 2782 && noper_next == tail 2783 #endif 2784 && count < U16_MAX) 2785 { 2786 count++; 2787 if ( !first || optype == NOTHING ) { 2788 if (!first) first = cur; 2789 optype = OP( noper ); 2790 } else { 2791 last = cur; 2792 } 2793 } else { 2794 /* 2795 Currently we assume that the trie can handle unicode and ascii 2796 matches fold cased matches. If this proves true then the following 2797 define will prevent tries in this situation. 2798 2799 #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT) 2800 */ 2801 #define TRIE_TYPE_IS_SAFE 1 2802 if ( last && TRIE_TYPE_IS_SAFE ) { 2803 make_trie( pRExC_state, 2804 startbranch, first, cur, tail, count, 2805 optype, depth+1 ); 2806 } 2807 if ( PL_regkind[ OP( noper ) ] == EXACT 2808 #ifdef NOJUMPTRIE 2809 && noper_next == tail 2810 #endif 2811 ){ 2812 count = 1; 2813 first = cur; 2814 optype = OP( noper ); 2815 } else { 2816 count = 0; 2817 first = NULL; 2818 optype = 0; 2819 } 2820 last = NULL; 2821 } 2822 } 2823 DEBUG_OPTIMISE_r({ 2824 regprop(RExC_rx, mysv, cur); 2825 PerlIO_printf( Perl_debug_log, 2826 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2, 2827 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur)); 2828 2829 }); 2830 2831 if ( last && TRIE_TYPE_IS_SAFE ) { 2832 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 ); 2833 #ifdef TRIE_STUDY_OPT 2834 if ( ((made == MADE_EXACT_TRIE && 2835 startbranch == first) 2836 || ( first_non_open == first )) && 2837 depth==0 ) { 2838 flags |= SCF_TRIE_RESTUDY; 2839 if ( startbranch == first 2840 && scan == tail ) 2841 { 2842 RExC_seen &=~REG_TOP_LEVEL_BRANCHES; 2843 } 2844 } 2845 #endif 2846 } 2847 } 2848 2849 } /* do trie */ 2850 2851 } 2852 else if ( code == BRANCHJ ) { /* single branch is optimized. */ 2853 scan = NEXTOPER(NEXTOPER(scan)); 2854 } else /* single branch is optimized. */ 2855 scan = NEXTOPER(scan); 2856 continue; 2857 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) { 2858 scan_frame *newframe = NULL; 2859 I32 paren; 2860 regnode *start; 2861 regnode *end; 2862 2863 if (OP(scan) != SUSPEND) { 2864 /* set the pointer */ 2865 if (OP(scan) == GOSUB) { 2866 paren = ARG(scan); 2867 RExC_recurse[ARG2L(scan)] = scan; 2868 start = RExC_open_parens[paren-1]; 2869 end = RExC_close_parens[paren-1]; 2870 } else { 2871 paren = 0; 2872 start = RExC_rxi->program + 1; 2873 end = RExC_opend; 2874 } 2875 if (!recursed) { 2876 Newxz(recursed, (((RExC_npar)>>3) +1), U8); 2877 SAVEFREEPV(recursed); 2878 } 2879 if (!PAREN_TEST(recursed,paren+1)) { 2880 PAREN_SET(recursed,paren+1); 2881 Newx(newframe,1,scan_frame); 2882 } else { 2883 if (flags & SCF_DO_SUBSTR) { 2884 SCAN_COMMIT(pRExC_state,data,minlenp); 2885 data->longest = &(data->longest_float); 2886 } 2887 is_inf = is_inf_internal = 1; 2888 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ 2889 cl_anything(pRExC_state, data->start_class); 2890 flags &= ~SCF_DO_STCLASS; 2891 } 2892 } else { 2893 Newx(newframe,1,scan_frame); 2894 paren = stopparen; 2895 start = scan+2; 2896 end = regnext(scan); 2897 } 2898 if (newframe) { 2899 assert(start); 2900 assert(end); 2901 SAVEFREEPV(newframe); 2902 newframe->next = regnext(scan); 2903 newframe->last = last; 2904 newframe->stop = stopparen; 2905 newframe->prev = frame; 2906 2907 frame = newframe; 2908 scan = start; 2909 stopparen = paren; 2910 last = end; 2911 2912 continue; 2913 } 2914 } 2915 else if (OP(scan) == EXACT) { 2916 I32 l = STR_LEN(scan); 2917 UV uc; 2918 if (UTF) { 2919 const U8 * const s = (U8*)STRING(scan); 2920 l = utf8_length(s, s + l); 2921 uc = utf8_to_uvchr(s, NULL); 2922 } else { 2923 uc = *((U8*)STRING(scan)); 2924 } 2925 min += l; 2926 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */ 2927 /* The code below prefers earlier match for fixed 2928 offset, later match for variable offset. */ 2929 if (data->last_end == -1) { /* Update the start info. */ 2930 data->last_start_min = data->pos_min; 2931 data->last_start_max = is_inf 2932 ? I32_MAX : data->pos_min + data->pos_delta; 2933 } 2934 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan)); 2935 if (UTF) 2936 SvUTF8_on(data->last_found); 2937 { 2938 SV * const sv = data->last_found; 2939 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ? 2940 mg_find(sv, PERL_MAGIC_utf8) : NULL; 2941 if (mg && mg->mg_len >= 0) 2942 mg->mg_len += utf8_length((U8*)STRING(scan), 2943 (U8*)STRING(scan)+STR_LEN(scan)); 2944 } 2945 data->last_end = data->pos_min + l; 2946 data->pos_min += l; /* As in the first entry. */ 2947 data->flags &= ~SF_BEFORE_EOL; 2948 } 2949 if (flags & SCF_DO_STCLASS_AND) { 2950 /* Check whether it is compatible with what we know already! */ 2951 int compat = 1; 2952 2953 if (uc >= 0x100 || 2954 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE)) 2955 && !ANYOF_BITMAP_TEST(data->start_class, uc) 2956 && (!(data->start_class->flags & ANYOF_FOLD) 2957 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc]))) 2958 ) 2959 compat = 0; 2960 ANYOF_CLASS_ZERO(data->start_class); 2961 ANYOF_BITMAP_ZERO(data->start_class); 2962 if (compat) 2963 ANYOF_BITMAP_SET(data->start_class, uc); 2964 data->start_class->flags &= ~ANYOF_EOS; 2965 if (uc < 0x100) 2966 data->start_class->flags &= ~ANYOF_UNICODE_ALL; 2967 } 2968 else if (flags & SCF_DO_STCLASS_OR) { 2969 /* false positive possible if the class is case-folded */ 2970 if (uc < 0x100) 2971 ANYOF_BITMAP_SET(data->start_class, uc); 2972 else 2973 data->start_class->flags |= ANYOF_UNICODE_ALL; 2974 data->start_class->flags &= ~ANYOF_EOS; 2975 cl_and(data->start_class, and_withp); 2976 } 2977 flags &= ~SCF_DO_STCLASS; 2978 } 2979 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */ 2980 I32 l = STR_LEN(scan); 2981 UV uc = *((U8*)STRING(scan)); 2982 2983 /* Search for fixed substrings supports EXACT only. */ 2984 if (flags & SCF_DO_SUBSTR) { 2985 assert(data); 2986 SCAN_COMMIT(pRExC_state, data, minlenp); 2987 } 2988 if (UTF) { 2989 const U8 * const s = (U8 *)STRING(scan); 2990 l = utf8_length(s, s + l); 2991 uc = utf8_to_uvchr(s, NULL); 2992 } 2993 min += l; 2994 if (flags & SCF_DO_SUBSTR) 2995 data->pos_min += l; 2996 if (flags & SCF_DO_STCLASS_AND) { 2997 /* Check whether it is compatible with what we know already! */ 2998 int compat = 1; 2999 3000 if (uc >= 0x100 || 3001 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE)) 3002 && !ANYOF_BITMAP_TEST(data->start_class, uc) 3003 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc]))) 3004 compat = 0; 3005 ANYOF_CLASS_ZERO(data->start_class); 3006 ANYOF_BITMAP_ZERO(data->start_class); 3007 if (compat) { 3008 ANYOF_BITMAP_SET(data->start_class, uc); 3009 data->start_class->flags &= ~ANYOF_EOS; 3010 data->start_class->flags |= ANYOF_FOLD; 3011 if (OP(scan) == EXACTFL) 3012 data->start_class->flags |= ANYOF_LOCALE; 3013 } 3014 } 3015 else if (flags & SCF_DO_STCLASS_OR) { 3016 if (data->start_class->flags & ANYOF_FOLD) { 3017 /* false positive possible if the class is case-folded. 3018 Assume that the locale settings are the same... */ 3019 if (uc < 0x100) 3020 ANYOF_BITMAP_SET(data->start_class, uc); 3021 data->start_class->flags &= ~ANYOF_EOS; 3022 } 3023 cl_and(data->start_class, and_withp); 3024 } 3025 flags &= ~SCF_DO_STCLASS; 3026 } 3027 else if (strchr((const char*)PL_varies,OP(scan))) { 3028 I32 mincount, maxcount, minnext, deltanext, fl = 0; 3029 I32 f = flags, pos_before = 0; 3030 regnode * const oscan = scan; 3031 struct regnode_charclass_class this_class; 3032 struct regnode_charclass_class *oclass = NULL; 3033 I32 next_is_eval = 0; 3034 3035 switch (PL_regkind[OP(scan)]) { 3036 case WHILEM: /* End of (?:...)* . */ 3037 scan = NEXTOPER(scan); 3038 goto finish; 3039 case PLUS: 3040 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) { 3041 next = NEXTOPER(scan); 3042 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) { 3043 mincount = 1; 3044 maxcount = REG_INFTY; 3045 next = regnext(scan); 3046 scan = NEXTOPER(scan); 3047 goto do_curly; 3048 } 3049 } 3050 if (flags & SCF_DO_SUBSTR) 3051 data->pos_min++; 3052 min++; 3053 /* Fall through. */ 3054 case STAR: 3055 if (flags & SCF_DO_STCLASS) { 3056 mincount = 0; 3057 maxcount = REG_INFTY; 3058 next = regnext(scan); 3059 scan = NEXTOPER(scan); 3060 goto do_curly; 3061 } 3062 is_inf = is_inf_internal = 1; 3063 scan = regnext(scan); 3064 if (flags & SCF_DO_SUBSTR) { 3065 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */ 3066 data->longest = &(data->longest_float); 3067 } 3068 goto optimize_curly_tail; 3069 case CURLY: 3070 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM) 3071 && (scan->flags == stopparen)) 3072 { 3073 mincount = 1; 3074 maxcount = 1; 3075 } else { 3076 mincount = ARG1(scan); 3077 maxcount = ARG2(scan); 3078 } 3079 next = regnext(scan); 3080 if (OP(scan) == CURLYX) { 3081 I32 lp = (data ? *(data->last_closep) : 0); 3082 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX); 3083 } 3084 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS; 3085 next_is_eval = (OP(scan) == EVAL); 3086 do_curly: 3087 if (flags & SCF_DO_SUBSTR) { 3088 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */ 3089 pos_before = data->pos_min; 3090 } 3091 if (data) { 3092 fl = data->flags; 3093 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL); 3094 if (is_inf) 3095 data->flags |= SF_IS_INF; 3096 } 3097 if (flags & SCF_DO_STCLASS) { 3098 cl_init(pRExC_state, &this_class); 3099 oclass = data->start_class; 3100 data->start_class = &this_class; 3101 f |= SCF_DO_STCLASS_AND; 3102 f &= ~SCF_DO_STCLASS_OR; 3103 } 3104 /* These are the cases when once a subexpression 3105 fails at a particular position, it cannot succeed 3106 even after backtracking at the enclosing scope. 3107 3108 XXXX what if minimal match and we are at the 3109 initial run of {n,m}? */ 3110 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY)) 3111 f &= ~SCF_WHILEM_VISITED_POS; 3112 3113 /* This will finish on WHILEM, setting scan, or on NULL: */ 3114 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, 3115 last, data, stopparen, recursed, NULL, 3116 (mincount == 0 3117 ? (f & ~SCF_DO_SUBSTR) : f),depth+1); 3118 3119 if (flags & SCF_DO_STCLASS) 3120 data->start_class = oclass; 3121 if (mincount == 0 || minnext == 0) { 3122 if (flags & SCF_DO_STCLASS_OR) { 3123 cl_or(pRExC_state, data->start_class, &this_class); 3124 } 3125 else if (flags & SCF_DO_STCLASS_AND) { 3126 /* Switch to OR mode: cache the old value of 3127 * data->start_class */ 3128 INIT_AND_WITHP; 3129 StructCopy(data->start_class, and_withp, 3130 struct regnode_charclass_class); 3131 flags &= ~SCF_DO_STCLASS_AND; 3132 StructCopy(&this_class, data->start_class, 3133 struct regnode_charclass_class); 3134 flags |= SCF_DO_STCLASS_OR; 3135 data->start_class->flags |= ANYOF_EOS; 3136 } 3137 } else { /* Non-zero len */ 3138 if (flags & SCF_DO_STCLASS_OR) { 3139 cl_or(pRExC_state, data->start_class, &this_class); 3140 cl_and(data->start_class, and_withp); 3141 } 3142 else if (flags & SCF_DO_STCLASS_AND) 3143 cl_and(data->start_class, &this_class); 3144 flags &= ~SCF_DO_STCLASS; 3145 } 3146 if (!scan) /* It was not CURLYX, but CURLY. */ 3147 scan = next; 3148 if ( /* ? quantifier ok, except for (?{ ... }) */ 3149 (next_is_eval || !(mincount == 0 && maxcount == 1)) 3150 && (minnext == 0) && (deltanext == 0) 3151 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR)) 3152 && maxcount <= REG_INFTY/3 /* Complement check for big count */ 3153 && ckWARN(WARN_REGEXP)) 3154 { 3155 vWARN(RExC_parse, 3156 "Quantifier unexpected on zero-length expression"); 3157 } 3158 3159 min += minnext * mincount; 3160 is_inf_internal |= ((maxcount == REG_INFTY 3161 && (minnext + deltanext) > 0) 3162 || deltanext == I32_MAX); 3163 is_inf |= is_inf_internal; 3164 delta += (minnext + deltanext) * maxcount - minnext * mincount; 3165 3166 /* Try powerful optimization CURLYX => CURLYN. */ 3167 if ( OP(oscan) == CURLYX && data 3168 && data->flags & SF_IN_PAR 3169 && !(data->flags & SF_HAS_EVAL) 3170 && !deltanext && minnext == 1 ) { 3171 /* Try to optimize to CURLYN. */ 3172 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; 3173 regnode * const nxt1 = nxt; 3174 #ifdef DEBUGGING 3175 regnode *nxt2; 3176 #endif 3177 3178 /* Skip open. */ 3179 nxt = regnext(nxt); 3180 if (!strchr((const char*)PL_simple,OP(nxt)) 3181 && !(PL_regkind[OP(nxt)] == EXACT 3182 && STR_LEN(nxt) == 1)) 3183 goto nogo; 3184 #ifdef DEBUGGING 3185 nxt2 = nxt; 3186 #endif 3187 nxt = regnext(nxt); 3188 if (OP(nxt) != CLOSE) 3189 goto nogo; 3190 if (RExC_open_parens) { 3191 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/ 3192 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/ 3193 } 3194 /* Now we know that nxt2 is the only contents: */ 3195 oscan->flags = (U8)ARG(nxt); 3196 OP(oscan) = CURLYN; 3197 OP(nxt1) = NOTHING; /* was OPEN. */ 3198 3199 #ifdef DEBUGGING 3200 OP(nxt1 + 1) = OPTIMIZED; /* was count. */ 3201 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */ 3202 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */ 3203 OP(nxt) = OPTIMIZED; /* was CLOSE. */ 3204 OP(nxt + 1) = OPTIMIZED; /* was count. */ 3205 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */ 3206 #endif 3207 } 3208 nogo: 3209 3210 /* Try optimization CURLYX => CURLYM. */ 3211 if ( OP(oscan) == CURLYX && data 3212 && !(data->flags & SF_HAS_PAR) 3213 && !(data->flags & SF_HAS_EVAL) 3214 && !deltanext /* atom is fixed width */ 3215 && minnext != 0 /* CURLYM can't handle zero width */ 3216 ) { 3217 /* XXXX How to optimize if data == 0? */ 3218 /* Optimize to a simpler form. */ 3219 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */ 3220 regnode *nxt2; 3221 3222 OP(oscan) = CURLYM; 3223 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/ 3224 && (OP(nxt2) != WHILEM)) 3225 nxt = nxt2; 3226 OP(nxt2) = SUCCEED; /* Whas WHILEM */ 3227 /* Need to optimize away parenths. */ 3228 if (data->flags & SF_IN_PAR) { 3229 /* Set the parenth number. */ 3230 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/ 3231 3232 if (OP(nxt) != CLOSE) 3233 FAIL("Panic opt close"); 3234 oscan->flags = (U8)ARG(nxt); 3235 if (RExC_open_parens) { 3236 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/ 3237 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/ 3238 } 3239 OP(nxt1) = OPTIMIZED; /* was OPEN. */ 3240 OP(nxt) = OPTIMIZED; /* was CLOSE. */ 3241 3242 #ifdef DEBUGGING 3243 OP(nxt1 + 1) = OPTIMIZED; /* was count. */ 3244 OP(nxt + 1) = OPTIMIZED; /* was count. */ 3245 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */ 3246 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */ 3247 #endif 3248 #if 0 3249 while ( nxt1 && (OP(nxt1) != WHILEM)) { 3250 regnode *nnxt = regnext(nxt1); 3251 3252 if (nnxt == nxt) { 3253 if (reg_off_by_arg[OP(nxt1)]) 3254 ARG_SET(nxt1, nxt2 - nxt1); 3255 else if (nxt2 - nxt1 < U16_MAX) 3256 NEXT_OFF(nxt1) = nxt2 - nxt1; 3257 else 3258 OP(nxt) = NOTHING; /* Cannot beautify */ 3259 } 3260 nxt1 = nnxt; 3261 } 3262 #endif 3263 /* Optimize again: */ 3264 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt, 3265 NULL, stopparen, recursed, NULL, 0,depth+1); 3266 } 3267 else 3268 oscan->flags = 0; 3269 } 3270 else if ((OP(oscan) == CURLYX) 3271 && (flags & SCF_WHILEM_VISITED_POS) 3272 /* See the comment on a similar expression above. 3273 However, this time it not a subexpression 3274 we care about, but the expression itself. */ 3275 && (maxcount == REG_INFTY) 3276 && data && ++data->whilem_c < 16) { 3277 /* This stays as CURLYX, we can put the count/of pair. */ 3278 /* Find WHILEM (as in regexec.c) */ 3279 regnode *nxt = oscan + NEXT_OFF(oscan); 3280 3281 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */ 3282 nxt += ARG(nxt); 3283 PREVOPER(nxt)->flags = (U8)(data->whilem_c 3284 | (RExC_whilem_seen << 4)); /* On WHILEM */ 3285 } 3286 if (data && fl & (SF_HAS_PAR|SF_IN_PAR)) 3287 pars++; 3288 if (flags & SCF_DO_SUBSTR) { 3289 SV *last_str = NULL; 3290 int counted = mincount != 0; 3291 3292 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */ 3293 #if defined(SPARC64_GCC_WORKAROUND) 3294 I32 b = 0; 3295 STRLEN l = 0; 3296 const char *s = NULL; 3297 I32 old = 0; 3298 3299 if (pos_before >= data->last_start_min) 3300 b = pos_before; 3301 else 3302 b = data->last_start_min; 3303 3304 l = 0; 3305 s = SvPV_const(data->last_found, l); 3306 old = b - data->last_start_min; 3307 3308 #else 3309 I32 b = pos_before >= data->last_start_min 3310 ? pos_before : data->last_start_min; 3311 STRLEN l; 3312 const char * const s = SvPV_const(data->last_found, l); 3313 I32 old = b - data->last_start_min; 3314 #endif 3315 3316 if (UTF) 3317 old = utf8_hop((U8*)s, old) - (U8*)s; 3318 3319 l -= old; 3320 /* Get the added string: */ 3321 last_str = newSVpvn(s + old, l); 3322 if (UTF) 3323 SvUTF8_on(last_str); 3324 if (deltanext == 0 && pos_before == b) { 3325 /* What was added is a constant string */ 3326 if (mincount > 1) { 3327 SvGROW(last_str, (mincount * l) + 1); 3328 repeatcpy(SvPVX(last_str) + l, 3329 SvPVX_const(last_str), l, mincount - 1); 3330 SvCUR_set(last_str, SvCUR(last_str) * mincount); 3331 /* Add additional parts. */ 3332 SvCUR_set(data->last_found, 3333 SvCUR(data->last_found) - l); 3334 sv_catsv(data->last_found, last_str); 3335 { 3336 SV * sv = data->last_found; 3337 MAGIC *mg = 3338 SvUTF8(sv) && SvMAGICAL(sv) ? 3339 mg_find(sv, PERL_MAGIC_utf8) : NULL; 3340 if (mg && mg->mg_len >= 0) 3341 mg->mg_len += CHR_SVLEN(last_str) - l; 3342 } 3343 data->last_end += l * (mincount - 1); 3344 } 3345 } else { 3346 /* start offset must point into the last copy */ 3347 data->last_start_min += minnext * (mincount - 1); 3348 data->last_start_max += is_inf ? I32_MAX 3349 : (maxcount - 1) * (minnext + data->pos_delta); 3350 } 3351 } 3352 /* It is counted once already... */ 3353 data->pos_min += minnext * (mincount - counted); 3354 data->pos_delta += - counted * deltanext + 3355 (minnext + deltanext) * maxcount - minnext * mincount; 3356 if (mincount != maxcount) { 3357 /* Cannot extend fixed substrings found inside 3358 the group. */ 3359 SCAN_COMMIT(pRExC_state,data,minlenp); 3360 if (mincount && last_str) { 3361 SV * const sv = data->last_found; 3362 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ? 3363 mg_find(sv, PERL_MAGIC_utf8) : NULL; 3364 3365 if (mg) 3366 mg->mg_len = -1; 3367 sv_setsv(sv, last_str); 3368 data->last_end = data->pos_min; 3369 data->last_start_min = 3370 data->pos_min - CHR_SVLEN(last_str); 3371 data->last_start_max = is_inf 3372 ? I32_MAX 3373 : data->pos_min + data->pos_delta 3374 - CHR_SVLEN(last_str); 3375 } 3376 data->longest = &(data->longest_float); 3377 } 3378 SvREFCNT_dec(last_str); 3379 } 3380 if (data && (fl & SF_HAS_EVAL)) 3381 data->flags |= SF_HAS_EVAL; 3382 optimize_curly_tail: 3383 if (OP(oscan) != CURLYX) { 3384 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING 3385 && NEXT_OFF(next)) 3386 NEXT_OFF(oscan) += NEXT_OFF(next); 3387 } 3388 continue; 3389 default: /* REF and CLUMP only? */ 3390 if (flags & SCF_DO_SUBSTR) { 3391 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */ 3392 data->longest = &(data->longest_float); 3393 } 3394 is_inf = is_inf_internal = 1; 3395 if (flags & SCF_DO_STCLASS_OR) 3396 cl_anything(pRExC_state, data->start_class); 3397 flags &= ~SCF_DO_STCLASS; 3398 break; 3399 } 3400 } 3401 else if (OP(scan) == LNBREAK) { 3402 if (flags & SCF_DO_STCLASS) { 3403 int value = 0; 3404 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */ 3405 if (flags & SCF_DO_STCLASS_AND) { 3406 for (value = 0; value < 256; value++) 3407 if (!is_VERTWS_cp(value)) 3408 ANYOF_BITMAP_CLEAR(data->start_class, value); 3409 } 3410 else { 3411 for (value = 0; value < 256; value++) 3412 if (is_VERTWS_cp(value)) 3413 ANYOF_BITMAP_SET(data->start_class, value); 3414 } 3415 if (flags & SCF_DO_STCLASS_OR) 3416 cl_and(data->start_class, and_withp); 3417 flags &= ~SCF_DO_STCLASS; 3418 } 3419 min += 1; 3420 delta += 1; 3421 if (flags & SCF_DO_SUBSTR) { 3422 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */ 3423 data->pos_min += 1; 3424 data->pos_delta += 1; 3425 data->longest = &(data->longest_float); 3426 } 3427 3428 } 3429 else if (OP(scan) == FOLDCHAR) { 3430 int d = ARG(scan)==0xDF ? 1 : 2; 3431 flags &= ~SCF_DO_STCLASS; 3432 min += 1; 3433 delta += d; 3434 if (flags & SCF_DO_SUBSTR) { 3435 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */ 3436 data->pos_min += 1; 3437 data->pos_delta += d; 3438 data->longest = &(data->longest_float); 3439 } 3440 } 3441 else if (strchr((const char*)PL_simple,OP(scan))) { 3442 int value = 0; 3443 3444 if (flags & SCF_DO_SUBSTR) { 3445 SCAN_COMMIT(pRExC_state,data,minlenp); 3446 data->pos_min++; 3447 } 3448 min++; 3449 if (flags & SCF_DO_STCLASS) { 3450 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */ 3451 3452 /* Some of the logic below assumes that switching 3453 locale on will only add false positives. */ 3454 switch (PL_regkind[OP(scan)]) { 3455 case SANY: 3456 default: 3457 do_default: 3458 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */ 3459 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ 3460 cl_anything(pRExC_state, data->start_class); 3461 break; 3462 case REG_ANY: 3463 if (OP(scan) == SANY) 3464 goto do_default; 3465 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */ 3466 value = (ANYOF_BITMAP_TEST(data->start_class,'\n') 3467 || (data->start_class->flags & ANYOF_CLASS)); 3468 cl_anything(pRExC_state, data->start_class); 3469 } 3470 if (flags & SCF_DO_STCLASS_AND || !value) 3471 ANYOF_BITMAP_CLEAR(data->start_class,'\n'); 3472 break; 3473 case ANYOF: 3474 if (flags & SCF_DO_STCLASS_AND) 3475 cl_and(data->start_class, 3476 (struct regnode_charclass_class*)scan); 3477 else 3478 cl_or(pRExC_state, data->start_class, 3479 (struct regnode_charclass_class*)scan); 3480 break; 3481 case ALNUM: 3482 if (flags & SCF_DO_STCLASS_AND) { 3483 if (!(data->start_class->flags & ANYOF_LOCALE)) { 3484 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM); 3485 for (value = 0; value < 256; value++) 3486 if (!isALNUM(value)) 3487 ANYOF_BITMAP_CLEAR(data->start_class, value); 3488 } 3489 } 3490 else { 3491 if (data->start_class->flags & ANYOF_LOCALE) 3492 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM); 3493 else { 3494 for (value = 0; value < 256; value++) 3495 if (isALNUM(value)) 3496 ANYOF_BITMAP_SET(data->start_class, value); 3497 } 3498 } 3499 break; 3500 case ALNUML: 3501 if (flags & SCF_DO_STCLASS_AND) { 3502 if (data->start_class->flags & ANYOF_LOCALE) 3503 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM); 3504 } 3505 else { 3506 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM); 3507 data->start_class->flags |= ANYOF_LOCALE; 3508 } 3509 break; 3510 case NALNUM: 3511 if (flags & SCF_DO_STCLASS_AND) { 3512 if (!(data->start_class->flags & ANYOF_LOCALE)) { 3513 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM); 3514 for (value = 0; value < 256; value++) 3515 if (isALNUM(value)) 3516 ANYOF_BITMAP_CLEAR(data->start_class, value); 3517 } 3518 } 3519 else { 3520 if (data->start_class->flags & ANYOF_LOCALE) 3521 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM); 3522 else { 3523 for (value = 0; value < 256; value++) 3524 if (!isALNUM(value)) 3525 ANYOF_BITMAP_SET(data->start_class, value); 3526 } 3527 } 3528 break; 3529 case NALNUML: 3530 if (flags & SCF_DO_STCLASS_AND) { 3531 if (data->start_class->flags & ANYOF_LOCALE) 3532 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM); 3533 } 3534 else { 3535 data->start_class->flags |= ANYOF_LOCALE; 3536 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM); 3537 } 3538 break; 3539 case SPACE: 3540 if (flags & SCF_DO_STCLASS_AND) { 3541 if (!(data->start_class->flags & ANYOF_LOCALE)) { 3542 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE); 3543 for (value = 0; value < 256; value++) 3544 if (!isSPACE(value)) 3545 ANYOF_BITMAP_CLEAR(data->start_class, value); 3546 } 3547 } 3548 else { 3549 if (data->start_class->flags & ANYOF_LOCALE) 3550 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE); 3551 else { 3552 for (value = 0; value < 256; value++) 3553 if (isSPACE(value)) 3554 ANYOF_BITMAP_SET(data->start_class, value); 3555 } 3556 } 3557 break; 3558 case SPACEL: 3559 if (flags & SCF_DO_STCLASS_AND) { 3560 if (data->start_class->flags & ANYOF_LOCALE) 3561 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE); 3562 } 3563 else { 3564 data->start_class->flags |= ANYOF_LOCALE; 3565 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE); 3566 } 3567 break; 3568 case NSPACE: 3569 if (flags & SCF_DO_STCLASS_AND) { 3570 if (!(data->start_class->flags & ANYOF_LOCALE)) { 3571 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE); 3572 for (value = 0; value < 256; value++) 3573 if (isSPACE(value)) 3574 ANYOF_BITMAP_CLEAR(data->start_class, value); 3575 } 3576 } 3577 else { 3578 if (data->start_class->flags & ANYOF_LOCALE) 3579 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE); 3580 else { 3581 for (value = 0; value < 256; value++) 3582 if (!isSPACE(value)) 3583 ANYOF_BITMAP_SET(data->start_class, value); 3584 } 3585 } 3586 break; 3587 case NSPACEL: 3588 if (flags & SCF_DO_STCLASS_AND) { 3589 if (data->start_class->flags & ANYOF_LOCALE) { 3590 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE); 3591 for (value = 0; value < 256; value++) 3592 if (!isSPACE(value)) 3593 ANYOF_BITMAP_CLEAR(data->start_class, value); 3594 } 3595 } 3596 else { 3597 data->start_class->flags |= ANYOF_LOCALE; 3598 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE); 3599 } 3600 break; 3601 case DIGIT: 3602 if (flags & SCF_DO_STCLASS_AND) { 3603 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT); 3604 for (value = 0; value < 256; value++) 3605 if (!isDIGIT(value)) 3606 ANYOF_BITMAP_CLEAR(data->start_class, value); 3607 } 3608 else { 3609 if (data->start_class->flags & ANYOF_LOCALE) 3610 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT); 3611 else { 3612 for (value = 0; value < 256; value++) 3613 if (isDIGIT(value)) 3614 ANYOF_BITMAP_SET(data->start_class, value); 3615 } 3616 } 3617 break; 3618 case NDIGIT: 3619 if (flags & SCF_DO_STCLASS_AND) { 3620 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT); 3621 for (value = 0; value < 256; value++) 3622 if (isDIGIT(value)) 3623 ANYOF_BITMAP_CLEAR(data->start_class, value); 3624 } 3625 else { 3626 if (data->start_class->flags & ANYOF_LOCALE) 3627 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT); 3628 else { 3629 for (value = 0; value < 256; value++) 3630 if (!isDIGIT(value)) 3631 ANYOF_BITMAP_SET(data->start_class, value); 3632 } 3633 } 3634 break; 3635 CASE_SYNST_FNC(VERTWS); 3636 CASE_SYNST_FNC(HORIZWS); 3637 3638 } 3639 if (flags & SCF_DO_STCLASS_OR) 3640 cl_and(data->start_class, and_withp); 3641 flags &= ~SCF_DO_STCLASS; 3642 } 3643 } 3644 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) { 3645 data->flags |= (OP(scan) == MEOL 3646 ? SF_BEFORE_MEOL 3647 : SF_BEFORE_SEOL); 3648 } 3649 else if ( PL_regkind[OP(scan)] == BRANCHJ 3650 /* Lookbehind, or need to calculate parens/evals/stclass: */ 3651 && (scan->flags || data || (flags & SCF_DO_STCLASS)) 3652 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) { 3653 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY 3654 || OP(scan) == UNLESSM ) 3655 { 3656 /* Negative Lookahead/lookbehind 3657 In this case we can't do fixed string optimisation. 3658 */ 3659 3660 I32 deltanext, minnext, fake = 0; 3661 regnode *nscan; 3662 struct regnode_charclass_class intrnl; 3663 int f = 0; 3664 3665 data_fake.flags = 0; 3666 if (data) { 3667 data_fake.whilem_c = data->whilem_c; 3668 data_fake.last_closep = data->last_closep; 3669 } 3670 else 3671 data_fake.last_closep = &fake; 3672 data_fake.pos_delta = delta; 3673 if ( flags & SCF_DO_STCLASS && !scan->flags 3674 && OP(scan) == IFMATCH ) { /* Lookahead */ 3675 cl_init(pRExC_state, &intrnl); 3676 data_fake.start_class = &intrnl; 3677 f |= SCF_DO_STCLASS_AND; 3678 } 3679 if (flags & SCF_WHILEM_VISITED_POS) 3680 f |= SCF_WHILEM_VISITED_POS; 3681 next = regnext(scan); 3682 nscan = NEXTOPER(NEXTOPER(scan)); 3683 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext, 3684 last, &data_fake, stopparen, recursed, NULL, f, depth+1); 3685 if (scan->flags) { 3686 if (deltanext) { 3687 FAIL("Variable length lookbehind not implemented"); 3688 } 3689 else if (minnext > (I32)U8_MAX) { 3690 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX); 3691 } 3692 scan->flags = (U8)minnext; 3693 } 3694 if (data) { 3695 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) 3696 pars++; 3697 if (data_fake.flags & SF_HAS_EVAL) 3698 data->flags |= SF_HAS_EVAL; 3699 data->whilem_c = data_fake.whilem_c; 3700 } 3701 if (f & SCF_DO_STCLASS_AND) { 3702 const int was = (data->start_class->flags & ANYOF_EOS); 3703 3704 cl_and(data->start_class, &intrnl); 3705 if (was) 3706 data->start_class->flags |= ANYOF_EOS; 3707 } 3708 } 3709 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY 3710 else { 3711 /* Positive Lookahead/lookbehind 3712 In this case we can do fixed string optimisation, 3713 but we must be careful about it. Note in the case of 3714 lookbehind the positions will be offset by the minimum 3715 length of the pattern, something we won't know about 3716 until after the recurse. 3717 */ 3718 I32 deltanext, fake = 0; 3719 regnode *nscan; 3720 struct regnode_charclass_class intrnl; 3721 int f = 0; 3722 /* We use SAVEFREEPV so that when the full compile 3723 is finished perl will clean up the allocated 3724 minlens when its all done. This was we don't 3725 have to worry about freeing them when we know 3726 they wont be used, which would be a pain. 3727 */ 3728 I32 *minnextp; 3729 Newx( minnextp, 1, I32 ); 3730 SAVEFREEPV(minnextp); 3731 3732 if (data) { 3733 StructCopy(data, &data_fake, scan_data_t); 3734 if ((flags & SCF_DO_SUBSTR) && data->last_found) { 3735 f |= SCF_DO_SUBSTR; 3736 if (scan->flags) 3737 SCAN_COMMIT(pRExC_state, &data_fake,minlenp); 3738 data_fake.last_found=newSVsv(data->last_found); 3739 } 3740 } 3741 else 3742 data_fake.last_closep = &fake; 3743 data_fake.flags = 0; 3744 data_fake.pos_delta = delta; 3745 if (is_inf) 3746 data_fake.flags |= SF_IS_INF; 3747 if ( flags & SCF_DO_STCLASS && !scan->flags 3748 && OP(scan) == IFMATCH ) { /* Lookahead */ 3749 cl_init(pRExC_state, &intrnl); 3750 data_fake.start_class = &intrnl; 3751 f |= SCF_DO_STCLASS_AND; 3752 } 3753 if (flags & SCF_WHILEM_VISITED_POS) 3754 f |= SCF_WHILEM_VISITED_POS; 3755 next = regnext(scan); 3756 nscan = NEXTOPER(NEXTOPER(scan)); 3757 3758 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext, 3759 last, &data_fake, stopparen, recursed, NULL, f,depth+1); 3760 if (scan->flags) { 3761 if (deltanext) { 3762 FAIL("Variable length lookbehind not implemented"); 3763 } 3764 else if (*minnextp > (I32)U8_MAX) { 3765 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX); 3766 } 3767 scan->flags = (U8)*minnextp; 3768 } 3769 3770 *minnextp += min; 3771 3772 if (f & SCF_DO_STCLASS_AND) { 3773 const int was = (data->start_class->flags & ANYOF_EOS); 3774 3775 cl_and(data->start_class, &intrnl); 3776 if (was) 3777 data->start_class->flags |= ANYOF_EOS; 3778 } 3779 if (data) { 3780 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) 3781 pars++; 3782 if (data_fake.flags & SF_HAS_EVAL) 3783 data->flags |= SF_HAS_EVAL; 3784 data->whilem_c = data_fake.whilem_c; 3785 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) { 3786 if (RExC_rx->minlen<*minnextp) 3787 RExC_rx->minlen=*minnextp; 3788 SCAN_COMMIT(pRExC_state, &data_fake, minnextp); 3789 SvREFCNT_dec(data_fake.last_found); 3790 3791 if ( data_fake.minlen_fixed != minlenp ) 3792 { 3793 data->offset_fixed= data_fake.offset_fixed; 3794 data->minlen_fixed= data_fake.minlen_fixed; 3795 data->lookbehind_fixed+= scan->flags; 3796 } 3797 if ( data_fake.minlen_float != minlenp ) 3798 { 3799 data->minlen_float= data_fake.minlen_float; 3800 data->offset_float_min=data_fake.offset_float_min; 3801 data->offset_float_max=data_fake.offset_float_max; 3802 data->lookbehind_float+= scan->flags; 3803 } 3804 } 3805 } 3806 3807 3808 } 3809 #endif 3810 } 3811 else if (OP(scan) == OPEN) { 3812 if (stopparen != (I32)ARG(scan)) 3813 pars++; 3814 } 3815 else if (OP(scan) == CLOSE) { 3816 if (stopparen == (I32)ARG(scan)) { 3817 break; 3818 } 3819 if ((I32)ARG(scan) == is_par) { 3820 next = regnext(scan); 3821 3822 if ( next && (OP(next) != WHILEM) && next < last) 3823 is_par = 0; /* Disable optimization */ 3824 } 3825 if (data) 3826 *(data->last_closep) = ARG(scan); 3827 } 3828 else if (OP(scan) == EVAL) { 3829 if (data) 3830 data->flags |= SF_HAS_EVAL; 3831 } 3832 else if ( PL_regkind[OP(scan)] == ENDLIKE ) { 3833 if (flags & SCF_DO_SUBSTR) { 3834 SCAN_COMMIT(pRExC_state,data,minlenp); 3835 flags &= ~SCF_DO_SUBSTR; 3836 } 3837 if (data && OP(scan)==ACCEPT) { 3838 data->flags |= SCF_SEEN_ACCEPT; 3839 if (stopmin > min) 3840 stopmin = min; 3841 } 3842 } 3843 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */ 3844 { 3845 if (flags & SCF_DO_SUBSTR) { 3846 SCAN_COMMIT(pRExC_state,data,minlenp); 3847 data->longest = &(data->longest_float); 3848 } 3849 is_inf = is_inf_internal = 1; 3850 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ 3851 cl_anything(pRExC_state, data->start_class); 3852 flags &= ~SCF_DO_STCLASS; 3853 } 3854 else if (OP(scan) == GPOS) { 3855 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) && 3856 !(delta || is_inf || (data && data->pos_delta))) 3857 { 3858 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR)) 3859 RExC_rx->extflags |= RXf_ANCH_GPOS; 3860 if (RExC_rx->gofs < (U32)min) 3861 RExC_rx->gofs = min; 3862 } else { 3863 RExC_rx->extflags |= RXf_GPOS_FLOAT; 3864 RExC_rx->gofs = 0; 3865 } 3866 } 3867 #ifdef TRIE_STUDY_OPT 3868 #ifdef FULL_TRIE_STUDY 3869 else if (PL_regkind[OP(scan)] == TRIE) { 3870 /* NOTE - There is similar code to this block above for handling 3871 BRANCH nodes on the initial study. If you change stuff here 3872 check there too. */ 3873 regnode *trie_node= scan; 3874 regnode *tail= regnext(scan); 3875 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ]; 3876 I32 max1 = 0, min1 = I32_MAX; 3877 struct regnode_charclass_class accum; 3878 3879 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */ 3880 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */ 3881 if (flags & SCF_DO_STCLASS) 3882 cl_init_zero(pRExC_state, &accum); 3883 3884 if (!trie->jump) { 3885 min1= trie->minlen; 3886 max1= trie->maxlen; 3887 } else { 3888 const regnode *nextbranch= NULL; 3889 U32 word; 3890 3891 for ( word=1 ; word <= trie->wordcount ; word++) 3892 { 3893 I32 deltanext=0, minnext=0, f = 0, fake; 3894 struct regnode_charclass_class this_class; 3895 3896 data_fake.flags = 0; 3897 if (data) { 3898 data_fake.whilem_c = data->whilem_c; 3899 data_fake.last_closep = data->last_closep; 3900 } 3901 else 3902 data_fake.last_closep = &fake; 3903 data_fake.pos_delta = delta; 3904 if (flags & SCF_DO_STCLASS) { 3905 cl_init(pRExC_state, &this_class); 3906 data_fake.start_class = &this_class; 3907 f = SCF_DO_STCLASS_AND; 3908 } 3909 if (flags & SCF_WHILEM_VISITED_POS) 3910 f |= SCF_WHILEM_VISITED_POS; 3911 3912 if (trie->jump[word]) { 3913 if (!nextbranch) 3914 nextbranch = trie_node + trie->jump[0]; 3915 scan= trie_node + trie->jump[word]; 3916 /* We go from the jump point to the branch that follows 3917 it. Note this means we need the vestigal unused branches 3918 even though they arent otherwise used. 3919 */ 3920 minnext = study_chunk(pRExC_state, &scan, minlenp, 3921 &deltanext, (regnode *)nextbranch, &data_fake, 3922 stopparen, recursed, NULL, f,depth+1); 3923 } 3924 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH) 3925 nextbranch= regnext((regnode*)nextbranch); 3926 3927 if (min1 > (I32)(minnext + trie->minlen)) 3928 min1 = minnext + trie->minlen; 3929 if (max1 < (I32)(minnext + deltanext + trie->maxlen)) 3930 max1 = minnext + deltanext + trie->maxlen; 3931 if (deltanext == I32_MAX) 3932 is_inf = is_inf_internal = 1; 3933 3934 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) 3935 pars++; 3936 if (data_fake.flags & SCF_SEEN_ACCEPT) { 3937 if ( stopmin > min + min1) 3938 stopmin = min + min1; 3939 flags &= ~SCF_DO_SUBSTR; 3940 if (data) 3941 data->flags |= SCF_SEEN_ACCEPT; 3942 } 3943 if (data) { 3944 if (data_fake.flags & SF_HAS_EVAL) 3945 data->flags |= SF_HAS_EVAL; 3946 data->whilem_c = data_fake.whilem_c; 3947 } 3948 if (flags & SCF_DO_STCLASS) 3949 cl_or(pRExC_state, &accum, &this_class); 3950 } 3951 } 3952 if (flags & SCF_DO_SUBSTR) { 3953 data->pos_min += min1; 3954 data->pos_delta += max1 - min1; 3955 if (max1 != min1 || is_inf) 3956 data->longest = &(data->longest_float); 3957 } 3958 min += min1; 3959 delta += max1 - min1; 3960 if (flags & SCF_DO_STCLASS_OR) { 3961 cl_or(pRExC_state, data->start_class, &accum); 3962 if (min1) { 3963 cl_and(data->start_class, and_withp); 3964 flags &= ~SCF_DO_STCLASS; 3965 } 3966 } 3967 else if (flags & SCF_DO_STCLASS_AND) { 3968 if (min1) { 3969 cl_and(data->start_class, &accum); 3970 flags &= ~SCF_DO_STCLASS; 3971 } 3972 else { 3973 /* Switch to OR mode: cache the old value of 3974 * data->start_class */ 3975 INIT_AND_WITHP; 3976 StructCopy(data->start_class, and_withp, 3977 struct regnode_charclass_class); 3978 flags &= ~SCF_DO_STCLASS_AND; 3979 StructCopy(&accum, data->start_class, 3980 struct regnode_charclass_class); 3981 flags |= SCF_DO_STCLASS_OR; 3982 data->start_class->flags |= ANYOF_EOS; 3983 } 3984 } 3985 scan= tail; 3986 continue; 3987 } 3988 #else 3989 else if (PL_regkind[OP(scan)] == TRIE) { 3990 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ]; 3991 U8*bang=NULL; 3992 3993 min += trie->minlen; 3994 delta += (trie->maxlen - trie->minlen); 3995 flags &= ~SCF_DO_STCLASS; /* xxx */ 3996 if (flags & SCF_DO_SUBSTR) { 3997 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */ 3998 data->pos_min += trie->minlen; 3999 data->pos_delta += (trie->maxlen - trie->minlen); 4000 if (trie->maxlen != trie->minlen) 4001 data->longest = &(data->longest_float); 4002 } 4003 if (trie->jump) /* no more substrings -- for now /grr*/ 4004 flags &= ~SCF_DO_SUBSTR; 4005 } 4006 #endif /* old or new */ 4007 #endif /* TRIE_STUDY_OPT */ 4008 4009 /* Else: zero-length, ignore. */ 4010 scan = regnext(scan); 4011 } 4012 if (frame) { 4013 last = frame->last; 4014 scan = frame->next; 4015 stopparen = frame->stop; 4016 frame = frame->prev; 4017 goto fake_study_recurse; 4018 } 4019 4020 finish: 4021 assert(!frame); 4022 DEBUG_STUDYDATA("pre-fin:",data,depth); 4023 4024 *scanp = scan; 4025 *deltap = is_inf_internal ? I32_MAX : delta; 4026 if (flags & SCF_DO_SUBSTR && is_inf) 4027 data->pos_delta = I32_MAX - data->pos_min; 4028 if (is_par > (I32)U8_MAX) 4029 is_par = 0; 4030 if (is_par && pars==1 && data) { 4031 data->flags |= SF_IN_PAR; 4032 data->flags &= ~SF_HAS_PAR; 4033 } 4034 else if (pars && data) { 4035 data->flags |= SF_HAS_PAR; 4036 data->flags &= ~SF_IN_PAR; 4037 } 4038 if (flags & SCF_DO_STCLASS_OR) 4039 cl_and(data->start_class, and_withp); 4040 if (flags & SCF_TRIE_RESTUDY) 4041 data->flags |= SCF_TRIE_RESTUDY; 4042 4043 DEBUG_STUDYDATA("post-fin:",data,depth); 4044 4045 return min < stopmin ? min : stopmin; 4046 } 4047 4048 STATIC U32 4049 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s) 4050 { 4051 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0; 4052 4053 Renewc(RExC_rxi->data, 4054 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1), 4055 char, struct reg_data); 4056 if(count) 4057 Renew(RExC_rxi->data->what, count + n, U8); 4058 else 4059 Newx(RExC_rxi->data->what, n, U8); 4060 RExC_rxi->data->count = count + n; 4061 Copy(s, RExC_rxi->data->what + count, n, U8); 4062 return count; 4063 } 4064 4065 /*XXX: todo make this not included in a non debugging perl */ 4066 #ifndef PERL_IN_XSUB_RE 4067 void 4068 Perl_reginitcolors(pTHX) 4069 { 4070 dVAR; 4071 const char * const s = PerlEnv_getenv("PERL_RE_COLORS"); 4072 if (s) { 4073 char *t = savepv(s); 4074 int i = 0; 4075 PL_colors[0] = t; 4076 while (++i < 6) { 4077 t = strchr(t, '\t'); 4078 if (t) { 4079 *t = '\0'; 4080 PL_colors[i] = ++t; 4081 } 4082 else 4083 PL_colors[i] = t = (char *)""; 4084 } 4085 } else { 4086 int i = 0; 4087 while (i < 6) 4088 PL_colors[i++] = (char *)""; 4089 } 4090 PL_colorset = 1; 4091 } 4092 #endif 4093 4094 4095 #ifdef TRIE_STUDY_OPT 4096 #define CHECK_RESTUDY_GOTO \ 4097 if ( \ 4098 (data.flags & SCF_TRIE_RESTUDY) \ 4099 && ! restudied++ \ 4100 ) goto reStudy 4101 #else 4102 #define CHECK_RESTUDY_GOTO 4103 #endif 4104 4105 /* 4106 - pregcomp - compile a regular expression into internal code 4107 * 4108 * We can't allocate space until we know how big the compiled form will be, 4109 * but we can't compile it (and thus know how big it is) until we've got a 4110 * place to put the code. So we cheat: we compile it twice, once with code 4111 * generation turned off and size counting turned on, and once "for real". 4112 * This also means that we don't allocate space until we are sure that the 4113 * thing really will compile successfully, and we never have to move the 4114 * code and thus invalidate pointers into it. (Note that it has to be in 4115 * one piece because free() must be able to free it all.) [NB: not true in perl] 4116 * 4117 * Beware that the optimization-preparation code in here knows about some 4118 * of the structure of the compiled regexp. [I'll say.] 4119 */ 4120 4121 4122 4123 #ifndef PERL_IN_XSUB_RE 4124 #define RE_ENGINE_PTR &PL_core_reg_engine 4125 #else 4126 extern const struct regexp_engine my_reg_engine; 4127 #define RE_ENGINE_PTR &my_reg_engine 4128 #endif 4129 4130 #ifndef PERL_IN_XSUB_RE 4131 REGEXP * 4132 Perl_pregcomp(pTHX_ const SV * const pattern, const U32 flags) 4133 { 4134 dVAR; 4135 HV * const table = GvHV(PL_hintgv); 4136 /* Dispatch a request to compile a regexp to correct 4137 regexp engine. */ 4138 if (table) { 4139 SV **ptr= hv_fetchs(table, "regcomp", FALSE); 4140 GET_RE_DEBUG_FLAGS_DECL; 4141 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) { 4142 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr)); 4143 DEBUG_COMPILE_r({ 4144 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n", 4145 SvIV(*ptr)); 4146 }); 4147 return CALLREGCOMP_ENG(eng, pattern, flags); 4148 } 4149 } 4150 return Perl_re_compile(aTHX_ pattern, flags); 4151 } 4152 #endif 4153 4154 REGEXP * 4155 Perl_re_compile(pTHX_ const SV * const pattern, const U32 pm_flags) 4156 { 4157 dVAR; 4158 register REGEXP *r; 4159 register regexp_internal *ri; 4160 STRLEN plen; 4161 char* exp = SvPV((SV*)pattern, plen); 4162 char* xend = exp + plen; 4163 regnode *scan; 4164 I32 flags; 4165 I32 minlen = 0; 4166 I32 sawplus = 0; 4167 I32 sawopen = 0; 4168 scan_data_t data; 4169 RExC_state_t RExC_state; 4170 RExC_state_t * const pRExC_state = &RExC_state; 4171 #ifdef TRIE_STUDY_OPT 4172 int restudied= 0; 4173 RExC_state_t copyRExC_state; 4174 #endif 4175 GET_RE_DEBUG_FLAGS_DECL; 4176 DEBUG_r(if (!PL_colorset) reginitcolors()); 4177 4178 RExC_utf8 = RExC_orig_utf8 = pm_flags & RXf_UTF8; 4179 4180 DEBUG_COMPILE_r({ 4181 SV *dsv= sv_newmortal(); 4182 RE_PV_QUOTED_DECL(s, RExC_utf8, 4183 dsv, exp, plen, 60); 4184 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n", 4185 PL_colors[4],PL_colors[5],s); 4186 }); 4187 4188 redo_first_pass: 4189 RExC_precomp = exp; 4190 RExC_flags = pm_flags; 4191 RExC_sawback = 0; 4192 4193 RExC_seen = 0; 4194 RExC_seen_zerolen = *exp == '^' ? -1 : 0; 4195 RExC_seen_evals = 0; 4196 RExC_extralen = 0; 4197 4198 /* First pass: determine size, legality. */ 4199 RExC_parse = exp; 4200 RExC_start = exp; 4201 RExC_end = xend; 4202 RExC_naughty = 0; 4203 RExC_npar = 1; 4204 RExC_nestroot = 0; 4205 RExC_size = 0L; 4206 RExC_emit = &PL_regdummy; 4207 RExC_whilem_seen = 0; 4208 RExC_charnames = NULL; 4209 RExC_open_parens = NULL; 4210 RExC_close_parens = NULL; 4211 RExC_opend = NULL; 4212 RExC_paren_names = NULL; 4213 #ifdef DEBUGGING 4214 RExC_paren_name_list = NULL; 4215 #endif 4216 RExC_recurse = NULL; 4217 RExC_recurse_count = 0; 4218 4219 #if 0 /* REGC() is (currently) a NOP at the first pass. 4220 * Clever compilers notice this and complain. --jhi */ 4221 REGC((U8)REG_MAGIC, (char*)RExC_emit); 4222 #endif 4223 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n")); 4224 if (reg(pRExC_state, 0, &flags,1) == NULL) { 4225 RExC_precomp = NULL; 4226 return(NULL); 4227 } 4228 if (RExC_utf8 && !RExC_orig_utf8) { 4229 /* It's possible to write a regexp in ascii that represents Unicode 4230 codepoints outside of the byte range, such as via \x{100}. If we 4231 detect such a sequence we have to convert the entire pattern to utf8 4232 and then recompile, as our sizing calculation will have been based 4233 on 1 byte == 1 character, but we will need to use utf8 to encode 4234 at least some part of the pattern, and therefore must convert the whole 4235 thing. 4236 XXX: somehow figure out how to make this less expensive... 4237 -- dmq */ 4238 STRLEN len = plen; 4239 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, 4240 "UTF8 mismatch! Converting to utf8 for resizing and compile\n")); 4241 exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len); 4242 xend = exp + len; 4243 RExC_orig_utf8 = RExC_utf8; 4244 SAVEFREEPV(exp); 4245 goto redo_first_pass; 4246 } 4247 DEBUG_PARSE_r({ 4248 PerlIO_printf(Perl_debug_log, 4249 "Required size %"IVdf" nodes\n" 4250 "Starting second pass (creation)\n", 4251 (IV)RExC_size); 4252 RExC_lastnum=0; 4253 RExC_lastparse=NULL; 4254 }); 4255 /* Small enough for pointer-storage convention? 4256 If extralen==0, this means that we will not need long jumps. */ 4257 if (RExC_size >= 0x10000L && RExC_extralen) 4258 RExC_size += RExC_extralen; 4259 else 4260 RExC_extralen = 0; 4261 if (RExC_whilem_seen > 15) 4262 RExC_whilem_seen = 15; 4263 4264 /* Allocate space and zero-initialize. Note, the two step process 4265 of zeroing when in debug mode, thus anything assigned has to 4266 happen after that */ 4267 Newxz(r, 1, regexp); 4268 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), 4269 char, regexp_internal); 4270 if ( r == NULL || ri == NULL ) 4271 FAIL("Regexp out of space"); 4272 #ifdef DEBUGGING 4273 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */ 4274 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char); 4275 #else 4276 /* bulk initialize base fields with 0. */ 4277 Zero(ri, sizeof(regexp_internal), char); 4278 #endif 4279 4280 /* non-zero initialization begins here */ 4281 RXi_SET( r, ri ); 4282 r->engine= RE_ENGINE_PTR; 4283 r->refcnt = 1; 4284 r->prelen = plen; 4285 r->extflags = pm_flags; 4286 { 4287 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY); 4288 bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD); 4289 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT); 4290 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD) >> 12); 4291 const char *fptr = STD_PAT_MODS; /*"msix"*/ 4292 char *p; 4293 r->wraplen = r->prelen + has_minus + has_p + has_runon 4294 + (sizeof(STD_PAT_MODS) - 1) 4295 + (sizeof("(?:)") - 1); 4296 4297 Newx(r->wrapped, r->wraplen + 1, char ); 4298 p = r->wrapped; 4299 *p++='('; *p++='?'; 4300 if (has_p) 4301 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/ 4302 { 4303 char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1; 4304 char *colon = r + 1; 4305 char ch; 4306 4307 while((ch = *fptr++)) { 4308 if(reganch & 1) 4309 *p++ = ch; 4310 else 4311 *r-- = ch; 4312 reganch >>= 1; 4313 } 4314 if(has_minus) { 4315 *r = '-'; 4316 p = colon; 4317 } 4318 } 4319 4320 *p++ = ':'; 4321 Copy(RExC_precomp, p, r->prelen, char); 4322 r->precomp = p; 4323 p += r->prelen; 4324 if (has_runon) 4325 *p++ = '\n'; 4326 *p++ = ')'; 4327 *p = 0; 4328 } 4329 4330 r->intflags = 0; 4331 r->nparens = RExC_npar - 1; /* set early to validate backrefs */ 4332 4333 if (RExC_seen & REG_SEEN_RECURSE) { 4334 Newxz(RExC_open_parens, RExC_npar,regnode *); 4335 SAVEFREEPV(RExC_open_parens); 4336 Newxz(RExC_close_parens,RExC_npar,regnode *); 4337 SAVEFREEPV(RExC_close_parens); 4338 } 4339 4340 /* Useful during FAIL. */ 4341 #ifdef RE_TRACK_PATTERN_OFFSETS 4342 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */ 4343 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log, 4344 "%s %"UVuf" bytes for offset annotations.\n", 4345 ri->u.offsets ? "Got" : "Couldn't get", 4346 (UV)((2*RExC_size+1) * sizeof(U32)))); 4347 #endif 4348 SetProgLen(ri,RExC_size); 4349 RExC_rx = r; 4350 RExC_rxi = ri; 4351 4352 /* Second pass: emit code. */ 4353 RExC_flags = pm_flags; /* don't let top level (?i) bleed */ 4354 RExC_parse = exp; 4355 RExC_end = xend; 4356 RExC_naughty = 0; 4357 RExC_npar = 1; 4358 RExC_emit_start = ri->program; 4359 RExC_emit = ri->program; 4360 RExC_emit_bound = ri->program + RExC_size + 1; 4361 4362 /* Store the count of eval-groups for security checks: */ 4363 RExC_rx->seen_evals = RExC_seen_evals; 4364 REGC((U8)REG_MAGIC, (char*) RExC_emit++); 4365 if (reg(pRExC_state, 0, &flags,1) == NULL) { 4366 ReREFCNT_dec(r); 4367 return(NULL); 4368 } 4369 /* XXXX To minimize changes to RE engine we always allocate 4370 3-units-long substrs field. */ 4371 Newx(r->substrs, 1, struct reg_substr_data); 4372 if (RExC_recurse_count) { 4373 Newxz(RExC_recurse,RExC_recurse_count,regnode *); 4374 SAVEFREEPV(RExC_recurse); 4375 } 4376 4377 reStudy: 4378 r->minlen = minlen = sawplus = sawopen = 0; 4379 Zero(r->substrs, 1, struct reg_substr_data); 4380 4381 #ifdef TRIE_STUDY_OPT 4382 if ( restudied ) { 4383 U32 seen=RExC_seen; 4384 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n")); 4385 4386 RExC_state = copyRExC_state; 4387 if (seen & REG_TOP_LEVEL_BRANCHES) 4388 RExC_seen |= REG_TOP_LEVEL_BRANCHES; 4389 else 4390 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES; 4391 if (data.last_found) { 4392 SvREFCNT_dec(data.longest_fixed); 4393 SvREFCNT_dec(data.longest_float); 4394 SvREFCNT_dec(data.last_found); 4395 } 4396 StructCopy(&zero_scan_data, &data, scan_data_t); 4397 } else { 4398 StructCopy(&zero_scan_data, &data, scan_data_t); 4399 copyRExC_state = RExC_state; 4400 } 4401 #else 4402 StructCopy(&zero_scan_data, &data, scan_data_t); 4403 #endif 4404 4405 /* Dig out information for optimizations. */ 4406 r->extflags = RExC_flags; /* was pm_op */ 4407 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */ 4408 4409 if (UTF) 4410 r->extflags |= RXf_UTF8; /* Unicode in it? */ 4411 ri->regstclass = NULL; 4412 if (RExC_naughty >= 10) /* Probably an expensive pattern. */ 4413 r->intflags |= PREGf_NAUGHTY; 4414 scan = ri->program + 1; /* First BRANCH. */ 4415 4416 /* testing for BRANCH here tells us whether there is "must appear" 4417 data in the pattern. If there is then we can use it for optimisations */ 4418 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */ 4419 I32 fake; 4420 STRLEN longest_float_length, longest_fixed_length; 4421 struct regnode_charclass_class ch_class; /* pointed to by data */ 4422 int stclass_flag; 4423 I32 last_close = 0; /* pointed to by data */ 4424 regnode *first= scan; 4425 regnode *first_next= regnext(first); 4426 4427 /* Skip introductions and multiplicators >= 1. */ 4428 while ((OP(first) == OPEN && (sawopen = 1)) || 4429 /* An OR of *one* alternative - should not happen now. */ 4430 (OP(first) == BRANCH && OP(first_next) != BRANCH) || 4431 /* for now we can't handle lookbehind IFMATCH*/ 4432 (OP(first) == IFMATCH && !first->flags) || 4433 (OP(first) == PLUS) || 4434 (OP(first) == MINMOD) || 4435 /* An {n,m} with n>0 */ 4436 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) || 4437 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END )) 4438 { 4439 4440 if (OP(first) == PLUS) 4441 sawplus = 1; 4442 else 4443 first += regarglen[OP(first)]; 4444 if (OP(first) == IFMATCH) { 4445 first = NEXTOPER(first); 4446 first += EXTRA_STEP_2ARGS; 4447 } else /* XXX possible optimisation for /(?=)/ */ 4448 first = NEXTOPER(first); 4449 first_next= regnext(first); 4450 } 4451 4452 /* Starting-point info. */ 4453 again: 4454 DEBUG_PEEP("first:",first,0); 4455 /* Ignore EXACT as we deal with it later. */ 4456 if (PL_regkind[OP(first)] == EXACT) { 4457 if (OP(first) == EXACT) 4458 NOOP; /* Empty, get anchored substr later. */ 4459 else if ((OP(first) == EXACTF || OP(first) == EXACTFL)) 4460 ri->regstclass = first; 4461 } 4462 #ifdef TRIE_STCLASS 4463 else if (PL_regkind[OP(first)] == TRIE && 4464 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0) 4465 { 4466 regnode *trie_op; 4467 /* this can happen only on restudy */ 4468 if ( OP(first) == TRIE ) { 4469 struct regnode_1 *trieop = (struct regnode_1 *) 4470 PerlMemShared_calloc(1, sizeof(struct regnode_1)); 4471 StructCopy(first,trieop,struct regnode_1); 4472 trie_op=(regnode *)trieop; 4473 } else { 4474 struct regnode_charclass *trieop = (struct regnode_charclass *) 4475 PerlMemShared_calloc(1, sizeof(struct regnode_charclass)); 4476 StructCopy(first,trieop,struct regnode_charclass); 4477 trie_op=(regnode *)trieop; 4478 } 4479 OP(trie_op)+=2; 4480 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0); 4481 ri->regstclass = trie_op; 4482 } 4483 #endif 4484 else if (strchr((const char*)PL_simple,OP(first))) 4485 ri->regstclass = first; 4486 else if (PL_regkind[OP(first)] == BOUND || 4487 PL_regkind[OP(first)] == NBOUND) 4488 ri->regstclass = first; 4489 else if (PL_regkind[OP(first)] == BOL) { 4490 r->extflags |= (OP(first) == MBOL 4491 ? RXf_ANCH_MBOL 4492 : (OP(first) == SBOL 4493 ? RXf_ANCH_SBOL 4494 : RXf_ANCH_BOL)); 4495 first = NEXTOPER(first); 4496 goto again; 4497 } 4498 else if (OP(first) == GPOS) { 4499 r->extflags |= RXf_ANCH_GPOS; 4500 first = NEXTOPER(first); 4501 goto again; 4502 } 4503 else if ((!sawopen || !RExC_sawback) && 4504 (OP(first) == STAR && 4505 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) && 4506 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL)) 4507 { 4508 /* turn .* into ^.* with an implied $*=1 */ 4509 const int type = 4510 (OP(NEXTOPER(first)) == REG_ANY) 4511 ? RXf_ANCH_MBOL 4512 : RXf_ANCH_SBOL; 4513 r->extflags |= type; 4514 r->intflags |= PREGf_IMPLICIT; 4515 first = NEXTOPER(first); 4516 goto again; 4517 } 4518 if (sawplus && (!sawopen || !RExC_sawback) 4519 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */ 4520 /* x+ must match at the 1st pos of run of x's */ 4521 r->intflags |= PREGf_SKIP; 4522 4523 /* Scan is after the zeroth branch, first is atomic matcher. */ 4524 #ifdef TRIE_STUDY_OPT 4525 DEBUG_PARSE_r( 4526 if (!restudied) 4527 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n", 4528 (IV)(first - scan + 1)) 4529 ); 4530 #else 4531 DEBUG_PARSE_r( 4532 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n", 4533 (IV)(first - scan + 1)) 4534 ); 4535 #endif 4536 4537 4538 /* 4539 * If there's something expensive in the r.e., find the 4540 * longest literal string that must appear and make it the 4541 * regmust. Resolve ties in favor of later strings, since 4542 * the regstart check works with the beginning of the r.e. 4543 * and avoiding duplication strengthens checking. Not a 4544 * strong reason, but sufficient in the absence of others. 4545 * [Now we resolve ties in favor of the earlier string if 4546 * it happens that c_offset_min has been invalidated, since the 4547 * earlier string may buy us something the later one won't.] 4548 */ 4549 4550 data.longest_fixed = newSVpvs(""); 4551 data.longest_float = newSVpvs(""); 4552 data.last_found = newSVpvs(""); 4553 data.longest = &(data.longest_fixed); 4554 first = scan; 4555 if (!ri->regstclass) { 4556 cl_init(pRExC_state, &ch_class); 4557 data.start_class = &ch_class; 4558 stclass_flag = SCF_DO_STCLASS_AND; 4559 } else /* XXXX Check for BOUND? */ 4560 stclass_flag = 0; 4561 data.last_closep = &last_close; 4562 4563 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */ 4564 &data, -1, NULL, NULL, 4565 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0); 4566 4567 4568 CHECK_RESTUDY_GOTO; 4569 4570 4571 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed) 4572 && data.last_start_min == 0 && data.last_end > 0 4573 && !RExC_seen_zerolen 4574 && !(RExC_seen & REG_SEEN_VERBARG) 4575 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS))) 4576 r->extflags |= RXf_CHECK_ALL; 4577 scan_commit(pRExC_state, &data,&minlen,0); 4578 SvREFCNT_dec(data.last_found); 4579 4580 /* Note that code very similar to this but for anchored string 4581 follows immediately below, changes may need to be made to both. 4582 Be careful. 4583 */ 4584 longest_float_length = CHR_SVLEN(data.longest_float); 4585 if (longest_float_length 4586 || (data.flags & SF_FL_BEFORE_EOL 4587 && (!(data.flags & SF_FL_BEFORE_MEOL) 4588 || (RExC_flags & RXf_PMf_MULTILINE)))) 4589 { 4590 I32 t,ml; 4591 4592 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */ 4593 && data.offset_fixed == data.offset_float_min 4594 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)) 4595 goto remove_float; /* As in (a)+. */ 4596 4597 /* copy the information about the longest float from the reg_scan_data 4598 over to the program. */ 4599 if (SvUTF8(data.longest_float)) { 4600 r->float_utf8 = data.longest_float; 4601 r->float_substr = NULL; 4602 } else { 4603 r->float_substr = data.longest_float; 4604 r->float_utf8 = NULL; 4605 } 4606 /* float_end_shift is how many chars that must be matched that 4607 follow this item. We calculate it ahead of time as once the 4608 lookbehind offset is added in we lose the ability to correctly 4609 calculate it.*/ 4610 ml = data.minlen_float ? *(data.minlen_float) 4611 : (I32)longest_float_length; 4612 r->float_end_shift = ml - data.offset_float_min 4613 - longest_float_length + (SvTAIL(data.longest_float) != 0) 4614 + data.lookbehind_float; 4615 r->float_min_offset = data.offset_float_min - data.lookbehind_float; 4616 r->float_max_offset = data.offset_float_max; 4617 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */ 4618 r->float_max_offset -= data.lookbehind_float; 4619 4620 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */ 4621 && (!(data.flags & SF_FL_BEFORE_MEOL) 4622 || (RExC_flags & RXf_PMf_MULTILINE))); 4623 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0); 4624 } 4625 else { 4626 remove_float: 4627 r->float_substr = r->float_utf8 = NULL; 4628 SvREFCNT_dec(data.longest_float); 4629 longest_float_length = 0; 4630 } 4631 4632 /* Note that code very similar to this but for floating string 4633 is immediately above, changes may need to be made to both. 4634 Be careful. 4635 */ 4636 longest_fixed_length = CHR_SVLEN(data.longest_fixed); 4637 if (longest_fixed_length 4638 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */ 4639 && (!(data.flags & SF_FIX_BEFORE_MEOL) 4640 || (RExC_flags & RXf_PMf_MULTILINE)))) 4641 { 4642 I32 t,ml; 4643 4644 /* copy the information about the longest fixed 4645 from the reg_scan_data over to the program. */ 4646 if (SvUTF8(data.longest_fixed)) { 4647 r->anchored_utf8 = data.longest_fixed; 4648 r->anchored_substr = NULL; 4649 } else { 4650 r->anchored_substr = data.longest_fixed; 4651 r->anchored_utf8 = NULL; 4652 } 4653 /* fixed_end_shift is how many chars that must be matched that 4654 follow this item. We calculate it ahead of time as once the 4655 lookbehind offset is added in we lose the ability to correctly 4656 calculate it.*/ 4657 ml = data.minlen_fixed ? *(data.minlen_fixed) 4658 : (I32)longest_fixed_length; 4659 r->anchored_end_shift = ml - data.offset_fixed 4660 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0) 4661 + data.lookbehind_fixed; 4662 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed; 4663 4664 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */ 4665 && (!(data.flags & SF_FIX_BEFORE_MEOL) 4666 || (RExC_flags & RXf_PMf_MULTILINE))); 4667 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0); 4668 } 4669 else { 4670 r->anchored_substr = r->anchored_utf8 = NULL; 4671 SvREFCNT_dec(data.longest_fixed); 4672 longest_fixed_length = 0; 4673 } 4674 if (ri->regstclass 4675 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY)) 4676 ri->regstclass = NULL; 4677 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset) 4678 && stclass_flag 4679 && !(data.start_class->flags & ANYOF_EOS) 4680 && !cl_is_anything(data.start_class)) 4681 { 4682 const U32 n = add_data(pRExC_state, 1, "f"); 4683 4684 Newx(RExC_rxi->data->data[n], 1, 4685 struct regnode_charclass_class); 4686 StructCopy(data.start_class, 4687 (struct regnode_charclass_class*)RExC_rxi->data->data[n], 4688 struct regnode_charclass_class); 4689 ri->regstclass = (regnode*)RExC_rxi->data->data[n]; 4690 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */ 4691 DEBUG_COMPILE_r({ SV *sv = sv_newmortal(); 4692 regprop(r, sv, (regnode*)data.start_class); 4693 PerlIO_printf(Perl_debug_log, 4694 "synthetic stclass \"%s\".\n", 4695 SvPVX_const(sv));}); 4696 } 4697 4698 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */ 4699 if (longest_fixed_length > longest_float_length) { 4700 r->check_end_shift = r->anchored_end_shift; 4701 r->check_substr = r->anchored_substr; 4702 r->check_utf8 = r->anchored_utf8; 4703 r->check_offset_min = r->check_offset_max = r->anchored_offset; 4704 if (r->extflags & RXf_ANCH_SINGLE) 4705 r->extflags |= RXf_NOSCAN; 4706 } 4707 else { 4708 r->check_end_shift = r->float_end_shift; 4709 r->check_substr = r->float_substr; 4710 r->check_utf8 = r->float_utf8; 4711 r->check_offset_min = r->float_min_offset; 4712 r->check_offset_max = r->float_max_offset; 4713 } 4714 /* XXXX Currently intuiting is not compatible with ANCH_GPOS. 4715 This should be changed ASAP! */ 4716 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) { 4717 r->extflags |= RXf_USE_INTUIT; 4718 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8)) 4719 r->extflags |= RXf_INTUIT_TAIL; 4720 } 4721 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere) 4722 if ( (STRLEN)minlen < longest_float_length ) 4723 minlen= longest_float_length; 4724 if ( (STRLEN)minlen < longest_fixed_length ) 4725 minlen= longest_fixed_length; 4726 */ 4727 } 4728 else { 4729 /* Several toplevels. Best we can is to set minlen. */ 4730 I32 fake; 4731 struct regnode_charclass_class ch_class; 4732 I32 last_close = 0; 4733 4734 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n")); 4735 4736 scan = ri->program + 1; 4737 cl_init(pRExC_state, &ch_class); 4738 data.start_class = &ch_class; 4739 data.last_closep = &last_close; 4740 4741 4742 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size, 4743 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0); 4744 4745 CHECK_RESTUDY_GOTO; 4746 4747 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8 4748 = r->float_substr = r->float_utf8 = NULL; 4749 if (!(data.start_class->flags & ANYOF_EOS) 4750 && !cl_is_anything(data.start_class)) 4751 { 4752 const U32 n = add_data(pRExC_state, 1, "f"); 4753 4754 Newx(RExC_rxi->data->data[n], 1, 4755 struct regnode_charclass_class); 4756 StructCopy(data.start_class, 4757 (struct regnode_charclass_class*)RExC_rxi->data->data[n], 4758 struct regnode_charclass_class); 4759 ri->regstclass = (regnode*)RExC_rxi->data->data[n]; 4760 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */ 4761 DEBUG_COMPILE_r({ SV* sv = sv_newmortal(); 4762 regprop(r, sv, (regnode*)data.start_class); 4763 PerlIO_printf(Perl_debug_log, 4764 "synthetic stclass \"%s\".\n", 4765 SvPVX_const(sv));}); 4766 } 4767 } 4768 4769 /* Guard against an embedded (?=) or (?<=) with a longer minlen than 4770 the "real" pattern. */ 4771 DEBUG_OPTIMISE_r({ 4772 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n", 4773 (IV)minlen, (IV)r->minlen); 4774 }); 4775 r->minlenret = minlen; 4776 if (r->minlen < minlen) 4777 r->minlen = minlen; 4778 4779 if (RExC_seen & REG_SEEN_GPOS) 4780 r->extflags |= RXf_GPOS_SEEN; 4781 if (RExC_seen & REG_SEEN_LOOKBEHIND) 4782 r->extflags |= RXf_LOOKBEHIND_SEEN; 4783 if (RExC_seen & REG_SEEN_EVAL) 4784 r->extflags |= RXf_EVAL_SEEN; 4785 if (RExC_seen & REG_SEEN_CANY) 4786 r->extflags |= RXf_CANY_SEEN; 4787 if (RExC_seen & REG_SEEN_VERBARG) 4788 r->intflags |= PREGf_VERBARG_SEEN; 4789 if (RExC_seen & REG_SEEN_CUTGROUP) 4790 r->intflags |= PREGf_CUTGROUP_SEEN; 4791 if (RExC_paren_names) 4792 r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names); 4793 else 4794 r->paren_names = NULL; 4795 4796 #ifdef STUPID_PATTERN_CHECKS 4797 if (r->prelen == 0) 4798 r->extflags |= RXf_NULL; 4799 if (r->extflags & RXf_SPLIT && r->prelen == 1 && r->precomp[0] == ' ') 4800 /* XXX: this should happen BEFORE we compile */ 4801 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); 4802 else if (r->prelen == 3 && memEQ("\\s+", r->precomp, 3)) 4803 r->extflags |= RXf_WHITE; 4804 else if (r->prelen == 1 && r->precomp[0] == '^') 4805 r->extflags |= RXf_START_ONLY; 4806 #else 4807 if (r->extflags & RXf_SPLIT && r->prelen == 1 && r->precomp[0] == ' ') 4808 /* XXX: this should happen BEFORE we compile */ 4809 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); 4810 else { 4811 regnode *first = ri->program + 1; 4812 U8 fop = OP(first); 4813 U8 nop = OP(NEXTOPER(first)); 4814 4815 if (PL_regkind[fop] == NOTHING && nop == END) 4816 r->extflags |= RXf_NULL; 4817 else if (PL_regkind[fop] == BOL && nop == END) 4818 r->extflags |= RXf_START_ONLY; 4819 else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END) 4820 r->extflags |= RXf_WHITE; 4821 } 4822 #endif 4823 #ifdef DEBUGGING 4824 if (RExC_paren_names) { 4825 ri->name_list_idx = add_data( pRExC_state, 1, "p" ); 4826 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list); 4827 } else 4828 #endif 4829 ri->name_list_idx = 0; 4830 4831 if (RExC_recurse_count) { 4832 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) { 4833 const regnode *scan = RExC_recurse[RExC_recurse_count-1]; 4834 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan ); 4835 } 4836 } 4837 Newxz(r->offs, RExC_npar, regexp_paren_pair); 4838 /* assume we don't need to swap parens around before we match */ 4839 4840 DEBUG_DUMP_r({ 4841 PerlIO_printf(Perl_debug_log,"Final program:\n"); 4842 regdump(r); 4843 }); 4844 #ifdef RE_TRACK_PATTERN_OFFSETS 4845 DEBUG_OFFSETS_r(if (ri->u.offsets) { 4846 const U32 len = ri->u.offsets[0]; 4847 U32 i; 4848 GET_RE_DEBUG_FLAGS_DECL; 4849 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]); 4850 for (i = 1; i <= len; i++) { 4851 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2]) 4852 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ", 4853 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]); 4854 } 4855 PerlIO_printf(Perl_debug_log, "\n"); 4856 }); 4857 #endif 4858 return(r); 4859 } 4860 4861 #undef RE_ENGINE_PTR 4862 4863 4864 SV* 4865 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value, 4866 const U32 flags) 4867 { 4868 PERL_UNUSED_ARG(value); 4869 4870 if (flags & RXapif_FETCH) { 4871 return reg_named_buff_fetch(rx, key, flags); 4872 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) { 4873 Perl_croak(aTHX_ PL_no_modify); 4874 return NULL; 4875 } else if (flags & RXapif_EXISTS) { 4876 return reg_named_buff_exists(rx, key, flags) 4877 ? &PL_sv_yes 4878 : &PL_sv_no; 4879 } else if (flags & RXapif_REGNAMES) { 4880 return reg_named_buff_all(rx, flags); 4881 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) { 4882 return reg_named_buff_scalar(rx, flags); 4883 } else { 4884 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags); 4885 return NULL; 4886 } 4887 } 4888 4889 SV* 4890 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey, 4891 const U32 flags) 4892 { 4893 PERL_UNUSED_ARG(lastkey); 4894 4895 if (flags & RXapif_FIRSTKEY) 4896 return reg_named_buff_firstkey(rx, flags); 4897 else if (flags & RXapif_NEXTKEY) 4898 return reg_named_buff_nextkey(rx, flags); 4899 else { 4900 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags); 4901 return NULL; 4902 } 4903 } 4904 4905 SV* 4906 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const rx, SV * const namesv, const U32 flags) 4907 { 4908 AV *retarray = NULL; 4909 SV *ret; 4910 if (flags & RXapif_ALL) 4911 retarray=newAV(); 4912 4913 if (rx && rx->paren_names) { 4914 HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 ); 4915 if (he_str) { 4916 IV i; 4917 SV* sv_dat=HeVAL(he_str); 4918 I32 *nums=(I32*)SvPVX(sv_dat); 4919 for ( i=0; i<SvIVX(sv_dat); i++ ) { 4920 if ((I32)(rx->nparens) >= nums[i] 4921 && rx->offs[nums[i]].start != -1 4922 && rx->offs[nums[i]].end != -1) 4923 { 4924 ret = newSVpvs(""); 4925 CALLREG_NUMBUF_FETCH(rx,nums[i],ret); 4926 if (!retarray) 4927 return ret; 4928 } else { 4929 ret = newSVsv(&PL_sv_undef); 4930 } 4931 if (retarray) { 4932 SvREFCNT_inc_simple_void(ret); 4933 av_push(retarray, ret); 4934 } 4935 } 4936 if (retarray) 4937 return newRV((SV*)retarray); 4938 } 4939 } 4940 return NULL; 4941 } 4942 4943 bool 4944 Perl_reg_named_buff_exists(pTHX_ REGEXP * const rx, SV * const key, 4945 const U32 flags) 4946 { 4947 if (rx && rx->paren_names) { 4948 if (flags & RXapif_ALL) { 4949 return hv_exists_ent(rx->paren_names, key, 0); 4950 } else { 4951 SV *sv = CALLREG_NAMED_BUFF_FETCH(rx, key, flags); 4952 if (sv) { 4953 SvREFCNT_dec(sv); 4954 return TRUE; 4955 } else { 4956 return FALSE; 4957 } 4958 } 4959 } else { 4960 return FALSE; 4961 } 4962 } 4963 4964 SV* 4965 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const rx, const U32 flags) 4966 { 4967 if ( rx && rx->paren_names ) { 4968 (void)hv_iterinit(rx->paren_names); 4969 4970 return CALLREG_NAMED_BUFF_NEXTKEY(rx, NULL, flags & ~RXapif_FIRSTKEY); 4971 } else { 4972 return FALSE; 4973 } 4974 } 4975 4976 SV* 4977 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const rx, const U32 flags) 4978 { 4979 if (rx && rx->paren_names) { 4980 HV *hv = rx->paren_names; 4981 HE *temphe; 4982 while ( (temphe = hv_iternext_flags(hv,0)) ) { 4983 IV i; 4984 IV parno = 0; 4985 SV* sv_dat = HeVAL(temphe); 4986 I32 *nums = (I32*)SvPVX(sv_dat); 4987 for ( i = 0; i < SvIVX(sv_dat); i++ ) { 4988 if ((I32)(rx->lastcloseparen) >= nums[i] && 4989 rx->offs[nums[i]].start != -1 && 4990 rx->offs[nums[i]].end != -1) 4991 { 4992 parno = nums[i]; 4993 break; 4994 } 4995 } 4996 if (parno || flags & RXapif_ALL) { 4997 STRLEN len; 4998 char *pv = HePV(temphe, len); 4999 return newSVpvn(pv,len); 5000 } 5001 } 5002 } 5003 return NULL; 5004 } 5005 5006 SV* 5007 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const rx, const U32 flags) 5008 { 5009 SV *ret; 5010 AV *av; 5011 I32 length; 5012 5013 if (rx && rx->paren_names) { 5014 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) { 5015 return newSViv(HvTOTALKEYS(rx->paren_names)); 5016 } else if (flags & RXapif_ONE) { 5017 ret = CALLREG_NAMED_BUFF_ALL(rx, (flags | RXapif_REGNAMES)); 5018 av = (AV*)SvRV(ret); 5019 length = av_len(av); 5020 return newSViv(length + 1); 5021 } else { 5022 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags); 5023 return NULL; 5024 } 5025 } 5026 return &PL_sv_undef; 5027 } 5028 5029 SV* 5030 Perl_reg_named_buff_all(pTHX_ REGEXP * const rx, const U32 flags) 5031 { 5032 AV *av = newAV(); 5033 5034 if (rx && rx->paren_names) { 5035 HV *hv= rx->paren_names; 5036 HE *temphe; 5037 (void)hv_iterinit(hv); 5038 while ( (temphe = hv_iternext_flags(hv,0)) ) { 5039 IV i; 5040 IV parno = 0; 5041 SV* sv_dat = HeVAL(temphe); 5042 I32 *nums = (I32*)SvPVX(sv_dat); 5043 for ( i = 0; i < SvIVX(sv_dat); i++ ) { 5044 if ((I32)(rx->lastcloseparen) >= nums[i] && 5045 rx->offs[nums[i]].start != -1 && 5046 rx->offs[nums[i]].end != -1) 5047 { 5048 parno = nums[i]; 5049 break; 5050 } 5051 } 5052 if (parno || flags & RXapif_ALL) { 5053 STRLEN len; 5054 char *pv = HePV(temphe, len); 5055 av_push(av, newSVpvn(pv,len)); 5056 } 5057 } 5058 } 5059 5060 return newRV((SV*)av); 5061 } 5062 5063 void 5064 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const rx, const I32 paren, SV * const sv) 5065 { 5066 char *s = NULL; 5067 I32 i = 0; 5068 I32 s1, t1; 5069 5070 if (!rx->subbeg) { 5071 sv_setsv(sv,&PL_sv_undef); 5072 return; 5073 } 5074 else 5075 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) { 5076 /* $` */ 5077 i = rx->offs[0].start; 5078 s = rx->subbeg; 5079 } 5080 else 5081 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) { 5082 /* $' */ 5083 s = rx->subbeg + rx->offs[0].end; 5084 i = rx->sublen - rx->offs[0].end; 5085 } 5086 else 5087 if ( 0 <= paren && paren <= (I32)rx->nparens && 5088 (s1 = rx->offs[paren].start) != -1 && 5089 (t1 = rx->offs[paren].end) != -1) 5090 { 5091 /* $& $1 ... */ 5092 i = t1 - s1; 5093 s = rx->subbeg + s1; 5094 } else { 5095 sv_setsv(sv,&PL_sv_undef); 5096 return; 5097 } 5098 assert(rx->sublen >= (s - rx->subbeg) + i ); 5099 if (i >= 0) { 5100 const int oldtainted = PL_tainted; 5101 TAINT_NOT; 5102 sv_setpvn(sv, s, i); 5103 PL_tainted = oldtainted; 5104 if ( (rx->extflags & RXf_CANY_SEEN) 5105 ? (RX_MATCH_UTF8(rx) 5106 && (!i || is_utf8_string((U8*)s, i))) 5107 : (RX_MATCH_UTF8(rx)) ) 5108 { 5109 SvUTF8_on(sv); 5110 } 5111 else 5112 SvUTF8_off(sv); 5113 if (PL_tainting) { 5114 if (RX_MATCH_TAINTED(rx)) { 5115 if (SvTYPE(sv) >= SVt_PVMG) { 5116 MAGIC* const mg = SvMAGIC(sv); 5117 MAGIC* mgt; 5118 PL_tainted = 1; 5119 SvMAGIC_set(sv, mg->mg_moremagic); 5120 SvTAINT(sv); 5121 if ((mgt = SvMAGIC(sv))) { 5122 mg->mg_moremagic = mgt; 5123 SvMAGIC_set(sv, mg); 5124 } 5125 } else { 5126 PL_tainted = 1; 5127 SvTAINT(sv); 5128 } 5129 } else 5130 SvTAINTED_off(sv); 5131 } 5132 } else { 5133 sv_setsv(sv,&PL_sv_undef); 5134 return; 5135 } 5136 } 5137 5138 void 5139 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren, 5140 SV const * const value) 5141 { 5142 PERL_UNUSED_ARG(rx); 5143 PERL_UNUSED_ARG(paren); 5144 PERL_UNUSED_ARG(value); 5145 5146 if (!PL_localizing) 5147 Perl_croak(aTHX_ PL_no_modify); 5148 } 5149 5150 I32 5151 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const rx, const SV * const sv, 5152 const I32 paren) 5153 { 5154 I32 i; 5155 I32 s1, t1; 5156 5157 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */ 5158 switch (paren) { 5159 /* $` / ${^PREMATCH} */ 5160 case RX_BUFF_IDX_PREMATCH: 5161 if (rx->offs[0].start != -1) { 5162 i = rx->offs[0].start; 5163 if (i > 0) { 5164 s1 = 0; 5165 t1 = i; 5166 goto getlen; 5167 } 5168 } 5169 return 0; 5170 /* $' / ${^POSTMATCH} */ 5171 case RX_BUFF_IDX_POSTMATCH: 5172 if (rx->offs[0].end != -1) { 5173 i = rx->sublen - rx->offs[0].end; 5174 if (i > 0) { 5175 s1 = rx->offs[0].end; 5176 t1 = rx->sublen; 5177 goto getlen; 5178 } 5179 } 5180 return 0; 5181 /* $& / ${^MATCH}, $1, $2, ... */ 5182 default: 5183 if (paren <= (I32)rx->nparens && 5184 (s1 = rx->offs[paren].start) != -1 && 5185 (t1 = rx->offs[paren].end) != -1) 5186 { 5187 i = t1 - s1; 5188 goto getlen; 5189 } else { 5190 if (ckWARN(WARN_UNINITIALIZED)) 5191 report_uninit((SV*)sv); 5192 return 0; 5193 } 5194 } 5195 getlen: 5196 if (i > 0 && RX_MATCH_UTF8(rx)) { 5197 const char * const s = rx->subbeg + s1; 5198 const U8 *ep; 5199 STRLEN el; 5200 5201 i = t1 - s1; 5202 if (is_utf8_string_loclen((U8*)s, i, &ep, &el)) 5203 i = el; 5204 } 5205 return i; 5206 } 5207 5208 SV* 5209 Perl_reg_qr_package(pTHX_ REGEXP * const rx) 5210 { 5211 PERL_UNUSED_ARG(rx); 5212 return newSVpvs("Regexp"); 5213 } 5214 5215 /* Scans the name of a named buffer from the pattern. 5216 * If flags is REG_RSN_RETURN_NULL returns null. 5217 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name 5218 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding 5219 * to the parsed name as looked up in the RExC_paren_names hash. 5220 * If there is an error throws a vFAIL().. type exception. 5221 */ 5222 5223 #define REG_RSN_RETURN_NULL 0 5224 #define REG_RSN_RETURN_NAME 1 5225 #define REG_RSN_RETURN_DATA 2 5226 5227 STATIC SV* 5228 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) { 5229 char *name_start = RExC_parse; 5230 5231 if (isIDFIRST_lazy_if(RExC_parse, UTF)) { 5232 /* skip IDFIRST by using do...while */ 5233 if (UTF) 5234 do { 5235 RExC_parse += UTF8SKIP(RExC_parse); 5236 } while (isALNUM_utf8((U8*)RExC_parse)); 5237 else 5238 do { 5239 RExC_parse++; 5240 } while (isALNUM(*RExC_parse)); 5241 } 5242 5243 if ( flags ) { 5244 SV* sv_name = sv_2mortal(Perl_newSVpvn(aTHX_ name_start, 5245 (int)(RExC_parse - name_start))); 5246 if (UTF) 5247 SvUTF8_on(sv_name); 5248 if ( flags == REG_RSN_RETURN_NAME) 5249 return sv_name; 5250 else if (flags==REG_RSN_RETURN_DATA) { 5251 HE *he_str = NULL; 5252 SV *sv_dat = NULL; 5253 if ( ! sv_name ) /* should not happen*/ 5254 Perl_croak(aTHX_ "panic: no svname in reg_scan_name"); 5255 if (RExC_paren_names) 5256 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 ); 5257 if ( he_str ) 5258 sv_dat = HeVAL(he_str); 5259 if ( ! sv_dat ) 5260 vFAIL("Reference to nonexistent named group"); 5261 return sv_dat; 5262 } 5263 else { 5264 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name"); 5265 } 5266 /* NOT REACHED */ 5267 } 5268 return NULL; 5269 } 5270 5271 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \ 5272 int rem=(int)(RExC_end - RExC_parse); \ 5273 int cut; \ 5274 int num; \ 5275 int iscut=0; \ 5276 if (rem>10) { \ 5277 rem=10; \ 5278 iscut=1; \ 5279 } \ 5280 cut=10-rem; \ 5281 if (RExC_lastparse!=RExC_parse) \ 5282 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \ 5283 rem, RExC_parse, \ 5284 cut + 4, \ 5285 iscut ? "..." : "<" \ 5286 ); \ 5287 else \ 5288 PerlIO_printf(Perl_debug_log,"%16s",""); \ 5289 \ 5290 if (SIZE_ONLY) \ 5291 num = RExC_size + 1; \ 5292 else \ 5293 num=REG_NODE_NUM(RExC_emit); \ 5294 if (RExC_lastnum!=num) \ 5295 PerlIO_printf(Perl_debug_log,"|%4d",num); \ 5296 else \ 5297 PerlIO_printf(Perl_debug_log,"|%4s",""); \ 5298 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \ 5299 (int)((depth*2)), "", \ 5300 (funcname) \ 5301 ); \ 5302 RExC_lastnum=num; \ 5303 RExC_lastparse=RExC_parse; \ 5304 }) 5305 5306 5307 5308 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \ 5309 DEBUG_PARSE_MSG((funcname)); \ 5310 PerlIO_printf(Perl_debug_log,"%4s","\n"); \ 5311 }) 5312 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \ 5313 DEBUG_PARSE_MSG((funcname)); \ 5314 PerlIO_printf(Perl_debug_log,fmt "\n",args); \ 5315 }) 5316 /* 5317 - reg - regular expression, i.e. main body or parenthesized thing 5318 * 5319 * Caller must absorb opening parenthesis. 5320 * 5321 * Combining parenthesis handling with the base level of regular expression 5322 * is a trifle forced, but the need to tie the tails of the branches to what 5323 * follows makes it hard to avoid. 5324 */ 5325 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1) 5326 #ifdef DEBUGGING 5327 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1) 5328 #else 5329 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1) 5330 #endif 5331 5332 STATIC regnode * 5333 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) 5334 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */ 5335 { 5336 dVAR; 5337 register regnode *ret; /* Will be the head of the group. */ 5338 register regnode *br; 5339 register regnode *lastbr; 5340 register regnode *ender = NULL; 5341 register I32 parno = 0; 5342 I32 flags; 5343 U32 oregflags = RExC_flags; 5344 bool have_branch = 0; 5345 bool is_open = 0; 5346 I32 freeze_paren = 0; 5347 I32 after_freeze = 0; 5348 5349 /* for (?g), (?gc), and (?o) warnings; warning 5350 about (?c) will warn about (?g) -- japhy */ 5351 5352 #define WASTED_O 0x01 5353 #define WASTED_G 0x02 5354 #define WASTED_C 0x04 5355 #define WASTED_GC (0x02|0x04) 5356 I32 wastedflags = 0x00; 5357 5358 char * parse_start = RExC_parse; /* MJD */ 5359 char * const oregcomp_parse = RExC_parse; 5360 5361 GET_RE_DEBUG_FLAGS_DECL; 5362 DEBUG_PARSE("reg "); 5363 5364 *flagp = 0; /* Tentatively. */ 5365 5366 5367 /* Make an OPEN node, if parenthesized. */ 5368 if (paren) { 5369 if ( *RExC_parse == '*') { /* (*VERB:ARG) */ 5370 char *start_verb = RExC_parse; 5371 STRLEN verb_len = 0; 5372 char *start_arg = NULL; 5373 unsigned char op = 0; 5374 int argok = 1; 5375 int internal_argval = 0; /* internal_argval is only useful if !argok */ 5376 while ( *RExC_parse && *RExC_parse != ')' ) { 5377 if ( *RExC_parse == ':' ) { 5378 start_arg = RExC_parse + 1; 5379 break; 5380 } 5381 RExC_parse++; 5382 } 5383 ++start_verb; 5384 verb_len = RExC_parse - start_verb; 5385 if ( start_arg ) { 5386 RExC_parse++; 5387 while ( *RExC_parse && *RExC_parse != ')' ) 5388 RExC_parse++; 5389 if ( *RExC_parse != ')' ) 5390 vFAIL("Unterminated verb pattern argument"); 5391 if ( RExC_parse == start_arg ) 5392 start_arg = NULL; 5393 } else { 5394 if ( *RExC_parse != ')' ) 5395 vFAIL("Unterminated verb pattern"); 5396 } 5397 5398 switch ( *start_verb ) { 5399 case 'A': /* (*ACCEPT) */ 5400 if ( memEQs(start_verb,verb_len,"ACCEPT") ) { 5401 op = ACCEPT; 5402 internal_argval = RExC_nestroot; 5403 } 5404 break; 5405 case 'C': /* (*COMMIT) */ 5406 if ( memEQs(start_verb,verb_len,"COMMIT") ) 5407 op = COMMIT; 5408 break; 5409 case 'F': /* (*FAIL) */ 5410 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) { 5411 op = OPFAIL; 5412 argok = 0; 5413 } 5414 break; 5415 case ':': /* (*:NAME) */ 5416 case 'M': /* (*MARK:NAME) */ 5417 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) { 5418 op = MARKPOINT; 5419 argok = -1; 5420 } 5421 break; 5422 case 'P': /* (*PRUNE) */ 5423 if ( memEQs(start_verb,verb_len,"PRUNE") ) 5424 op = PRUNE; 5425 break; 5426 case 'S': /* (*SKIP) */ 5427 if ( memEQs(start_verb,verb_len,"SKIP") ) 5428 op = SKIP; 5429 break; 5430 case 'T': /* (*THEN) */ 5431 /* [19:06] <TimToady> :: is then */ 5432 if ( memEQs(start_verb,verb_len,"THEN") ) { 5433 op = CUTGROUP; 5434 RExC_seen |= REG_SEEN_CUTGROUP; 5435 } 5436 break; 5437 } 5438 if ( ! op ) { 5439 RExC_parse++; 5440 vFAIL3("Unknown verb pattern '%.*s'", 5441 verb_len, start_verb); 5442 } 5443 if ( argok ) { 5444 if ( start_arg && internal_argval ) { 5445 vFAIL3("Verb pattern '%.*s' may not have an argument", 5446 verb_len, start_verb); 5447 } else if ( argok < 0 && !start_arg ) { 5448 vFAIL3("Verb pattern '%.*s' has a mandatory argument", 5449 verb_len, start_verb); 5450 } else { 5451 ret = reganode(pRExC_state, op, internal_argval); 5452 if ( ! internal_argval && ! SIZE_ONLY ) { 5453 if (start_arg) { 5454 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg); 5455 ARG(ret) = add_data( pRExC_state, 1, "S" ); 5456 RExC_rxi->data->data[ARG(ret)]=(void*)sv; 5457 ret->flags = 0; 5458 } else { 5459 ret->flags = 1; 5460 } 5461 } 5462 } 5463 if (!internal_argval) 5464 RExC_seen |= REG_SEEN_VERBARG; 5465 } else if ( start_arg ) { 5466 vFAIL3("Verb pattern '%.*s' may not have an argument", 5467 verb_len, start_verb); 5468 } else { 5469 ret = reg_node(pRExC_state, op); 5470 } 5471 nextchar(pRExC_state); 5472 return ret; 5473 } else 5474 if (*RExC_parse == '?') { /* (?...) */ 5475 bool is_logical = 0; 5476 const char * const seqstart = RExC_parse; 5477 5478 RExC_parse++; 5479 paren = *RExC_parse++; 5480 ret = NULL; /* For look-ahead/behind. */ 5481 switch (paren) { 5482 5483 case 'P': /* (?P...) variants for those used to PCRE/Python */ 5484 paren = *RExC_parse++; 5485 if ( paren == '<') /* (?P<...>) named capture */ 5486 goto named_capture; 5487 else if (paren == '>') { /* (?P>name) named recursion */ 5488 goto named_recursion; 5489 } 5490 else if (paren == '=') { /* (?P=...) named backref */ 5491 /* this pretty much dupes the code for \k<NAME> in regatom(), if 5492 you change this make sure you change that */ 5493 char* name_start = RExC_parse; 5494 U32 num = 0; 5495 SV *sv_dat = reg_scan_name(pRExC_state, 5496 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); 5497 if (RExC_parse == name_start || *RExC_parse != ')') 5498 vFAIL2("Sequence %.3s... not terminated",parse_start); 5499 5500 if (!SIZE_ONLY) { 5501 num = add_data( pRExC_state, 1, "S" ); 5502 RExC_rxi->data->data[num]=(void*)sv_dat; 5503 SvREFCNT_inc_simple_void(sv_dat); 5504 } 5505 RExC_sawback = 1; 5506 ret = reganode(pRExC_state, 5507 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF), 5508 num); 5509 *flagp |= HASWIDTH; 5510 5511 Set_Node_Offset(ret, parse_start+1); 5512 Set_Node_Cur_Length(ret); /* MJD */ 5513 5514 nextchar(pRExC_state); 5515 return ret; 5516 } 5517 RExC_parse++; 5518 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); 5519 /*NOTREACHED*/ 5520 case '<': /* (?<...) */ 5521 if (*RExC_parse == '!') 5522 paren = ','; 5523 else if (*RExC_parse != '=') 5524 named_capture: 5525 { /* (?<...>) */ 5526 char *name_start; 5527 SV *svname; 5528 paren= '>'; 5529 case '\'': /* (?'...') */ 5530 name_start= RExC_parse; 5531 svname = reg_scan_name(pRExC_state, 5532 SIZE_ONLY ? /* reverse test from the others */ 5533 REG_RSN_RETURN_NAME : 5534 REG_RSN_RETURN_NULL); 5535 if (RExC_parse == name_start) { 5536 RExC_parse++; 5537 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); 5538 /*NOTREACHED*/ 5539 } 5540 if (*RExC_parse != paren) 5541 vFAIL2("Sequence (?%c... not terminated", 5542 paren=='>' ? '<' : paren); 5543 if (SIZE_ONLY) { 5544 HE *he_str; 5545 SV *sv_dat = NULL; 5546 if (!svname) /* shouldnt happen */ 5547 Perl_croak(aTHX_ 5548 "panic: reg_scan_name returned NULL"); 5549 if (!RExC_paren_names) { 5550 RExC_paren_names= newHV(); 5551 sv_2mortal((SV*)RExC_paren_names); 5552 #ifdef DEBUGGING 5553 RExC_paren_name_list= newAV(); 5554 sv_2mortal((SV*)RExC_paren_name_list); 5555 #endif 5556 } 5557 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 ); 5558 if ( he_str ) 5559 sv_dat = HeVAL(he_str); 5560 if ( ! sv_dat ) { 5561 /* croak baby croak */ 5562 Perl_croak(aTHX_ 5563 "panic: paren_name hash element allocation failed"); 5564 } else if ( SvPOK(sv_dat) ) { 5565 /* (?|...) can mean we have dupes so scan to check 5566 its already been stored. Maybe a flag indicating 5567 we are inside such a construct would be useful, 5568 but the arrays are likely to be quite small, so 5569 for now we punt -- dmq */ 5570 IV count = SvIV(sv_dat); 5571 I32 *pv = (I32*)SvPVX(sv_dat); 5572 IV i; 5573 for ( i = 0 ; i < count ; i++ ) { 5574 if ( pv[i] == RExC_npar ) { 5575 count = 0; 5576 break; 5577 } 5578 } 5579 if ( count ) { 5580 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1); 5581 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32)); 5582 pv[count] = RExC_npar; 5583 SvIVX(sv_dat)++; 5584 } 5585 } else { 5586 (void)SvUPGRADE(sv_dat,SVt_PVNV); 5587 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32)); 5588 SvIOK_on(sv_dat); 5589 SvIVX(sv_dat)= 1; 5590 } 5591 #ifdef DEBUGGING 5592 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname))) 5593 SvREFCNT_dec(svname); 5594 #endif 5595 5596 /*sv_dump(sv_dat);*/ 5597 } 5598 nextchar(pRExC_state); 5599 paren = 1; 5600 goto capturing_parens; 5601 } 5602 RExC_seen |= REG_SEEN_LOOKBEHIND; 5603 RExC_parse++; 5604 case '=': /* (?=...) */ 5605 case '!': /* (?!...) */ 5606 RExC_seen_zerolen++; 5607 if (*RExC_parse == ')') { 5608 ret=reg_node(pRExC_state, OPFAIL); 5609 nextchar(pRExC_state); 5610 return ret; 5611 } 5612 break; 5613 case '|': /* (?|...) */ 5614 /* branch reset, behave like a (?:...) except that 5615 buffers in alternations share the same numbers */ 5616 paren = ':'; 5617 after_freeze = freeze_paren = RExC_npar; 5618 break; 5619 case ':': /* (?:...) */ 5620 case '>': /* (?>...) */ 5621 break; 5622 case '$': /* (?$...) */ 5623 case '@': /* (?@...) */ 5624 vFAIL2("Sequence (?%c...) not implemented", (int)paren); 5625 break; 5626 case '#': /* (?#...) */ 5627 while (*RExC_parse && *RExC_parse != ')') 5628 RExC_parse++; 5629 if (*RExC_parse != ')') 5630 FAIL("Sequence (?#... not terminated"); 5631 nextchar(pRExC_state); 5632 *flagp = TRYAGAIN; 5633 return NULL; 5634 case '0' : /* (?0) */ 5635 case 'R' : /* (?R) */ 5636 if (*RExC_parse != ')') 5637 FAIL("Sequence (?R) not terminated"); 5638 ret = reg_node(pRExC_state, GOSTART); 5639 *flagp |= POSTPONED; 5640 nextchar(pRExC_state); 5641 return ret; 5642 /*notreached*/ 5643 { /* named and numeric backreferences */ 5644 I32 num; 5645 case '&': /* (?&NAME) */ 5646 parse_start = RExC_parse - 1; 5647 named_recursion: 5648 { 5649 SV *sv_dat = reg_scan_name(pRExC_state, 5650 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); 5651 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0; 5652 } 5653 goto gen_recurse_regop; 5654 /* NOT REACHED */ 5655 case '+': 5656 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) { 5657 RExC_parse++; 5658 vFAIL("Illegal pattern"); 5659 } 5660 goto parse_recursion; 5661 /* NOT REACHED*/ 5662 case '-': /* (?-1) */ 5663 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) { 5664 RExC_parse--; /* rewind to let it be handled later */ 5665 goto parse_flags; 5666 } 5667 /*FALLTHROUGH */ 5668 case '1': case '2': case '3': case '4': /* (?1) */ 5669 case '5': case '6': case '7': case '8': case '9': 5670 RExC_parse--; 5671 parse_recursion: 5672 num = atoi(RExC_parse); 5673 parse_start = RExC_parse - 1; /* MJD */ 5674 if (*RExC_parse == '-') 5675 RExC_parse++; 5676 while (isDIGIT(*RExC_parse)) 5677 RExC_parse++; 5678 if (*RExC_parse!=')') 5679 vFAIL("Expecting close bracket"); 5680 5681 gen_recurse_regop: 5682 if ( paren == '-' ) { 5683 /* 5684 Diagram of capture buffer numbering. 5685 Top line is the normal capture buffer numbers 5686 Botton line is the negative indexing as from 5687 the X (the (?-2)) 5688 5689 + 1 2 3 4 5 X 6 7 5690 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/ 5691 - 5 4 3 2 1 X x x 5692 5693 */ 5694 num = RExC_npar + num; 5695 if (num < 1) { 5696 RExC_parse++; 5697 vFAIL("Reference to nonexistent group"); 5698 } 5699 } else if ( paren == '+' ) { 5700 num = RExC_npar + num - 1; 5701 } 5702 5703 ret = reganode(pRExC_state, GOSUB, num); 5704 if (!SIZE_ONLY) { 5705 if (num > (I32)RExC_rx->nparens) { 5706 RExC_parse++; 5707 vFAIL("Reference to nonexistent group"); 5708 } 5709 ARG2L_SET( ret, RExC_recurse_count++); 5710 RExC_emit++; 5711 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, 5712 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret))); 5713 } else { 5714 RExC_size++; 5715 } 5716 RExC_seen |= REG_SEEN_RECURSE; 5717 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */ 5718 Set_Node_Offset(ret, parse_start); /* MJD */ 5719 5720 *flagp |= POSTPONED; 5721 nextchar(pRExC_state); 5722 return ret; 5723 } /* named and numeric backreferences */ 5724 /* NOT REACHED */ 5725 5726 case '?': /* (??...) */ 5727 is_logical = 1; 5728 if (*RExC_parse != '{') { 5729 RExC_parse++; 5730 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); 5731 /*NOTREACHED*/ 5732 } 5733 *flagp |= POSTPONED; 5734 paren = *RExC_parse++; 5735 /* FALL THROUGH */ 5736 case '{': /* (?{...}) */ 5737 { 5738 I32 count = 1; 5739 U32 n = 0; 5740 char c; 5741 char *s = RExC_parse; 5742 5743 RExC_seen_zerolen++; 5744 RExC_seen |= REG_SEEN_EVAL; 5745 while (count && (c = *RExC_parse)) { 5746 if (c == '\\') { 5747 if (RExC_parse[1]) 5748 RExC_parse++; 5749 } 5750 else if (c == '{') 5751 count++; 5752 else if (c == '}') 5753 count--; 5754 RExC_parse++; 5755 } 5756 if (*RExC_parse != ')') { 5757 RExC_parse = s; 5758 vFAIL("Sequence (?{...}) not terminated or not {}-balanced"); 5759 } 5760 if (!SIZE_ONLY) { 5761 PAD *pad; 5762 OP_4tree *sop, *rop; 5763 SV * const sv = newSVpvn(s, RExC_parse - 1 - s); 5764 5765 ENTER; 5766 Perl_save_re_context(aTHX); 5767 rop = sv_compile_2op(sv, &sop, "re", &pad); 5768 sop->op_private |= OPpREFCOUNTED; 5769 /* re_dup will OpREFCNT_inc */ 5770 OpREFCNT_set(sop, 1); 5771 LEAVE; 5772 5773 n = add_data(pRExC_state, 3, "nop"); 5774 RExC_rxi->data->data[n] = (void*)rop; 5775 RExC_rxi->data->data[n+1] = (void*)sop; 5776 RExC_rxi->data->data[n+2] = (void*)pad; 5777 SvREFCNT_dec(sv); 5778 } 5779 else { /* First pass */ 5780 if (PL_reginterp_cnt < ++RExC_seen_evals 5781 && IN_PERL_RUNTIME) 5782 /* No compiled RE interpolated, has runtime 5783 components ===> unsafe. */ 5784 FAIL("Eval-group not allowed at runtime, use re 'eval'"); 5785 if (PL_tainting && PL_tainted) 5786 FAIL("Eval-group in insecure regular expression"); 5787 #if PERL_VERSION > 8 5788 if (IN_PERL_COMPILETIME) 5789 PL_cv_has_eval = 1; 5790 #endif 5791 } 5792 5793 nextchar(pRExC_state); 5794 if (is_logical) { 5795 ret = reg_node(pRExC_state, LOGICAL); 5796 if (!SIZE_ONLY) 5797 ret->flags = 2; 5798 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n)); 5799 /* deal with the length of this later - MJD */ 5800 return ret; 5801 } 5802 ret = reganode(pRExC_state, EVAL, n); 5803 Set_Node_Length(ret, RExC_parse - parse_start + 1); 5804 Set_Node_Offset(ret, parse_start); 5805 return ret; 5806 } 5807 case '(': /* (?(?{...})...) and (?(?=...)...) */ 5808 { 5809 int is_define= 0; 5810 if (RExC_parse[0] == '?') { /* (?(?...)) */ 5811 if (RExC_parse[1] == '=' || RExC_parse[1] == '!' 5812 || RExC_parse[1] == '<' 5813 || RExC_parse[1] == '{') { /* Lookahead or eval. */ 5814 I32 flag; 5815 5816 ret = reg_node(pRExC_state, LOGICAL); 5817 if (!SIZE_ONLY) 5818 ret->flags = 1; 5819 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1)); 5820 goto insert_if; 5821 } 5822 } 5823 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */ 5824 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */ 5825 { 5826 char ch = RExC_parse[0] == '<' ? '>' : '\''; 5827 char *name_start= RExC_parse++; 5828 U32 num = 0; 5829 SV *sv_dat=reg_scan_name(pRExC_state, 5830 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); 5831 if (RExC_parse == name_start || *RExC_parse != ch) 5832 vFAIL2("Sequence (?(%c... not terminated", 5833 (ch == '>' ? '<' : ch)); 5834 RExC_parse++; 5835 if (!SIZE_ONLY) { 5836 num = add_data( pRExC_state, 1, "S" ); 5837 RExC_rxi->data->data[num]=(void*)sv_dat; 5838 SvREFCNT_inc_simple_void(sv_dat); 5839 } 5840 ret = reganode(pRExC_state,NGROUPP,num); 5841 goto insert_if_check_paren; 5842 } 5843 else if (RExC_parse[0] == 'D' && 5844 RExC_parse[1] == 'E' && 5845 RExC_parse[2] == 'F' && 5846 RExC_parse[3] == 'I' && 5847 RExC_parse[4] == 'N' && 5848 RExC_parse[5] == 'E') 5849 { 5850 ret = reganode(pRExC_state,DEFINEP,0); 5851 RExC_parse +=6 ; 5852 is_define = 1; 5853 goto insert_if_check_paren; 5854 } 5855 else if (RExC_parse[0] == 'R') { 5856 RExC_parse++; 5857 parno = 0; 5858 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) { 5859 parno = atoi(RExC_parse++); 5860 while (isDIGIT(*RExC_parse)) 5861 RExC_parse++; 5862 } else if (RExC_parse[0] == '&') { 5863 SV *sv_dat; 5864 RExC_parse++; 5865 sv_dat = reg_scan_name(pRExC_state, 5866 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); 5867 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0; 5868 } 5869 ret = reganode(pRExC_state,INSUBP,parno); 5870 goto insert_if_check_paren; 5871 } 5872 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) { 5873 /* (?(1)...) */ 5874 char c; 5875 parno = atoi(RExC_parse++); 5876 5877 while (isDIGIT(*RExC_parse)) 5878 RExC_parse++; 5879 ret = reganode(pRExC_state, GROUPP, parno); 5880 5881 insert_if_check_paren: 5882 if ((c = *nextchar(pRExC_state)) != ')') 5883 vFAIL("Switch condition not recognized"); 5884 insert_if: 5885 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0)); 5886 br = regbranch(pRExC_state, &flags, 1,depth+1); 5887 if (br == NULL) 5888 br = reganode(pRExC_state, LONGJMP, 0); 5889 else 5890 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0)); 5891 c = *nextchar(pRExC_state); 5892 if (flags&HASWIDTH) 5893 *flagp |= HASWIDTH; 5894 if (c == '|') { 5895 if (is_define) 5896 vFAIL("(?(DEFINE)....) does not allow branches"); 5897 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */ 5898 regbranch(pRExC_state, &flags, 1,depth+1); 5899 REGTAIL(pRExC_state, ret, lastbr); 5900 if (flags&HASWIDTH) 5901 *flagp |= HASWIDTH; 5902 c = *nextchar(pRExC_state); 5903 } 5904 else 5905 lastbr = NULL; 5906 if (c != ')') 5907 vFAIL("Switch (?(condition)... contains too many branches"); 5908 ender = reg_node(pRExC_state, TAIL); 5909 REGTAIL(pRExC_state, br, ender); 5910 if (lastbr) { 5911 REGTAIL(pRExC_state, lastbr, ender); 5912 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); 5913 } 5914 else 5915 REGTAIL(pRExC_state, ret, ender); 5916 RExC_size++; /* XXX WHY do we need this?!! 5917 For large programs it seems to be required 5918 but I can't figure out why. -- dmq*/ 5919 return ret; 5920 } 5921 else { 5922 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse); 5923 } 5924 } 5925 case 0: 5926 RExC_parse--; /* for vFAIL to print correctly */ 5927 vFAIL("Sequence (? incomplete"); 5928 break; 5929 default: 5930 --RExC_parse; 5931 parse_flags: /* (?i) */ 5932 { 5933 U32 posflags = 0, negflags = 0; 5934 U32 *flagsp = &posflags; 5935 5936 while (*RExC_parse) { 5937 /* && strchr("iogcmsx", *RExC_parse) */ 5938 /* (?g), (?gc) and (?o) are useless here 5939 and must be globally applied -- japhy */ 5940 switch (*RExC_parse) { 5941 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp); 5942 case ONCE_PAT_MOD: /* 'o' */ 5943 case GLOBAL_PAT_MOD: /* 'g' */ 5944 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { 5945 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G; 5946 if (! (wastedflags & wflagbit) ) { 5947 wastedflags |= wflagbit; 5948 vWARN5( 5949 RExC_parse + 1, 5950 "Useless (%s%c) - %suse /%c modifier", 5951 flagsp == &negflags ? "?-" : "?", 5952 *RExC_parse, 5953 flagsp == &negflags ? "don't " : "", 5954 *RExC_parse 5955 ); 5956 } 5957 } 5958 break; 5959 5960 case CONTINUE_PAT_MOD: /* 'c' */ 5961 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { 5962 if (! (wastedflags & WASTED_C) ) { 5963 wastedflags |= WASTED_GC; 5964 vWARN3( 5965 RExC_parse + 1, 5966 "Useless (%sc) - %suse /gc modifier", 5967 flagsp == &negflags ? "?-" : "?", 5968 flagsp == &negflags ? "don't " : "" 5969 ); 5970 } 5971 } 5972 break; 5973 case KEEPCOPY_PAT_MOD: /* 'p' */ 5974 if (flagsp == &negflags) { 5975 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) 5976 vWARN(RExC_parse + 1,"Useless use of (?-p)"); 5977 } else { 5978 *flagsp |= RXf_PMf_KEEPCOPY; 5979 } 5980 break; 5981 case '-': 5982 if (flagsp == &negflags) { 5983 RExC_parse++; 5984 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); 5985 /*NOTREACHED*/ 5986 } 5987 flagsp = &negflags; 5988 wastedflags = 0; /* reset so (?g-c) warns twice */ 5989 break; 5990 case ':': 5991 paren = ':'; 5992 /*FALLTHROUGH*/ 5993 case ')': 5994 RExC_flags |= posflags; 5995 RExC_flags &= ~negflags; 5996 if (paren != ':') { 5997 oregflags |= posflags; 5998 oregflags &= ~negflags; 5999 } 6000 nextchar(pRExC_state); 6001 if (paren != ':') { 6002 *flagp = TRYAGAIN; 6003 return NULL; 6004 } else { 6005 ret = NULL; 6006 goto parse_rest; 6007 } 6008 /*NOTREACHED*/ 6009 default: 6010 RExC_parse++; 6011 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); 6012 /*NOTREACHED*/ 6013 } 6014 ++RExC_parse; 6015 } 6016 }} /* one for the default block, one for the switch */ 6017 } 6018 else { /* (...) */ 6019 capturing_parens: 6020 parno = RExC_npar; 6021 RExC_npar++; 6022 6023 ret = reganode(pRExC_state, OPEN, parno); 6024 if (!SIZE_ONLY ){ 6025 if (!RExC_nestroot) 6026 RExC_nestroot = parno; 6027 if (RExC_seen & REG_SEEN_RECURSE 6028 && !RExC_open_parens[parno-1]) 6029 { 6030 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, 6031 "Setting open paren #%"IVdf" to %d\n", 6032 (IV)parno, REG_NODE_NUM(ret))); 6033 RExC_open_parens[parno-1]= ret; 6034 } 6035 } 6036 Set_Node_Length(ret, 1); /* MJD */ 6037 Set_Node_Offset(ret, RExC_parse); /* MJD */ 6038 is_open = 1; 6039 } 6040 } 6041 else /* ! paren */ 6042 ret = NULL; 6043 6044 parse_rest: 6045 /* Pick up the branches, linking them together. */ 6046 parse_start = RExC_parse; /* MJD */ 6047 br = regbranch(pRExC_state, &flags, 1,depth+1); 6048 /* branch_len = (paren != 0); */ 6049 6050 if (br == NULL) 6051 return(NULL); 6052 if (*RExC_parse == '|') { 6053 if (!SIZE_ONLY && RExC_extralen) { 6054 reginsert(pRExC_state, BRANCHJ, br, depth+1); 6055 } 6056 else { /* MJD */ 6057 reginsert(pRExC_state, BRANCH, br, depth+1); 6058 Set_Node_Length(br, paren != 0); 6059 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start); 6060 } 6061 have_branch = 1; 6062 if (SIZE_ONLY) 6063 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */ 6064 } 6065 else if (paren == ':') { 6066 *flagp |= flags&SIMPLE; 6067 } 6068 if (is_open) { /* Starts with OPEN. */ 6069 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */ 6070 } 6071 else if (paren != '?') /* Not Conditional */ 6072 ret = br; 6073 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED); 6074 lastbr = br; 6075 while (*RExC_parse == '|') { 6076 if (!SIZE_ONLY && RExC_extralen) { 6077 ender = reganode(pRExC_state, LONGJMP,0); 6078 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */ 6079 } 6080 if (SIZE_ONLY) 6081 RExC_extralen += 2; /* Account for LONGJMP. */ 6082 nextchar(pRExC_state); 6083 if (freeze_paren) { 6084 if (RExC_npar > after_freeze) 6085 after_freeze = RExC_npar; 6086 RExC_npar = freeze_paren; 6087 } 6088 br = regbranch(pRExC_state, &flags, 0, depth+1); 6089 6090 if (br == NULL) 6091 return(NULL); 6092 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */ 6093 lastbr = br; 6094 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED); 6095 } 6096 6097 if (have_branch || paren != ':') { 6098 /* Make a closing node, and hook it on the end. */ 6099 switch (paren) { 6100 case ':': 6101 ender = reg_node(pRExC_state, TAIL); 6102 break; 6103 case 1: 6104 ender = reganode(pRExC_state, CLOSE, parno); 6105 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) { 6106 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, 6107 "Setting close paren #%"IVdf" to %d\n", 6108 (IV)parno, REG_NODE_NUM(ender))); 6109 RExC_close_parens[parno-1]= ender; 6110 if (RExC_nestroot == parno) 6111 RExC_nestroot = 0; 6112 } 6113 Set_Node_Offset(ender,RExC_parse+1); /* MJD */ 6114 Set_Node_Length(ender,1); /* MJD */ 6115 break; 6116 case '<': 6117 case ',': 6118 case '=': 6119 case '!': 6120 *flagp &= ~HASWIDTH; 6121 /* FALL THROUGH */ 6122 case '>': 6123 ender = reg_node(pRExC_state, SUCCEED); 6124 break; 6125 case 0: 6126 ender = reg_node(pRExC_state, END); 6127 if (!SIZE_ONLY) { 6128 assert(!RExC_opend); /* there can only be one! */ 6129 RExC_opend = ender; 6130 } 6131 break; 6132 } 6133 REGTAIL(pRExC_state, lastbr, ender); 6134 6135 if (have_branch && !SIZE_ONLY) { 6136 if (depth==1) 6137 RExC_seen |= REG_TOP_LEVEL_BRANCHES; 6138 6139 /* Hook the tails of the branches to the closing node. */ 6140 for (br = ret; br; br = regnext(br)) { 6141 const U8 op = PL_regkind[OP(br)]; 6142 if (op == BRANCH) { 6143 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender); 6144 } 6145 else if (op == BRANCHJ) { 6146 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender); 6147 } 6148 } 6149 } 6150 } 6151 6152 { 6153 const char *p; 6154 static const char parens[] = "=!<,>"; 6155 6156 if (paren && (p = strchr(parens, paren))) { 6157 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH; 6158 int flag = (p - parens) > 1; 6159 6160 if (paren == '>') 6161 node = SUSPEND, flag = 0; 6162 reginsert(pRExC_state, node,ret, depth+1); 6163 Set_Node_Cur_Length(ret); 6164 Set_Node_Offset(ret, parse_start + 1); 6165 ret->flags = flag; 6166 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL)); 6167 } 6168 } 6169 6170 /* Check for proper termination. */ 6171 if (paren) { 6172 RExC_flags = oregflags; 6173 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') { 6174 RExC_parse = oregcomp_parse; 6175 vFAIL("Unmatched ("); 6176 } 6177 } 6178 else if (!paren && RExC_parse < RExC_end) { 6179 if (*RExC_parse == ')') { 6180 RExC_parse++; 6181 vFAIL("Unmatched )"); 6182 } 6183 else 6184 FAIL("Junk on end of regexp"); /* "Can't happen". */ 6185 /* NOTREACHED */ 6186 } 6187 if (after_freeze) 6188 RExC_npar = after_freeze; 6189 return(ret); 6190 } 6191 6192 /* 6193 - regbranch - one alternative of an | operator 6194 * 6195 * Implements the concatenation operator. 6196 */ 6197 STATIC regnode * 6198 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) 6199 { 6200 dVAR; 6201 register regnode *ret; 6202 register regnode *chain = NULL; 6203 register regnode *latest; 6204 I32 flags = 0, c = 0; 6205 GET_RE_DEBUG_FLAGS_DECL; 6206 DEBUG_PARSE("brnc"); 6207 6208 if (first) 6209 ret = NULL; 6210 else { 6211 if (!SIZE_ONLY && RExC_extralen) 6212 ret = reganode(pRExC_state, BRANCHJ,0); 6213 else { 6214 ret = reg_node(pRExC_state, BRANCH); 6215 Set_Node_Length(ret, 1); 6216 } 6217 } 6218 6219 if (!first && SIZE_ONLY) 6220 RExC_extralen += 1; /* BRANCHJ */ 6221 6222 *flagp = WORST; /* Tentatively. */ 6223 6224 RExC_parse--; 6225 nextchar(pRExC_state); 6226 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') { 6227 flags &= ~TRYAGAIN; 6228 latest = regpiece(pRExC_state, &flags,depth+1); 6229 if (latest == NULL) { 6230 if (flags & TRYAGAIN) 6231 continue; 6232 return(NULL); 6233 } 6234 else if (ret == NULL) 6235 ret = latest; 6236 *flagp |= flags&(HASWIDTH|POSTPONED); 6237 if (chain == NULL) /* First piece. */ 6238 *flagp |= flags&SPSTART; 6239 else { 6240 RExC_naughty++; 6241 REGTAIL(pRExC_state, chain, latest); 6242 } 6243 chain = latest; 6244 c++; 6245 } 6246 if (chain == NULL) { /* Loop ran zero times. */ 6247 chain = reg_node(pRExC_state, NOTHING); 6248 if (ret == NULL) 6249 ret = chain; 6250 } 6251 if (c == 1) { 6252 *flagp |= flags&SIMPLE; 6253 } 6254 6255 return ret; 6256 } 6257 6258 /* 6259 - regpiece - something followed by possible [*+?] 6260 * 6261 * Note that the branching code sequences used for ? and the general cases 6262 * of * and + are somewhat optimized: they use the same NOTHING node as 6263 * both the endmarker for their branch list and the body of the last branch. 6264 * It might seem that this node could be dispensed with entirely, but the 6265 * endmarker role is not redundant. 6266 */ 6267 STATIC regnode * 6268 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) 6269 { 6270 dVAR; 6271 register regnode *ret; 6272 register char op; 6273 register char *next; 6274 I32 flags; 6275 const char * const origparse = RExC_parse; 6276 I32 min; 6277 I32 max = REG_INFTY; 6278 char *parse_start; 6279 const char *maxpos = NULL; 6280 GET_RE_DEBUG_FLAGS_DECL; 6281 DEBUG_PARSE("piec"); 6282 6283 ret = regatom(pRExC_state, &flags,depth+1); 6284 if (ret == NULL) { 6285 if (flags & TRYAGAIN) 6286 *flagp |= TRYAGAIN; 6287 return(NULL); 6288 } 6289 6290 op = *RExC_parse; 6291 6292 if (op == '{' && regcurly(RExC_parse)) { 6293 maxpos = NULL; 6294 parse_start = RExC_parse; /* MJD */ 6295 next = RExC_parse + 1; 6296 while (isDIGIT(*next) || *next == ',') { 6297 if (*next == ',') { 6298 if (maxpos) 6299 break; 6300 else 6301 maxpos = next; 6302 } 6303 next++; 6304 } 6305 if (*next == '}') { /* got one */ 6306 if (!maxpos) 6307 maxpos = next; 6308 RExC_parse++; 6309 min = atoi(RExC_parse); 6310 if (*maxpos == ',') 6311 maxpos++; 6312 else 6313 maxpos = RExC_parse; 6314 max = atoi(maxpos); 6315 if (!max && *maxpos != '0') 6316 max = REG_INFTY; /* meaning "infinity" */ 6317 else if (max >= REG_INFTY) 6318 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1); 6319 RExC_parse = next; 6320 nextchar(pRExC_state); 6321 6322 do_curly: 6323 if ((flags&SIMPLE)) { 6324 RExC_naughty += 2 + RExC_naughty / 2; 6325 reginsert(pRExC_state, CURLY, ret, depth+1); 6326 Set_Node_Offset(ret, parse_start+1); /* MJD */ 6327 Set_Node_Cur_Length(ret); 6328 } 6329 else { 6330 regnode * const w = reg_node(pRExC_state, WHILEM); 6331 6332 w->flags = 0; 6333 REGTAIL(pRExC_state, ret, w); 6334 if (!SIZE_ONLY && RExC_extralen) { 6335 reginsert(pRExC_state, LONGJMP,ret, depth+1); 6336 reginsert(pRExC_state, NOTHING,ret, depth+1); 6337 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */ 6338 } 6339 reginsert(pRExC_state, CURLYX,ret, depth+1); 6340 /* MJD hk */ 6341 Set_Node_Offset(ret, parse_start+1); 6342 Set_Node_Length(ret, 6343 op == '{' ? (RExC_parse - parse_start) : 1); 6344 6345 if (!SIZE_ONLY && RExC_extralen) 6346 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */ 6347 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING)); 6348 if (SIZE_ONLY) 6349 RExC_whilem_seen++, RExC_extralen += 3; 6350 RExC_naughty += 4 + RExC_naughty; /* compound interest */ 6351 } 6352 ret->flags = 0; 6353 6354 if (min > 0) 6355 *flagp = WORST; 6356 if (max > 0) 6357 *flagp |= HASWIDTH; 6358 if (max && max < min) 6359 vFAIL("Can't do {n,m} with n > m"); 6360 if (!SIZE_ONLY) { 6361 ARG1_SET(ret, (U16)min); 6362 ARG2_SET(ret, (U16)max); 6363 } 6364 6365 goto nest_check; 6366 } 6367 } 6368 6369 if (!ISMULT1(op)) { 6370 *flagp = flags; 6371 return(ret); 6372 } 6373 6374 #if 0 /* Now runtime fix should be reliable. */ 6375 6376 /* if this is reinstated, don't forget to put this back into perldiag: 6377 6378 =item Regexp *+ operand could be empty at {#} in regex m/%s/ 6379 6380 (F) The part of the regexp subject to either the * or + quantifier 6381 could match an empty string. The {#} shows in the regular 6382 expression about where the problem was discovered. 6383 6384 */ 6385 6386 if (!(flags&HASWIDTH) && op != '?') 6387 vFAIL("Regexp *+ operand could be empty"); 6388 #endif 6389 6390 parse_start = RExC_parse; 6391 nextchar(pRExC_state); 6392 6393 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH); 6394 6395 if (op == '*' && (flags&SIMPLE)) { 6396 reginsert(pRExC_state, STAR, ret, depth+1); 6397 ret->flags = 0; 6398 RExC_naughty += 4; 6399 } 6400 else if (op == '*') { 6401 min = 0; 6402 goto do_curly; 6403 } 6404 else if (op == '+' && (flags&SIMPLE)) { 6405 reginsert(pRExC_state, PLUS, ret, depth+1); 6406 ret->flags = 0; 6407 RExC_naughty += 3; 6408 } 6409 else if (op == '+') { 6410 min = 1; 6411 goto do_curly; 6412 } 6413 else if (op == '?') { 6414 min = 0; max = 1; 6415 goto do_curly; 6416 } 6417 nest_check: 6418 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) { 6419 vWARN3(RExC_parse, 6420 "%.*s matches null string many times", 6421 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0), 6422 origparse); 6423 } 6424 6425 if (RExC_parse < RExC_end && *RExC_parse == '?') { 6426 nextchar(pRExC_state); 6427 reginsert(pRExC_state, MINMOD, ret, depth+1); 6428 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE); 6429 } 6430 #ifndef REG_ALLOW_MINMOD_SUSPEND 6431 else 6432 #endif 6433 if (RExC_parse < RExC_end && *RExC_parse == '+') { 6434 regnode *ender; 6435 nextchar(pRExC_state); 6436 ender = reg_node(pRExC_state, SUCCEED); 6437 REGTAIL(pRExC_state, ret, ender); 6438 reginsert(pRExC_state, SUSPEND, ret, depth+1); 6439 ret->flags = 0; 6440 ender = reg_node(pRExC_state, TAIL); 6441 REGTAIL(pRExC_state, ret, ender); 6442 /*ret= ender;*/ 6443 } 6444 6445 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) { 6446 RExC_parse++; 6447 vFAIL("Nested quantifiers"); 6448 } 6449 6450 return(ret); 6451 } 6452 6453 6454 /* reg_namedseq(pRExC_state,UVp) 6455 6456 This is expected to be called by a parser routine that has 6457 recognized'\N' and needs to handle the rest. RExC_parse is 6458 expected to point at the first char following the N at the time 6459 of the call. 6460 6461 If valuep is non-null then it is assumed that we are parsing inside 6462 of a charclass definition and the first codepoint in the resolved 6463 string is returned via *valuep and the routine will return NULL. 6464 In this mode if a multichar string is returned from the charnames 6465 handler a warning will be issued, and only the first char in the 6466 sequence will be examined. If the string returned is zero length 6467 then the value of *valuep is undefined and NON-NULL will 6468 be returned to indicate failure. (This will NOT be a valid pointer 6469 to a regnode.) 6470 6471 If value is null then it is assumed that we are parsing normal text 6472 and inserts a new EXACT node into the program containing the resolved 6473 string and returns a pointer to the new node. If the string is 6474 zerolength a NOTHING node is emitted. 6475 6476 On success RExC_parse is set to the char following the endbrace. 6477 Parsing failures will generate a fatal errorvia vFAIL(...) 6478 6479 NOTE: We cache all results from the charnames handler locally in 6480 the RExC_charnames hash (created on first use) to prevent a charnames 6481 handler from playing silly-buggers and returning a short string and 6482 then a long string for a given pattern. Since the regexp program 6483 size is calculated during an initial parse this would result 6484 in a buffer overrun so we cache to prevent the charname result from 6485 changing during the course of the parse. 6486 6487 */ 6488 STATIC regnode * 6489 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep) 6490 { 6491 char * name; /* start of the content of the name */ 6492 char * endbrace; /* endbrace following the name */ 6493 SV *sv_str = NULL; 6494 SV *sv_name = NULL; 6495 STRLEN len; /* this has various purposes throughout the code */ 6496 bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */ 6497 regnode *ret = NULL; 6498 6499 if (*RExC_parse != '{') { 6500 vFAIL("Missing braces on \\N{}"); 6501 } 6502 name = RExC_parse+1; 6503 endbrace = strchr(RExC_parse, '}'); 6504 if ( ! endbrace ) { 6505 RExC_parse++; 6506 vFAIL("Missing right brace on \\N{}"); 6507 } 6508 RExC_parse = endbrace + 1; 6509 6510 6511 /* RExC_parse points at the beginning brace, 6512 endbrace points at the last */ 6513 if ( name[0]=='U' && name[1]=='+' ) { 6514 /* its a "Unicode hex" notation {U+89AB} */ 6515 I32 fl = PERL_SCAN_ALLOW_UNDERSCORES 6516 | PERL_SCAN_DISALLOW_PREFIX 6517 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0); 6518 UV cp; 6519 char string; 6520 len = (STRLEN)(endbrace - name - 2); 6521 cp = grok_hex(name + 2, &len, &fl, NULL); 6522 if ( len != (STRLEN)(endbrace - name - 2) ) { 6523 cp = 0xFFFD; 6524 } 6525 if (cp > 0xff) 6526 RExC_utf8 = 1; 6527 if ( valuep ) { 6528 *valuep = cp; 6529 return NULL; 6530 } 6531 string = (char)cp; 6532 sv_str= newSVpvn(&string, 1); 6533 } else { 6534 /* fetch the charnames handler for this scope */ 6535 HV * const table = GvHV(PL_hintgv); 6536 SV **cvp= table ? 6537 hv_fetchs(table, "charnames", FALSE) : 6538 NULL; 6539 SV *cv= cvp ? *cvp : NULL; 6540 HE *he_str; 6541 int count; 6542 /* create an SV with the name as argument */ 6543 sv_name = newSVpvn(name, endbrace - name); 6544 6545 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) { 6546 vFAIL2("Constant(\\N{%s}) unknown: " 6547 "(possibly a missing \"use charnames ...\")", 6548 SvPVX(sv_name)); 6549 } 6550 if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */ 6551 vFAIL2("Constant(\\N{%s}): " 6552 "$^H{charnames} is not defined",SvPVX(sv_name)); 6553 } 6554 6555 6556 6557 if (!RExC_charnames) { 6558 /* make sure our cache is allocated */ 6559 RExC_charnames = newHV(); 6560 sv_2mortal((SV*)RExC_charnames); 6561 } 6562 /* see if we have looked this one up before */ 6563 he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 ); 6564 if ( he_str ) { 6565 sv_str = HeVAL(he_str); 6566 cached = 1; 6567 } else { 6568 dSP ; 6569 6570 ENTER ; 6571 SAVETMPS ; 6572 PUSHMARK(SP) ; 6573 6574 XPUSHs(sv_name); 6575 6576 PUTBACK ; 6577 6578 count= call_sv(cv, G_SCALAR); 6579 6580 if (count == 1) { /* XXXX is this right? dmq */ 6581 sv_str = POPs; 6582 SvREFCNT_inc_simple_void(sv_str); 6583 } 6584 6585 SPAGAIN ; 6586 PUTBACK ; 6587 FREETMPS ; 6588 LEAVE ; 6589 6590 if ( !sv_str || !SvOK(sv_str) ) { 6591 vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} " 6592 "did not return a defined value",SvPVX(sv_name)); 6593 } 6594 if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0)) 6595 cached = 1; 6596 } 6597 } 6598 if (valuep) { 6599 char *p = SvPV(sv_str, len); 6600 if (len) { 6601 STRLEN numlen = 1; 6602 if ( SvUTF8(sv_str) ) { 6603 *valuep = utf8_to_uvchr((U8*)p, &numlen); 6604 if (*valuep > 0x7F) 6605 RExC_utf8 = 1; 6606 /* XXXX 6607 We have to turn on utf8 for high bit chars otherwise 6608 we get failures with 6609 6610 "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i 6611 "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i 6612 6613 This is different from what \x{} would do with the same 6614 codepoint, where the condition is > 0xFF. 6615 - dmq 6616 */ 6617 6618 6619 } else { 6620 *valuep = (UV)*p; 6621 /* warn if we havent used the whole string? */ 6622 } 6623 if (numlen<len && SIZE_ONLY && ckWARN(WARN_REGEXP)) { 6624 vWARN2(RExC_parse, 6625 "Ignoring excess chars from \\N{%s} in character class", 6626 SvPVX(sv_name) 6627 ); 6628 } 6629 } else if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { 6630 vWARN2(RExC_parse, 6631 "Ignoring zero length \\N{%s} in character class", 6632 SvPVX(sv_name) 6633 ); 6634 } 6635 if (sv_name) 6636 SvREFCNT_dec(sv_name); 6637 if (!cached) 6638 SvREFCNT_dec(sv_str); 6639 return len ? NULL : (regnode *)&len; 6640 } else if(SvCUR(sv_str)) { 6641 6642 char *s; 6643 char *p, *pend; 6644 STRLEN charlen = 1; 6645 #ifdef DEBUGGING 6646 char * parse_start = name-3; /* needed for the offsets */ 6647 #endif 6648 GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */ 6649 6650 ret = reg_node(pRExC_state, 6651 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT)); 6652 s= STRING(ret); 6653 6654 if ( RExC_utf8 && !SvUTF8(sv_str) ) { 6655 sv_utf8_upgrade(sv_str); 6656 } else if ( !RExC_utf8 && SvUTF8(sv_str) ) { 6657 RExC_utf8= 1; 6658 } 6659 6660 p = SvPV(sv_str, len); 6661 pend = p + len; 6662 /* len is the length written, charlen is the size the char read */ 6663 for ( len = 0; p < pend; p += charlen ) { 6664 if (UTF) { 6665 UV uvc = utf8_to_uvchr((U8*)p, &charlen); 6666 if (FOLD) { 6667 STRLEN foldlen,numlen; 6668 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf; 6669 uvc = toFOLD_uni(uvc, tmpbuf, &foldlen); 6670 /* Emit all the Unicode characters. */ 6671 6672 for (foldbuf = tmpbuf; 6673 foldlen; 6674 foldlen -= numlen) 6675 { 6676 uvc = utf8_to_uvchr(foldbuf, &numlen); 6677 if (numlen > 0) { 6678 const STRLEN unilen = reguni(pRExC_state, uvc, s); 6679 s += unilen; 6680 len += unilen; 6681 /* In EBCDIC the numlen 6682 * and unilen can differ. */ 6683 foldbuf += numlen; 6684 if (numlen >= foldlen) 6685 break; 6686 } 6687 else 6688 break; /* "Can't happen." */ 6689 } 6690 } else { 6691 const STRLEN unilen = reguni(pRExC_state, uvc, s); 6692 if (unilen > 0) { 6693 s += unilen; 6694 len += unilen; 6695 } 6696 } 6697 } else { 6698 len++; 6699 REGC(*p, s++); 6700 } 6701 } 6702 if (SIZE_ONLY) { 6703 RExC_size += STR_SZ(len); 6704 } else { 6705 STR_LEN(ret) = len; 6706 RExC_emit += STR_SZ(len); 6707 } 6708 Set_Node_Cur_Length(ret); /* MJD */ 6709 RExC_parse--; 6710 nextchar(pRExC_state); 6711 } else { 6712 ret = reg_node(pRExC_state,NOTHING); 6713 } 6714 if (!cached) { 6715 SvREFCNT_dec(sv_str); 6716 } 6717 if (sv_name) { 6718 SvREFCNT_dec(sv_name); 6719 } 6720 return ret; 6721 6722 } 6723 6724 6725 /* 6726 * reg_recode 6727 * 6728 * It returns the code point in utf8 for the value in *encp. 6729 * value: a code value in the source encoding 6730 * encp: a pointer to an Encode object 6731 * 6732 * If the result from Encode is not a single character, 6733 * it returns U+FFFD (Replacement character) and sets *encp to NULL. 6734 */ 6735 STATIC UV 6736 S_reg_recode(pTHX_ const char value, SV **encp) 6737 { 6738 STRLEN numlen = 1; 6739 SV * const sv = sv_2mortal(newSVpvn(&value, numlen)); 6740 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv); 6741 const STRLEN newlen = SvCUR(sv); 6742 UV uv = UNICODE_REPLACEMENT; 6743 6744 if (newlen) 6745 uv = SvUTF8(sv) 6746 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT) 6747 : *(U8*)s; 6748 6749 if (!newlen || numlen != newlen) { 6750 uv = UNICODE_REPLACEMENT; 6751 *encp = NULL; 6752 } 6753 return uv; 6754 } 6755 6756 6757 /* 6758 - regatom - the lowest level 6759 6760 Try to identify anything special at the start of the pattern. If there 6761 is, then handle it as required. This may involve generating a single regop, 6762 such as for an assertion; or it may involve recursing, such as to 6763 handle a () structure. 6764 6765 If the string doesn't start with something special then we gobble up 6766 as much literal text as we can. 6767 6768 Once we have been able to handle whatever type of thing started the 6769 sequence, we return. 6770 6771 Note: we have to be careful with escapes, as they can be both literal 6772 and special, and in the case of \10 and friends can either, depending 6773 on context. Specifically there are two seperate switches for handling 6774 escape sequences, with the one for handling literal escapes requiring 6775 a dummy entry for all of the special escapes that are actually handled 6776 by the other. 6777 */ 6778 6779 STATIC regnode * 6780 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) 6781 { 6782 dVAR; 6783 register regnode *ret = NULL; 6784 I32 flags; 6785 char *parse_start = RExC_parse; 6786 GET_RE_DEBUG_FLAGS_DECL; 6787 DEBUG_PARSE("atom"); 6788 *flagp = WORST; /* Tentatively. */ 6789 6790 6791 tryagain: 6792 switch ((U8)*RExC_parse) { 6793 case '^': 6794 RExC_seen_zerolen++; 6795 nextchar(pRExC_state); 6796 if (RExC_flags & RXf_PMf_MULTILINE) 6797 ret = reg_node(pRExC_state, MBOL); 6798 else if (RExC_flags & RXf_PMf_SINGLELINE) 6799 ret = reg_node(pRExC_state, SBOL); 6800 else 6801 ret = reg_node(pRExC_state, BOL); 6802 Set_Node_Length(ret, 1); /* MJD */ 6803 break; 6804 case '$': 6805 nextchar(pRExC_state); 6806 if (*RExC_parse) 6807 RExC_seen_zerolen++; 6808 if (RExC_flags & RXf_PMf_MULTILINE) 6809 ret = reg_node(pRExC_state, MEOL); 6810 else if (RExC_flags & RXf_PMf_SINGLELINE) 6811 ret = reg_node(pRExC_state, SEOL); 6812 else 6813 ret = reg_node(pRExC_state, EOL); 6814 Set_Node_Length(ret, 1); /* MJD */ 6815 break; 6816 case '.': 6817 nextchar(pRExC_state); 6818 if (RExC_flags & RXf_PMf_SINGLELINE) 6819 ret = reg_node(pRExC_state, SANY); 6820 else 6821 ret = reg_node(pRExC_state, REG_ANY); 6822 *flagp |= HASWIDTH|SIMPLE; 6823 RExC_naughty++; 6824 Set_Node_Length(ret, 1); /* MJD */ 6825 break; 6826 case '[': 6827 { 6828 char * const oregcomp_parse = ++RExC_parse; 6829 ret = regclass(pRExC_state,depth+1); 6830 if (*RExC_parse != ']') { 6831 RExC_parse = oregcomp_parse; 6832 vFAIL("Unmatched ["); 6833 } 6834 nextchar(pRExC_state); 6835 *flagp |= HASWIDTH|SIMPLE; 6836 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */ 6837 break; 6838 } 6839 case '(': 6840 nextchar(pRExC_state); 6841 ret = reg(pRExC_state, 1, &flags,depth+1); 6842 if (ret == NULL) { 6843 if (flags & TRYAGAIN) { 6844 if (RExC_parse == RExC_end) { 6845 /* Make parent create an empty node if needed. */ 6846 *flagp |= TRYAGAIN; 6847 return(NULL); 6848 } 6849 goto tryagain; 6850 } 6851 return(NULL); 6852 } 6853 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED); 6854 break; 6855 case '|': 6856 case ')': 6857 if (flags & TRYAGAIN) { 6858 *flagp |= TRYAGAIN; 6859 return NULL; 6860 } 6861 vFAIL("Internal urp"); 6862 /* Supposed to be caught earlier. */ 6863 break; 6864 case '{': 6865 if (!regcurly(RExC_parse)) { 6866 RExC_parse++; 6867 goto defchar; 6868 } 6869 /* FALL THROUGH */ 6870 case '?': 6871 case '+': 6872 case '*': 6873 RExC_parse++; 6874 vFAIL("Quantifier follows nothing"); 6875 break; 6876 case 0xDF: 6877 case 0xC3: 6878 case 0xCE: 6879 do_foldchar: 6880 if (!LOC && FOLD) { 6881 U32 len,cp; 6882 len=0; /* silence a spurious compiler warning */ 6883 if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) { 6884 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */ 6885 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */ 6886 ret = reganode(pRExC_state, FOLDCHAR, cp); 6887 Set_Node_Length(ret, 1); /* MJD */ 6888 nextchar(pRExC_state); /* kill whitespace under /x */ 6889 return ret; 6890 } 6891 } 6892 goto outer_default; 6893 case '\\': 6894 /* Special Escapes 6895 6896 This switch handles escape sequences that resolve to some kind 6897 of special regop and not to literal text. Escape sequnces that 6898 resolve to literal text are handled below in the switch marked 6899 "Literal Escapes". 6900 6901 Every entry in this switch *must* have a corresponding entry 6902 in the literal escape switch. However, the opposite is not 6903 required, as the default for this switch is to jump to the 6904 literal text handling code. 6905 */ 6906 switch ((U8)*++RExC_parse) { 6907 case 0xDF: 6908 case 0xC3: 6909 case 0xCE: 6910 goto do_foldchar; 6911 /* Special Escapes */ 6912 case 'A': 6913 RExC_seen_zerolen++; 6914 ret = reg_node(pRExC_state, SBOL); 6915 *flagp |= SIMPLE; 6916 goto finish_meta_pat; 6917 case 'G': 6918 ret = reg_node(pRExC_state, GPOS); 6919 RExC_seen |= REG_SEEN_GPOS; 6920 *flagp |= SIMPLE; 6921 goto finish_meta_pat; 6922 case 'K': 6923 RExC_seen_zerolen++; 6924 ret = reg_node(pRExC_state, KEEPS); 6925 *flagp |= SIMPLE; 6926 /* XXX:dmq : disabling in-place substitution seems to 6927 * be necessary here to avoid cases of memory corruption, as 6928 * with: C<$_="x" x 80; s/x\K/y/> -- rgs 6929 */ 6930 RExC_seen |= REG_SEEN_LOOKBEHIND; 6931 goto finish_meta_pat; 6932 case 'Z': 6933 ret = reg_node(pRExC_state, SEOL); 6934 *flagp |= SIMPLE; 6935 RExC_seen_zerolen++; /* Do not optimize RE away */ 6936 goto finish_meta_pat; 6937 case 'z': 6938 ret = reg_node(pRExC_state, EOS); 6939 *flagp |= SIMPLE; 6940 RExC_seen_zerolen++; /* Do not optimize RE away */ 6941 goto finish_meta_pat; 6942 case 'C': 6943 ret = reg_node(pRExC_state, CANY); 6944 RExC_seen |= REG_SEEN_CANY; 6945 *flagp |= HASWIDTH|SIMPLE; 6946 goto finish_meta_pat; 6947 case 'X': 6948 ret = reg_node(pRExC_state, CLUMP); 6949 *flagp |= HASWIDTH; 6950 goto finish_meta_pat; 6951 case 'w': 6952 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM)); 6953 *flagp |= HASWIDTH|SIMPLE; 6954 goto finish_meta_pat; 6955 case 'W': 6956 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM)); 6957 *flagp |= HASWIDTH|SIMPLE; 6958 goto finish_meta_pat; 6959 case 'b': 6960 RExC_seen_zerolen++; 6961 RExC_seen |= REG_SEEN_LOOKBEHIND; 6962 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND)); 6963 *flagp |= SIMPLE; 6964 goto finish_meta_pat; 6965 case 'B': 6966 RExC_seen_zerolen++; 6967 RExC_seen |= REG_SEEN_LOOKBEHIND; 6968 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND)); 6969 *flagp |= SIMPLE; 6970 goto finish_meta_pat; 6971 case 's': 6972 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE)); 6973 *flagp |= HASWIDTH|SIMPLE; 6974 goto finish_meta_pat; 6975 case 'S': 6976 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE)); 6977 *flagp |= HASWIDTH|SIMPLE; 6978 goto finish_meta_pat; 6979 case 'd': 6980 ret = reg_node(pRExC_state, DIGIT); 6981 *flagp |= HASWIDTH|SIMPLE; 6982 goto finish_meta_pat; 6983 case 'D': 6984 ret = reg_node(pRExC_state, NDIGIT); 6985 *flagp |= HASWIDTH|SIMPLE; 6986 goto finish_meta_pat; 6987 case 'R': 6988 ret = reg_node(pRExC_state, LNBREAK); 6989 *flagp |= HASWIDTH|SIMPLE; 6990 goto finish_meta_pat; 6991 case 'h': 6992 ret = reg_node(pRExC_state, HORIZWS); 6993 *flagp |= HASWIDTH|SIMPLE; 6994 goto finish_meta_pat; 6995 case 'H': 6996 ret = reg_node(pRExC_state, NHORIZWS); 6997 *flagp |= HASWIDTH|SIMPLE; 6998 goto finish_meta_pat; 6999 case 'v': 7000 ret = reg_node(pRExC_state, VERTWS); 7001 *flagp |= HASWIDTH|SIMPLE; 7002 goto finish_meta_pat; 7003 case 'V': 7004 ret = reg_node(pRExC_state, NVERTWS); 7005 *flagp |= HASWIDTH|SIMPLE; 7006 finish_meta_pat: 7007 nextchar(pRExC_state); 7008 Set_Node_Length(ret, 2); /* MJD */ 7009 break; 7010 case 'p': 7011 case 'P': 7012 { 7013 char* const oldregxend = RExC_end; 7014 #ifdef DEBUGGING 7015 char* parse_start = RExC_parse - 2; 7016 #endif 7017 7018 if (RExC_parse[1] == '{') { 7019 /* a lovely hack--pretend we saw [\pX] instead */ 7020 RExC_end = strchr(RExC_parse, '}'); 7021 if (!RExC_end) { 7022 const U8 c = (U8)*RExC_parse; 7023 RExC_parse += 2; 7024 RExC_end = oldregxend; 7025 vFAIL2("Missing right brace on \\%c{}", c); 7026 } 7027 RExC_end++; 7028 } 7029 else { 7030 RExC_end = RExC_parse + 2; 7031 if (RExC_end > oldregxend) 7032 RExC_end = oldregxend; 7033 } 7034 RExC_parse--; 7035 7036 ret = regclass(pRExC_state,depth+1); 7037 7038 RExC_end = oldregxend; 7039 RExC_parse--; 7040 7041 Set_Node_Offset(ret, parse_start + 2); 7042 Set_Node_Cur_Length(ret); 7043 nextchar(pRExC_state); 7044 *flagp |= HASWIDTH|SIMPLE; 7045 } 7046 break; 7047 case 'N': 7048 /* Handle \N{NAME} here and not below because it can be 7049 multicharacter. join_exact() will join them up later on. 7050 Also this makes sure that things like /\N{BLAH}+/ and 7051 \N{BLAH} being multi char Just Happen. dmq*/ 7052 ++RExC_parse; 7053 ret= reg_namedseq(pRExC_state, NULL); 7054 break; 7055 case 'k': /* Handle \k<NAME> and \k'NAME' */ 7056 parse_named_seq: 7057 { 7058 char ch= RExC_parse[1]; 7059 if (ch != '<' && ch != '\'' && ch != '{') { 7060 RExC_parse++; 7061 vFAIL2("Sequence %.2s... not terminated",parse_start); 7062 } else { 7063 /* this pretty much dupes the code for (?P=...) in reg(), if 7064 you change this make sure you change that */ 7065 char* name_start = (RExC_parse += 2); 7066 U32 num = 0; 7067 SV *sv_dat = reg_scan_name(pRExC_state, 7068 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); 7069 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\''; 7070 if (RExC_parse == name_start || *RExC_parse != ch) 7071 vFAIL2("Sequence %.3s... not terminated",parse_start); 7072 7073 if (!SIZE_ONLY) { 7074 num = add_data( pRExC_state, 1, "S" ); 7075 RExC_rxi->data->data[num]=(void*)sv_dat; 7076 SvREFCNT_inc_simple_void(sv_dat); 7077 } 7078 7079 RExC_sawback = 1; 7080 ret = reganode(pRExC_state, 7081 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF), 7082 num); 7083 *flagp |= HASWIDTH; 7084 7085 /* override incorrect value set in reganode MJD */ 7086 Set_Node_Offset(ret, parse_start+1); 7087 Set_Node_Cur_Length(ret); /* MJD */ 7088 nextchar(pRExC_state); 7089 7090 } 7091 break; 7092 } 7093 case 'g': 7094 case '1': case '2': case '3': case '4': 7095 case '5': case '6': case '7': case '8': case '9': 7096 { 7097 I32 num; 7098 bool isg = *RExC_parse == 'g'; 7099 bool isrel = 0; 7100 bool hasbrace = 0; 7101 if (isg) { 7102 RExC_parse++; 7103 if (*RExC_parse == '{') { 7104 RExC_parse++; 7105 hasbrace = 1; 7106 } 7107 if (*RExC_parse == '-') { 7108 RExC_parse++; 7109 isrel = 1; 7110 } 7111 if (hasbrace && !isDIGIT(*RExC_parse)) { 7112 if (isrel) RExC_parse--; 7113 RExC_parse -= 2; 7114 goto parse_named_seq; 7115 } } 7116 num = atoi(RExC_parse); 7117 if (isg && num == 0) 7118 vFAIL("Reference to invalid group 0"); 7119 if (isrel) { 7120 num = RExC_npar - num; 7121 if (num < 1) 7122 vFAIL("Reference to nonexistent or unclosed group"); 7123 } 7124 if (!isg && num > 9 && num >= RExC_npar) 7125 goto defchar; 7126 else { 7127 char * const parse_start = RExC_parse - 1; /* MJD */ 7128 while (isDIGIT(*RExC_parse)) 7129 RExC_parse++; 7130 if (parse_start == RExC_parse - 1) 7131 vFAIL("Unterminated \\g... pattern"); 7132 if (hasbrace) { 7133 if (*RExC_parse != '}') 7134 vFAIL("Unterminated \\g{...} pattern"); 7135 RExC_parse++; 7136 } 7137 if (!SIZE_ONLY) { 7138 if (num > (I32)RExC_rx->nparens) 7139 vFAIL("Reference to nonexistent group"); 7140 } 7141 RExC_sawback = 1; 7142 ret = reganode(pRExC_state, 7143 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF), 7144 num); 7145 *flagp |= HASWIDTH; 7146 7147 /* override incorrect value set in reganode MJD */ 7148 Set_Node_Offset(ret, parse_start+1); 7149 Set_Node_Cur_Length(ret); /* MJD */ 7150 RExC_parse--; 7151 nextchar(pRExC_state); 7152 } 7153 } 7154 break; 7155 case '\0': 7156 if (RExC_parse >= RExC_end) 7157 FAIL("Trailing \\"); 7158 /* FALL THROUGH */ 7159 default: 7160 /* Do not generate "unrecognized" warnings here, we fall 7161 back into the quick-grab loop below */ 7162 parse_start--; 7163 goto defchar; 7164 } 7165 break; 7166 7167 case '#': 7168 if (RExC_flags & RXf_PMf_EXTENDED) { 7169 if ( reg_skipcomment( pRExC_state ) ) 7170 goto tryagain; 7171 } 7172 /* FALL THROUGH */ 7173 7174 default: 7175 outer_default:{ 7176 register STRLEN len; 7177 register UV ender; 7178 register char *p; 7179 char *s; 7180 STRLEN foldlen; 7181 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf; 7182 7183 parse_start = RExC_parse - 1; 7184 7185 RExC_parse++; 7186 7187 defchar: 7188 ender = 0; 7189 ret = reg_node(pRExC_state, 7190 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT)); 7191 s = STRING(ret); 7192 for (len = 0, p = RExC_parse - 1; 7193 len < 127 && p < RExC_end; 7194 len++) 7195 { 7196 char * const oldp = p; 7197 7198 if (RExC_flags & RXf_PMf_EXTENDED) 7199 p = regwhite( pRExC_state, p ); 7200 switch ((U8)*p) { 7201 case 0xDF: 7202 case 0xC3: 7203 case 0xCE: 7204 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF)) 7205 goto normal_default; 7206 case '^': 7207 case '$': 7208 case '.': 7209 case '[': 7210 case '(': 7211 case ')': 7212 case '|': 7213 goto loopdone; 7214 case '\\': 7215 /* Literal Escapes Switch 7216 7217 This switch is meant to handle escape sequences that 7218 resolve to a literal character. 7219 7220 Every escape sequence that represents something 7221 else, like an assertion or a char class, is handled 7222 in the switch marked 'Special Escapes' above in this 7223 routine, but also has an entry here as anything that 7224 isn't explicitly mentioned here will be treated as 7225 an unescaped equivalent literal. 7226 */ 7227 7228 switch ((U8)*++p) { 7229 /* These are all the special escapes. */ 7230 case 0xDF: 7231 case 0xC3: 7232 case 0xCE: 7233 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF)) 7234 goto normal_default; 7235 case 'A': /* Start assertion */ 7236 case 'b': case 'B': /* Word-boundary assertion*/ 7237 case 'C': /* Single char !DANGEROUS! */ 7238 case 'd': case 'D': /* digit class */ 7239 case 'g': case 'G': /* generic-backref, pos assertion */ 7240 case 'h': case 'H': /* HORIZWS */ 7241 case 'k': case 'K': /* named backref, keep marker */ 7242 case 'N': /* named char sequence */ 7243 case 'p': case 'P': /* Unicode property */ 7244 case 'R': /* LNBREAK */ 7245 case 's': case 'S': /* space class */ 7246 case 'v': case 'V': /* VERTWS */ 7247 case 'w': case 'W': /* word class */ 7248 case 'X': /* eXtended Unicode "combining character sequence" */ 7249 case 'z': case 'Z': /* End of line/string assertion */ 7250 --p; 7251 goto loopdone; 7252 7253 /* Anything after here is an escape that resolves to a 7254 literal. (Except digits, which may or may not) 7255 */ 7256 case 'n': 7257 ender = '\n'; 7258 p++; 7259 break; 7260 case 'r': 7261 ender = '\r'; 7262 p++; 7263 break; 7264 case 't': 7265 ender = '\t'; 7266 p++; 7267 break; 7268 case 'f': 7269 ender = '\f'; 7270 p++; 7271 break; 7272 case 'e': 7273 ender = ASCII_TO_NATIVE('\033'); 7274 p++; 7275 break; 7276 case 'a': 7277 ender = ASCII_TO_NATIVE('\007'); 7278 p++; 7279 break; 7280 case 'x': 7281 if (*++p == '{') { 7282 char* const e = strchr(p, '}'); 7283 7284 if (!e) { 7285 RExC_parse = p + 1; 7286 vFAIL("Missing right brace on \\x{}"); 7287 } 7288 else { 7289 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES 7290 | PERL_SCAN_DISALLOW_PREFIX; 7291 STRLEN numlen = e - p - 1; 7292 ender = grok_hex(p + 1, &numlen, &flags, NULL); 7293 if (ender > 0xff) 7294 RExC_utf8 = 1; 7295 p = e + 1; 7296 } 7297 } 7298 else { 7299 I32 flags = PERL_SCAN_DISALLOW_PREFIX; 7300 STRLEN numlen = 2; 7301 ender = grok_hex(p, &numlen, &flags, NULL); 7302 p += numlen; 7303 } 7304 if (PL_encoding && ender < 0x100) 7305 goto recode_encoding; 7306 break; 7307 case 'c': 7308 p++; 7309 ender = UCHARAT(p++); 7310 ender = toCTRL(ender); 7311 break; 7312 case '0': case '1': case '2': case '3':case '4': 7313 case '5': case '6': case '7': case '8':case '9': 7314 if (*p == '0' || 7315 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) { 7316 I32 flags = 0; 7317 STRLEN numlen = 3; 7318 ender = grok_oct(p, &numlen, &flags, NULL); 7319 p += numlen; 7320 } 7321 else { 7322 --p; 7323 goto loopdone; 7324 } 7325 if (PL_encoding && ender < 0x100) 7326 goto recode_encoding; 7327 break; 7328 recode_encoding: 7329 { 7330 SV* enc = PL_encoding; 7331 ender = reg_recode((const char)(U8)ender, &enc); 7332 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP)) 7333 vWARN(p, "Invalid escape in the specified encoding"); 7334 RExC_utf8 = 1; 7335 } 7336 break; 7337 case '\0': 7338 if (p >= RExC_end) 7339 FAIL("Trailing \\"); 7340 /* FALL THROUGH */ 7341 default: 7342 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP)) 7343 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p)); 7344 goto normal_default; 7345 } 7346 break; 7347 default: 7348 normal_default: 7349 if (UTF8_IS_START(*p) && UTF) { 7350 STRLEN numlen; 7351 ender = utf8n_to_uvchr((U8*)p, RExC_end - p, 7352 &numlen, UTF8_ALLOW_DEFAULT); 7353 p += numlen; 7354 } 7355 else 7356 ender = *p++; 7357 break; 7358 } 7359 if ( RExC_flags & RXf_PMf_EXTENDED) 7360 p = regwhite( pRExC_state, p ); 7361 if (UTF && FOLD) { 7362 /* Prime the casefolded buffer. */ 7363 ender = toFOLD_uni(ender, tmpbuf, &foldlen); 7364 } 7365 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */ 7366 if (len) 7367 p = oldp; 7368 else if (UTF) { 7369 if (FOLD) { 7370 /* Emit all the Unicode characters. */ 7371 STRLEN numlen; 7372 for (foldbuf = tmpbuf; 7373 foldlen; 7374 foldlen -= numlen) { 7375 ender = utf8_to_uvchr(foldbuf, &numlen); 7376 if (numlen > 0) { 7377 const STRLEN unilen = reguni(pRExC_state, ender, s); 7378 s += unilen; 7379 len += unilen; 7380 /* In EBCDIC the numlen 7381 * and unilen can differ. */ 7382 foldbuf += numlen; 7383 if (numlen >= foldlen) 7384 break; 7385 } 7386 else 7387 break; /* "Can't happen." */ 7388 } 7389 } 7390 else { 7391 const STRLEN unilen = reguni(pRExC_state, ender, s); 7392 if (unilen > 0) { 7393 s += unilen; 7394 len += unilen; 7395 } 7396 } 7397 } 7398 else { 7399 len++; 7400 REGC((char)ender, s++); 7401 } 7402 break; 7403 } 7404 if (UTF) { 7405 if (FOLD) { 7406 /* Emit all the Unicode characters. */ 7407 STRLEN numlen; 7408 for (foldbuf = tmpbuf; 7409 foldlen; 7410 foldlen -= numlen) { 7411 ender = utf8_to_uvchr(foldbuf, &numlen); 7412 if (numlen > 0) { 7413 const STRLEN unilen = reguni(pRExC_state, ender, s); 7414 len += unilen; 7415 s += unilen; 7416 /* In EBCDIC the numlen 7417 * and unilen can differ. */ 7418 foldbuf += numlen; 7419 if (numlen >= foldlen) 7420 break; 7421 } 7422 else 7423 break; 7424 } 7425 } 7426 else { 7427 const STRLEN unilen = reguni(pRExC_state, ender, s); 7428 if (unilen > 0) { 7429 s += unilen; 7430 len += unilen; 7431 } 7432 } 7433 len--; 7434 } 7435 else 7436 REGC((char)ender, s++); 7437 } 7438 loopdone: 7439 RExC_parse = p - 1; 7440 Set_Node_Cur_Length(ret); /* MJD */ 7441 nextchar(pRExC_state); 7442 { 7443 /* len is STRLEN which is unsigned, need to copy to signed */ 7444 IV iv = len; 7445 if (iv < 0) 7446 vFAIL("Internal disaster"); 7447 } 7448 if (len > 0) 7449 *flagp |= HASWIDTH; 7450 if (len == 1 && UNI_IS_INVARIANT(ender)) 7451 *flagp |= SIMPLE; 7452 7453 if (SIZE_ONLY) 7454 RExC_size += STR_SZ(len); 7455 else { 7456 STR_LEN(ret) = len; 7457 RExC_emit += STR_SZ(len); 7458 } 7459 } 7460 break; 7461 } 7462 7463 return(ret); 7464 } 7465 7466 STATIC char * 7467 S_regwhite( RExC_state_t *pRExC_state, char *p ) 7468 { 7469 const char *e = RExC_end; 7470 while (p < e) { 7471 if (isSPACE(*p)) 7472 ++p; 7473 else if (*p == '#') { 7474 bool ended = 0; 7475 do { 7476 if (*p++ == '\n') { 7477 ended = 1; 7478 break; 7479 } 7480 } while (p < e); 7481 if (!ended) 7482 RExC_seen |= REG_SEEN_RUN_ON_COMMENT; 7483 } 7484 else 7485 break; 7486 } 7487 return p; 7488 } 7489 7490 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]]. 7491 Character classes ([:foo:]) can also be negated ([:^foo:]). 7492 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise. 7493 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed, 7494 but trigger failures because they are currently unimplemented. */ 7495 7496 #define POSIXCC_DONE(c) ((c) == ':') 7497 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.') 7498 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c)) 7499 7500 STATIC I32 7501 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value) 7502 { 7503 dVAR; 7504 I32 namedclass = OOB_NAMEDCLASS; 7505 7506 if (value == '[' && RExC_parse + 1 < RExC_end && 7507 /* I smell either [: or [= or [. -- POSIX has been here, right? */ 7508 POSIXCC(UCHARAT(RExC_parse))) { 7509 const char c = UCHARAT(RExC_parse); 7510 char* const s = RExC_parse++; 7511 7512 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c) 7513 RExC_parse++; 7514 if (RExC_parse == RExC_end) 7515 /* Grandfather lone [:, [=, [. */ 7516 RExC_parse = s; 7517 else { 7518 const char* const t = RExC_parse++; /* skip over the c */ 7519 assert(*t == c); 7520 7521 if (UCHARAT(RExC_parse) == ']') { 7522 const char *posixcc = s + 1; 7523 RExC_parse++; /* skip over the ending ] */ 7524 7525 if (*s == ':') { 7526 const I32 complement = *posixcc == '^' ? *posixcc++ : 0; 7527 const I32 skip = t - posixcc; 7528 7529 /* Initially switch on the length of the name. */ 7530 switch (skip) { 7531 case 4: 7532 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */ 7533 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM; 7534 break; 7535 case 5: 7536 /* Names all of length 5. */ 7537 /* alnum alpha ascii blank cntrl digit graph lower 7538 print punct space upper */ 7539 /* Offset 4 gives the best switch position. */ 7540 switch (posixcc[4]) { 7541 case 'a': 7542 if (memEQ(posixcc, "alph", 4)) /* alpha */ 7543 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA; 7544 break; 7545 case 'e': 7546 if (memEQ(posixcc, "spac", 4)) /* space */ 7547 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC; 7548 break; 7549 case 'h': 7550 if (memEQ(posixcc, "grap", 4)) /* graph */ 7551 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH; 7552 break; 7553 case 'i': 7554 if (memEQ(posixcc, "asci", 4)) /* ascii */ 7555 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII; 7556 break; 7557 case 'k': 7558 if (memEQ(posixcc, "blan", 4)) /* blank */ 7559 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK; 7560 break; 7561 case 'l': 7562 if (memEQ(posixcc, "cntr", 4)) /* cntrl */ 7563 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL; 7564 break; 7565 case 'm': 7566 if (memEQ(posixcc, "alnu", 4)) /* alnum */ 7567 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC; 7568 break; 7569 case 'r': 7570 if (memEQ(posixcc, "lowe", 4)) /* lower */ 7571 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER; 7572 else if (memEQ(posixcc, "uppe", 4)) /* upper */ 7573 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER; 7574 break; 7575 case 't': 7576 if (memEQ(posixcc, "digi", 4)) /* digit */ 7577 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT; 7578 else if (memEQ(posixcc, "prin", 4)) /* print */ 7579 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT; 7580 else if (memEQ(posixcc, "punc", 4)) /* punct */ 7581 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT; 7582 break; 7583 } 7584 break; 7585 case 6: 7586 if (memEQ(posixcc, "xdigit", 6)) 7587 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT; 7588 break; 7589 } 7590 7591 if (namedclass == OOB_NAMEDCLASS) 7592 Simple_vFAIL3("POSIX class [:%.*s:] unknown", 7593 t - s - 1, s + 1); 7594 assert (posixcc[skip] == ':'); 7595 assert (posixcc[skip+1] == ']'); 7596 } else if (!SIZE_ONLY) { 7597 /* [[=foo=]] and [[.foo.]] are still future. */ 7598 7599 /* adjust RExC_parse so the warning shows after 7600 the class closes */ 7601 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']') 7602 RExC_parse++; 7603 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c); 7604 } 7605 } else { 7606 /* Maternal grandfather: 7607 * "[:" ending in ":" but not in ":]" */ 7608 RExC_parse = s; 7609 } 7610 } 7611 } 7612 7613 return namedclass; 7614 } 7615 7616 STATIC void 7617 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state) 7618 { 7619 dVAR; 7620 if (POSIXCC(UCHARAT(RExC_parse))) { 7621 const char *s = RExC_parse; 7622 const char c = *s++; 7623 7624 while (isALNUM(*s)) 7625 s++; 7626 if (*s && c == *s && s[1] == ']') { 7627 if (ckWARN(WARN_REGEXP)) 7628 vWARN3(s+2, 7629 "POSIX syntax [%c %c] belongs inside character classes", 7630 c, c); 7631 7632 /* [[=foo=]] and [[.foo.]] are still future. */ 7633 if (POSIXCC_NOTYET(c)) { 7634 /* adjust RExC_parse so the error shows after 7635 the class closes */ 7636 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']') 7637 NOOP; 7638 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c); 7639 } 7640 } 7641 } 7642 } 7643 7644 7645 #define _C_C_T_(NAME,TEST,WORD) \ 7646 ANYOF_##NAME: \ 7647 if (LOC) \ 7648 ANYOF_CLASS_SET(ret, ANYOF_##NAME); \ 7649 else { \ 7650 for (value = 0; value < 256; value++) \ 7651 if (TEST) \ 7652 ANYOF_BITMAP_SET(ret, value); \ 7653 } \ 7654 yesno = '+'; \ 7655 what = WORD; \ 7656 break; \ 7657 case ANYOF_N##NAME: \ 7658 if (LOC) \ 7659 ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \ 7660 else { \ 7661 for (value = 0; value < 256; value++) \ 7662 if (!TEST) \ 7663 ANYOF_BITMAP_SET(ret, value); \ 7664 } \ 7665 yesno = '!'; \ 7666 what = WORD; \ 7667 break 7668 7669 #define _C_C_T_NOLOC_(NAME,TEST,WORD) \ 7670 ANYOF_##NAME: \ 7671 for (value = 0; value < 256; value++) \ 7672 if (TEST) \ 7673 ANYOF_BITMAP_SET(ret, value); \ 7674 yesno = '+'; \ 7675 what = WORD; \ 7676 break; \ 7677 case ANYOF_N##NAME: \ 7678 for (value = 0; value < 256; value++) \ 7679 if (!TEST) \ 7680 ANYOF_BITMAP_SET(ret, value); \ 7681 yesno = '!'; \ 7682 what = WORD; \ 7683 break 7684 7685 /* 7686 parse a class specification and produce either an ANYOF node that 7687 matches the pattern or if the pattern matches a single char only and 7688 that char is < 256 and we are case insensitive then we produce an 7689 EXACT node instead. 7690 */ 7691 7692 STATIC regnode * 7693 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) 7694 { 7695 dVAR; 7696 register UV nextvalue; 7697 register IV prevvalue = OOB_UNICODE; 7698 register IV range = 0; 7699 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */ 7700 register regnode *ret; 7701 STRLEN numlen; 7702 IV namedclass; 7703 char *rangebegin = NULL; 7704 bool need_class = 0; 7705 SV *listsv = NULL; 7706 UV n; 7707 bool optimize_invert = TRUE; 7708 AV* unicode_alternate = NULL; 7709 #ifdef EBCDIC 7710 UV literal_endpoint = 0; 7711 #endif 7712 UV stored = 0; /* number of chars stored in the class */ 7713 7714 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in 7715 case we need to change the emitted regop to an EXACT. */ 7716 const char * orig_parse = RExC_parse; 7717 GET_RE_DEBUG_FLAGS_DECL; 7718 #ifndef DEBUGGING 7719 PERL_UNUSED_ARG(depth); 7720 #endif 7721 7722 DEBUG_PARSE("clas"); 7723 7724 /* Assume we are going to generate an ANYOF node. */ 7725 ret = reganode(pRExC_state, ANYOF, 0); 7726 7727 if (!SIZE_ONLY) 7728 ANYOF_FLAGS(ret) = 0; 7729 7730 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */ 7731 RExC_naughty++; 7732 RExC_parse++; 7733 if (!SIZE_ONLY) 7734 ANYOF_FLAGS(ret) |= ANYOF_INVERT; 7735 } 7736 7737 if (SIZE_ONLY) { 7738 RExC_size += ANYOF_SKIP; 7739 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */ 7740 } 7741 else { 7742 RExC_emit += ANYOF_SKIP; 7743 if (FOLD) 7744 ANYOF_FLAGS(ret) |= ANYOF_FOLD; 7745 if (LOC) 7746 ANYOF_FLAGS(ret) |= ANYOF_LOCALE; 7747 ANYOF_BITMAP_ZERO(ret); 7748 listsv = newSVpvs("# comment\n"); 7749 } 7750 7751 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0; 7752 7753 if (!SIZE_ONLY && POSIXCC(nextvalue)) 7754 checkposixcc(pRExC_state); 7755 7756 /* allow 1st char to be ] (allowing it to be - is dealt with later) */ 7757 if (UCHARAT(RExC_parse) == ']') 7758 goto charclassloop; 7759 7760 parseit: 7761 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') { 7762 7763 charclassloop: 7764 7765 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */ 7766 7767 if (!range) 7768 rangebegin = RExC_parse; 7769 if (UTF) { 7770 value = utf8n_to_uvchr((U8*)RExC_parse, 7771 RExC_end - RExC_parse, 7772 &numlen, UTF8_ALLOW_DEFAULT); 7773 RExC_parse += numlen; 7774 } 7775 else 7776 value = UCHARAT(RExC_parse++); 7777 7778 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0; 7779 if (value == '[' && POSIXCC(nextvalue)) 7780 namedclass = regpposixcc(pRExC_state, value); 7781 else if (value == '\\') { 7782 if (UTF) { 7783 value = utf8n_to_uvchr((U8*)RExC_parse, 7784 RExC_end - RExC_parse, 7785 &numlen, UTF8_ALLOW_DEFAULT); 7786 RExC_parse += numlen; 7787 } 7788 else 7789 value = UCHARAT(RExC_parse++); 7790 /* Some compilers cannot handle switching on 64-bit integer 7791 * values, therefore value cannot be an UV. Yes, this will 7792 * be a problem later if we want switch on Unicode. 7793 * A similar issue a little bit later when switching on 7794 * namedclass. --jhi */ 7795 switch ((I32)value) { 7796 case 'w': namedclass = ANYOF_ALNUM; break; 7797 case 'W': namedclass = ANYOF_NALNUM; break; 7798 case 's': namedclass = ANYOF_SPACE; break; 7799 case 'S': namedclass = ANYOF_NSPACE; break; 7800 case 'd': namedclass = ANYOF_DIGIT; break; 7801 case 'D': namedclass = ANYOF_NDIGIT; break; 7802 case 'v': namedclass = ANYOF_VERTWS; break; 7803 case 'V': namedclass = ANYOF_NVERTWS; break; 7804 case 'h': namedclass = ANYOF_HORIZWS; break; 7805 case 'H': namedclass = ANYOF_NHORIZWS; break; 7806 case 'N': /* Handle \N{NAME} in class */ 7807 { 7808 /* We only pay attention to the first char of 7809 multichar strings being returned. I kinda wonder 7810 if this makes sense as it does change the behaviour 7811 from earlier versions, OTOH that behaviour was broken 7812 as well. */ 7813 UV v; /* value is register so we cant & it /grrr */ 7814 if (reg_namedseq(pRExC_state, &v)) { 7815 goto parseit; 7816 } 7817 value= v; 7818 } 7819 break; 7820 case 'p': 7821 case 'P': 7822 { 7823 char *e; 7824 if (RExC_parse >= RExC_end) 7825 vFAIL2("Empty \\%c{}", (U8)value); 7826 if (*RExC_parse == '{') { 7827 const U8 c = (U8)value; 7828 e = strchr(RExC_parse++, '}'); 7829 if (!e) 7830 vFAIL2("Missing right brace on \\%c{}", c); 7831 while (isSPACE(UCHARAT(RExC_parse))) 7832 RExC_parse++; 7833 if (e == RExC_parse) 7834 vFAIL2("Empty \\%c{}", c); 7835 n = e - RExC_parse; 7836 while (isSPACE(UCHARAT(RExC_parse + n - 1))) 7837 n--; 7838 } 7839 else { 7840 e = RExC_parse; 7841 n = 1; 7842 } 7843 if (!SIZE_ONLY) { 7844 if (UCHARAT(RExC_parse) == '^') { 7845 RExC_parse++; 7846 n--; 7847 value = value == 'p' ? 'P' : 'p'; /* toggle */ 7848 while (isSPACE(UCHARAT(RExC_parse))) { 7849 RExC_parse++; 7850 n--; 7851 } 7852 } 7853 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n", 7854 (value=='p' ? '+' : '!'), (int)n, RExC_parse); 7855 } 7856 RExC_parse = e + 1; 7857 ANYOF_FLAGS(ret) |= ANYOF_UNICODE; 7858 namedclass = ANYOF_MAX; /* no official name, but it's named */ 7859 } 7860 break; 7861 case 'n': value = '\n'; break; 7862 case 'r': value = '\r'; break; 7863 case 't': value = '\t'; break; 7864 case 'f': value = '\f'; break; 7865 case 'b': value = '\b'; break; 7866 case 'e': value = ASCII_TO_NATIVE('\033');break; 7867 case 'a': value = ASCII_TO_NATIVE('\007');break; 7868 case 'x': 7869 if (*RExC_parse == '{') { 7870 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES 7871 | PERL_SCAN_DISALLOW_PREFIX; 7872 char * const e = strchr(RExC_parse++, '}'); 7873 if (!e) 7874 vFAIL("Missing right brace on \\x{}"); 7875 7876 numlen = e - RExC_parse; 7877 value = grok_hex(RExC_parse, &numlen, &flags, NULL); 7878 RExC_parse = e + 1; 7879 } 7880 else { 7881 I32 flags = PERL_SCAN_DISALLOW_PREFIX; 7882 numlen = 2; 7883 value = grok_hex(RExC_parse, &numlen, &flags, NULL); 7884 RExC_parse += numlen; 7885 } 7886 if (PL_encoding && value < 0x100) 7887 goto recode_encoding; 7888 break; 7889 case 'c': 7890 value = UCHARAT(RExC_parse++); 7891 value = toCTRL(value); 7892 break; 7893 case '0': case '1': case '2': case '3': case '4': 7894 case '5': case '6': case '7': case '8': case '9': 7895 { 7896 I32 flags = 0; 7897 numlen = 3; 7898 value = grok_oct(--RExC_parse, &numlen, &flags, NULL); 7899 RExC_parse += numlen; 7900 if (PL_encoding && value < 0x100) 7901 goto recode_encoding; 7902 break; 7903 } 7904 recode_encoding: 7905 { 7906 SV* enc = PL_encoding; 7907 value = reg_recode((const char)(U8)value, &enc); 7908 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP)) 7909 vWARN(RExC_parse, 7910 "Invalid escape in the specified encoding"); 7911 break; 7912 } 7913 default: 7914 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP)) 7915 vWARN2(RExC_parse, 7916 "Unrecognized escape \\%c in character class passed through", 7917 (int)value); 7918 break; 7919 } 7920 } /* end of \blah */ 7921 #ifdef EBCDIC 7922 else 7923 literal_endpoint++; 7924 #endif 7925 7926 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */ 7927 7928 if (!SIZE_ONLY && !need_class) 7929 ANYOF_CLASS_ZERO(ret); 7930 7931 need_class = 1; 7932 7933 /* a bad range like a-\d, a-[:digit:] ? */ 7934 if (range) { 7935 if (!SIZE_ONLY) { 7936 if (ckWARN(WARN_REGEXP)) { 7937 const int w = 7938 RExC_parse >= rangebegin ? 7939 RExC_parse - rangebegin : 0; 7940 vWARN4(RExC_parse, 7941 "False [] range \"%*.*s\"", 7942 w, w, rangebegin); 7943 } 7944 if (prevvalue < 256) { 7945 ANYOF_BITMAP_SET(ret, prevvalue); 7946 ANYOF_BITMAP_SET(ret, '-'); 7947 } 7948 else { 7949 ANYOF_FLAGS(ret) |= ANYOF_UNICODE; 7950 Perl_sv_catpvf(aTHX_ listsv, 7951 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-'); 7952 } 7953 } 7954 7955 range = 0; /* this was not a true range */ 7956 } 7957 7958 7959 7960 if (!SIZE_ONLY) { 7961 const char *what = NULL; 7962 char yesno = 0; 7963 7964 if (namedclass > OOB_NAMEDCLASS) 7965 optimize_invert = FALSE; 7966 /* Possible truncation here but in some 64-bit environments 7967 * the compiler gets heartburn about switch on 64-bit values. 7968 * A similar issue a little earlier when switching on value. 7969 * --jhi */ 7970 switch ((I32)namedclass) { 7971 case _C_C_T_(ALNUM, isALNUM(value), "Word"); 7972 case _C_C_T_(ALNUMC, isALNUMC(value), "Alnum"); 7973 case _C_C_T_(ALPHA, isALPHA(value), "Alpha"); 7974 case _C_C_T_(BLANK, isBLANK(value), "Blank"); 7975 case _C_C_T_(CNTRL, isCNTRL(value), "Cntrl"); 7976 case _C_C_T_(GRAPH, isGRAPH(value), "Graph"); 7977 case _C_C_T_(LOWER, isLOWER(value), "Lower"); 7978 case _C_C_T_(PRINT, isPRINT(value), "Print"); 7979 case _C_C_T_(PSXSPC, isPSXSPC(value), "Space"); 7980 case _C_C_T_(PUNCT, isPUNCT(value), "Punct"); 7981 case _C_C_T_(SPACE, isSPACE(value), "SpacePerl"); 7982 case _C_C_T_(UPPER, isUPPER(value), "Upper"); 7983 case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit"); 7984 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace"); 7985 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace"); 7986 case ANYOF_ASCII: 7987 if (LOC) 7988 ANYOF_CLASS_SET(ret, ANYOF_ASCII); 7989 else { 7990 #ifndef EBCDIC 7991 for (value = 0; value < 128; value++) 7992 ANYOF_BITMAP_SET(ret, value); 7993 #else /* EBCDIC */ 7994 for (value = 0; value < 256; value++) { 7995 if (isASCII(value)) 7996 ANYOF_BITMAP_SET(ret, value); 7997 } 7998 #endif /* EBCDIC */ 7999 } 8000 yesno = '+'; 8001 what = "ASCII"; 8002 break; 8003 case ANYOF_NASCII: 8004 if (LOC) 8005 ANYOF_CLASS_SET(ret, ANYOF_NASCII); 8006 else { 8007 #ifndef EBCDIC 8008 for (value = 128; value < 256; value++) 8009 ANYOF_BITMAP_SET(ret, value); 8010 #else /* EBCDIC */ 8011 for (value = 0; value < 256; value++) { 8012 if (!isASCII(value)) 8013 ANYOF_BITMAP_SET(ret, value); 8014 } 8015 #endif /* EBCDIC */ 8016 } 8017 yesno = '!'; 8018 what = "ASCII"; 8019 break; 8020 case ANYOF_DIGIT: 8021 if (LOC) 8022 ANYOF_CLASS_SET(ret, ANYOF_DIGIT); 8023 else { 8024 /* consecutive digits assumed */ 8025 for (value = '0'; value <= '9'; value++) 8026 ANYOF_BITMAP_SET(ret, value); 8027 } 8028 yesno = '+'; 8029 what = "Digit"; 8030 break; 8031 case ANYOF_NDIGIT: 8032 if (LOC) 8033 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT); 8034 else { 8035 /* consecutive digits assumed */ 8036 for (value = 0; value < '0'; value++) 8037 ANYOF_BITMAP_SET(ret, value); 8038 for (value = '9' + 1; value < 256; value++) 8039 ANYOF_BITMAP_SET(ret, value); 8040 } 8041 yesno = '!'; 8042 what = "Digit"; 8043 break; 8044 case ANYOF_MAX: 8045 /* this is to handle \p and \P */ 8046 break; 8047 default: 8048 vFAIL("Invalid [::] class"); 8049 break; 8050 } 8051 if (what) { 8052 /* Strings such as "+utf8::isWord\n" */ 8053 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what); 8054 } 8055 if (LOC) 8056 ANYOF_FLAGS(ret) |= ANYOF_CLASS; 8057 continue; 8058 } 8059 } /* end of namedclass \blah */ 8060 8061 if (range) { 8062 if (prevvalue > (IV)value) /* b-a */ { 8063 const int w = RExC_parse - rangebegin; 8064 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin); 8065 range = 0; /* not a valid range */ 8066 } 8067 } 8068 else { 8069 prevvalue = value; /* save the beginning of the range */ 8070 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end && 8071 RExC_parse[1] != ']') { 8072 RExC_parse++; 8073 8074 /* a bad range like \w-, [:word:]- ? */ 8075 if (namedclass > OOB_NAMEDCLASS) { 8076 if (ckWARN(WARN_REGEXP)) { 8077 const int w = 8078 RExC_parse >= rangebegin ? 8079 RExC_parse - rangebegin : 0; 8080 vWARN4(RExC_parse, 8081 "False [] range \"%*.*s\"", 8082 w, w, rangebegin); 8083 } 8084 if (!SIZE_ONLY) 8085 ANYOF_BITMAP_SET(ret, '-'); 8086 } else 8087 range = 1; /* yeah, it's a range! */ 8088 continue; /* but do it the next time */ 8089 } 8090 } 8091 8092 /* now is the next time */ 8093 /*stored += (value - prevvalue + 1);*/ 8094 if (!SIZE_ONLY) { 8095 if (prevvalue < 256) { 8096 const IV ceilvalue = value < 256 ? value : 255; 8097 IV i; 8098 #ifdef EBCDIC 8099 /* In EBCDIC [\x89-\x91] should include 8100 * the \x8e but [i-j] should not. */ 8101 if (literal_endpoint == 2 && 8102 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) || 8103 (isUPPER(prevvalue) && isUPPER(ceilvalue)))) 8104 { 8105 if (isLOWER(prevvalue)) { 8106 for (i = prevvalue; i <= ceilvalue; i++) 8107 if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) { 8108 stored++; 8109 ANYOF_BITMAP_SET(ret, i); 8110 } 8111 } else { 8112 for (i = prevvalue; i <= ceilvalue; i++) 8113 if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) { 8114 stored++; 8115 ANYOF_BITMAP_SET(ret, i); 8116 } 8117 } 8118 } 8119 else 8120 #endif 8121 for (i = prevvalue; i <= ceilvalue; i++) { 8122 if (!ANYOF_BITMAP_TEST(ret,i)) { 8123 stored++; 8124 ANYOF_BITMAP_SET(ret, i); 8125 } 8126 } 8127 } 8128 if (value > 255 || UTF) { 8129 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue); 8130 const UV natvalue = NATIVE_TO_UNI(value); 8131 stored+=2; /* can't optimize this class */ 8132 ANYOF_FLAGS(ret) |= ANYOF_UNICODE; 8133 if (prevnatvalue < natvalue) { /* what about > ? */ 8134 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n", 8135 prevnatvalue, natvalue); 8136 } 8137 else if (prevnatvalue == natvalue) { 8138 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue); 8139 if (FOLD) { 8140 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; 8141 STRLEN foldlen; 8142 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen); 8143 8144 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */ 8145 if (RExC_precomp[0] == ':' && 8146 RExC_precomp[1] == '[' && 8147 (f == 0xDF || f == 0x92)) { 8148 f = NATIVE_TO_UNI(f); 8149 } 8150 #endif 8151 /* If folding and foldable and a single 8152 * character, insert also the folded version 8153 * to the charclass. */ 8154 if (f != value) { 8155 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */ 8156 if ((RExC_precomp[0] == ':' && 8157 RExC_precomp[1] == '[' && 8158 (f == 0xA2 && 8159 (value == 0xFB05 || value == 0xFB06))) ? 8160 foldlen == ((STRLEN)UNISKIP(f) - 1) : 8161 foldlen == (STRLEN)UNISKIP(f) ) 8162 #else 8163 if (foldlen == (STRLEN)UNISKIP(f)) 8164 #endif 8165 Perl_sv_catpvf(aTHX_ listsv, 8166 "%04"UVxf"\n", f); 8167 else { 8168 /* Any multicharacter foldings 8169 * require the following transform: 8170 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst) 8171 * where E folds into "pq" and F folds 8172 * into "rst", all other characters 8173 * fold to single characters. We save 8174 * away these multicharacter foldings, 8175 * to be later saved as part of the 8176 * additional "s" data. */ 8177 SV *sv; 8178 8179 if (!unicode_alternate) 8180 unicode_alternate = newAV(); 8181 sv = newSVpvn((char*)foldbuf, foldlen); 8182 SvUTF8_on(sv); 8183 av_push(unicode_alternate, sv); 8184 } 8185 } 8186 8187 /* If folding and the value is one of the Greek 8188 * sigmas insert a few more sigmas to make the 8189 * folding rules of the sigmas to work right. 8190 * Note that not all the possible combinations 8191 * are handled here: some of them are handled 8192 * by the standard folding rules, and some of 8193 * them (literal or EXACTF cases) are handled 8194 * during runtime in regexec.c:S_find_byclass(). */ 8195 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) { 8196 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", 8197 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA); 8198 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", 8199 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA); 8200 } 8201 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA) 8202 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", 8203 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA); 8204 } 8205 } 8206 } 8207 #ifdef EBCDIC 8208 literal_endpoint = 0; 8209 #endif 8210 } 8211 8212 range = 0; /* this range (if it was one) is done now */ 8213 } 8214 8215 if (need_class) { 8216 ANYOF_FLAGS(ret) |= ANYOF_LARGE; 8217 if (SIZE_ONLY) 8218 RExC_size += ANYOF_CLASS_ADD_SKIP; 8219 else 8220 RExC_emit += ANYOF_CLASS_ADD_SKIP; 8221 } 8222 8223 8224 if (SIZE_ONLY) 8225 return ret; 8226 /****** !SIZE_ONLY AFTER HERE *********/ 8227 8228 if( stored == 1 && (value < 128 || (value < 256 && !UTF)) 8229 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) ) 8230 ) { 8231 /* optimize single char class to an EXACT node 8232 but *only* when its not a UTF/high char */ 8233 const char * cur_parse= RExC_parse; 8234 RExC_emit = (regnode *)orig_emit; 8235 RExC_parse = (char *)orig_parse; 8236 ret = reg_node(pRExC_state, 8237 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT)); 8238 RExC_parse = (char *)cur_parse; 8239 *STRING(ret)= (char)value; 8240 STR_LEN(ret)= 1; 8241 RExC_emit += STR_SZ(1); 8242 return ret; 8243 } 8244 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */ 8245 if ( /* If the only flag is folding (plus possibly inversion). */ 8246 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD) 8247 ) { 8248 for (value = 0; value < 256; ++value) { 8249 if (ANYOF_BITMAP_TEST(ret, value)) { 8250 UV fold = PL_fold[value]; 8251 8252 if (fold != value) 8253 ANYOF_BITMAP_SET(ret, fold); 8254 } 8255 } 8256 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD; 8257 } 8258 8259 /* optimize inverted simple patterns (e.g. [^a-z]) */ 8260 if (optimize_invert && 8261 /* If the only flag is inversion. */ 8262 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) { 8263 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value) 8264 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL; 8265 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL; 8266 } 8267 { 8268 AV * const av = newAV(); 8269 SV *rv; 8270 /* The 0th element stores the character class description 8271 * in its textual form: used later (regexec.c:Perl_regclass_swash()) 8272 * to initialize the appropriate swash (which gets stored in 8273 * the 1st element), and also useful for dumping the regnode. 8274 * The 2nd element stores the multicharacter foldings, 8275 * used later (regexec.c:S_reginclass()). */ 8276 av_store(av, 0, listsv); 8277 av_store(av, 1, NULL); 8278 av_store(av, 2, (SV*)unicode_alternate); 8279 rv = newRV_noinc((SV*)av); 8280 n = add_data(pRExC_state, 1, "s"); 8281 RExC_rxi->data->data[n] = (void*)rv; 8282 ARG_SET(ret, n); 8283 } 8284 return ret; 8285 } 8286 #undef _C_C_T_ 8287 8288 8289 /* reg_skipcomment() 8290 8291 Absorbs an /x style # comments from the input stream. 8292 Returns true if there is more text remaining in the stream. 8293 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment 8294 terminates the pattern without including a newline. 8295 8296 Note its the callers responsibility to ensure that we are 8297 actually in /x mode 8298 8299 */ 8300 8301 STATIC bool 8302 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state) 8303 { 8304 bool ended = 0; 8305 while (RExC_parse < RExC_end) 8306 if (*RExC_parse++ == '\n') { 8307 ended = 1; 8308 break; 8309 } 8310 if (!ended) { 8311 /* we ran off the end of the pattern without ending 8312 the comment, so we have to add an \n when wrapping */ 8313 RExC_seen |= REG_SEEN_RUN_ON_COMMENT; 8314 return 0; 8315 } else 8316 return 1; 8317 } 8318 8319 /* nextchar() 8320 8321 Advance that parse position, and optionall absorbs 8322 "whitespace" from the inputstream. 8323 8324 Without /x "whitespace" means (?#...) style comments only, 8325 with /x this means (?#...) and # comments and whitespace proper. 8326 8327 Returns the RExC_parse point from BEFORE the scan occurs. 8328 8329 This is the /x friendly way of saying RExC_parse++. 8330 */ 8331 8332 STATIC char* 8333 S_nextchar(pTHX_ RExC_state_t *pRExC_state) 8334 { 8335 char* const retval = RExC_parse++; 8336 8337 for (;;) { 8338 if (*RExC_parse == '(' && RExC_parse[1] == '?' && 8339 RExC_parse[2] == '#') { 8340 while (*RExC_parse != ')') { 8341 if (RExC_parse == RExC_end) 8342 FAIL("Sequence (?#... not terminated"); 8343 RExC_parse++; 8344 } 8345 RExC_parse++; 8346 continue; 8347 } 8348 if (RExC_flags & RXf_PMf_EXTENDED) { 8349 if (isSPACE(*RExC_parse)) { 8350 RExC_parse++; 8351 continue; 8352 } 8353 else if (*RExC_parse == '#') { 8354 if ( reg_skipcomment( pRExC_state ) ) 8355 continue; 8356 } 8357 } 8358 return retval; 8359 } 8360 } 8361 8362 /* 8363 - reg_node - emit a node 8364 */ 8365 STATIC regnode * /* Location. */ 8366 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) 8367 { 8368 dVAR; 8369 register regnode *ptr; 8370 regnode * const ret = RExC_emit; 8371 GET_RE_DEBUG_FLAGS_DECL; 8372 8373 if (SIZE_ONLY) { 8374 SIZE_ALIGN(RExC_size); 8375 RExC_size += 1; 8376 return(ret); 8377 } 8378 if (RExC_emit >= RExC_emit_bound) 8379 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op); 8380 8381 NODE_ALIGN_FILL(ret); 8382 ptr = ret; 8383 FILL_ADVANCE_NODE(ptr, op); 8384 #ifdef RE_TRACK_PATTERN_OFFSETS 8385 if (RExC_offsets) { /* MJD */ 8386 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n", 8387 "reg_node", __LINE__, 8388 PL_reg_name[op], 8389 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] 8390 ? "Overwriting end of array!\n" : "OK", 8391 (UV)(RExC_emit - RExC_emit_start), 8392 (UV)(RExC_parse - RExC_start), 8393 (UV)RExC_offsets[0])); 8394 Set_Node_Offset(RExC_emit, RExC_parse + (op == END)); 8395 } 8396 #endif 8397 RExC_emit = ptr; 8398 return(ret); 8399 } 8400 8401 /* 8402 - reganode - emit a node with an argument 8403 */ 8404 STATIC regnode * /* Location. */ 8405 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) 8406 { 8407 dVAR; 8408 register regnode *ptr; 8409 regnode * const ret = RExC_emit; 8410 GET_RE_DEBUG_FLAGS_DECL; 8411 8412 if (SIZE_ONLY) { 8413 SIZE_ALIGN(RExC_size); 8414 RExC_size += 2; 8415 /* 8416 We can't do this: 8417 8418 assert(2==regarglen[op]+1); 8419 8420 Anything larger than this has to allocate the extra amount. 8421 If we changed this to be: 8422 8423 RExC_size += (1 + regarglen[op]); 8424 8425 then it wouldn't matter. Its not clear what side effect 8426 might come from that so its not done so far. 8427 -- dmq 8428 */ 8429 return(ret); 8430 } 8431 if (RExC_emit >= RExC_emit_bound) 8432 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op); 8433 8434 NODE_ALIGN_FILL(ret); 8435 ptr = ret; 8436 FILL_ADVANCE_NODE_ARG(ptr, op, arg); 8437 #ifdef RE_TRACK_PATTERN_OFFSETS 8438 if (RExC_offsets) { /* MJD */ 8439 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n", 8440 "reganode", 8441 __LINE__, 8442 PL_reg_name[op], 8443 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ? 8444 "Overwriting end of array!\n" : "OK", 8445 (UV)(RExC_emit - RExC_emit_start), 8446 (UV)(RExC_parse - RExC_start), 8447 (UV)RExC_offsets[0])); 8448 Set_Cur_Node_Offset; 8449 } 8450 #endif 8451 RExC_emit = ptr; 8452 return(ret); 8453 } 8454 8455 /* 8456 - reguni - emit (if appropriate) a Unicode character 8457 */ 8458 STATIC STRLEN 8459 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s) 8460 { 8461 dVAR; 8462 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s); 8463 } 8464 8465 /* 8466 - reginsert - insert an operator in front of already-emitted operand 8467 * 8468 * Means relocating the operand. 8469 */ 8470 STATIC void 8471 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) 8472 { 8473 dVAR; 8474 register regnode *src; 8475 register regnode *dst; 8476 register regnode *place; 8477 const int offset = regarglen[(U8)op]; 8478 const int size = NODE_STEP_REGNODE + offset; 8479 GET_RE_DEBUG_FLAGS_DECL; 8480 PERL_UNUSED_ARG(depth); 8481 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */ 8482 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]); 8483 if (SIZE_ONLY) { 8484 RExC_size += size; 8485 return; 8486 } 8487 8488 src = RExC_emit; 8489 RExC_emit += size; 8490 dst = RExC_emit; 8491 if (RExC_open_parens) { 8492 int paren; 8493 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/ 8494 for ( paren=0 ; paren < RExC_npar ; paren++ ) { 8495 if ( RExC_open_parens[paren] >= opnd ) { 8496 /*DEBUG_PARSE_FMT("open"," - %d",size);*/ 8497 RExC_open_parens[paren] += size; 8498 } else { 8499 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/ 8500 } 8501 if ( RExC_close_parens[paren] >= opnd ) { 8502 /*DEBUG_PARSE_FMT("close"," - %d",size);*/ 8503 RExC_close_parens[paren] += size; 8504 } else { 8505 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/ 8506 } 8507 } 8508 } 8509 8510 while (src > opnd) { 8511 StructCopy(--src, --dst, regnode); 8512 #ifdef RE_TRACK_PATTERN_OFFSETS 8513 if (RExC_offsets) { /* MJD 20010112 */ 8514 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n", 8515 "reg_insert", 8516 __LINE__, 8517 PL_reg_name[op], 8518 (UV)(dst - RExC_emit_start) > RExC_offsets[0] 8519 ? "Overwriting end of array!\n" : "OK", 8520 (UV)(src - RExC_emit_start), 8521 (UV)(dst - RExC_emit_start), 8522 (UV)RExC_offsets[0])); 8523 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src)); 8524 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src)); 8525 } 8526 #endif 8527 } 8528 8529 8530 place = opnd; /* Op node, where operand used to be. */ 8531 #ifdef RE_TRACK_PATTERN_OFFSETS 8532 if (RExC_offsets) { /* MJD */ 8533 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n", 8534 "reginsert", 8535 __LINE__, 8536 PL_reg_name[op], 8537 (UV)(place - RExC_emit_start) > RExC_offsets[0] 8538 ? "Overwriting end of array!\n" : "OK", 8539 (UV)(place - RExC_emit_start), 8540 (UV)(RExC_parse - RExC_start), 8541 (UV)RExC_offsets[0])); 8542 Set_Node_Offset(place, RExC_parse); 8543 Set_Node_Length(place, 1); 8544 } 8545 #endif 8546 src = NEXTOPER(place); 8547 FILL_ADVANCE_NODE(place, op); 8548 Zero(src, offset, regnode); 8549 } 8550 8551 /* 8552 - regtail - set the next-pointer at the end of a node chain of p to val. 8553 - SEE ALSO: regtail_study 8554 */ 8555 /* TODO: All three parms should be const */ 8556 STATIC void 8557 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth) 8558 { 8559 dVAR; 8560 register regnode *scan; 8561 GET_RE_DEBUG_FLAGS_DECL; 8562 #ifndef DEBUGGING 8563 PERL_UNUSED_ARG(depth); 8564 #endif 8565 8566 if (SIZE_ONLY) 8567 return; 8568 8569 /* Find last node. */ 8570 scan = p; 8571 for (;;) { 8572 regnode * const temp = regnext(scan); 8573 DEBUG_PARSE_r({ 8574 SV * const mysv=sv_newmortal(); 8575 DEBUG_PARSE_MSG((scan==p ? "tail" : "")); 8576 regprop(RExC_rx, mysv, scan); 8577 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n", 8578 SvPV_nolen_const(mysv), REG_NODE_NUM(scan), 8579 (temp == NULL ? "->" : ""), 8580 (temp == NULL ? PL_reg_name[OP(val)] : "") 8581 ); 8582 }); 8583 if (temp == NULL) 8584 break; 8585 scan = temp; 8586 } 8587 8588 if (reg_off_by_arg[OP(scan)]) { 8589 ARG_SET(scan, val - scan); 8590 } 8591 else { 8592 NEXT_OFF(scan) = val - scan; 8593 } 8594 } 8595 8596 #ifdef DEBUGGING 8597 /* 8598 - regtail_study - set the next-pointer at the end of a node chain of p to val. 8599 - Look for optimizable sequences at the same time. 8600 - currently only looks for EXACT chains. 8601 8602 This is expermental code. The idea is to use this routine to perform 8603 in place optimizations on branches and groups as they are constructed, 8604 with the long term intention of removing optimization from study_chunk so 8605 that it is purely analytical. 8606 8607 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used 8608 to control which is which. 8609 8610 */ 8611 /* TODO: All four parms should be const */ 8612 8613 STATIC U8 8614 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth) 8615 { 8616 dVAR; 8617 register regnode *scan; 8618 U8 exact = PSEUDO; 8619 #ifdef EXPERIMENTAL_INPLACESCAN 8620 I32 min = 0; 8621 #endif 8622 8623 GET_RE_DEBUG_FLAGS_DECL; 8624 8625 8626 if (SIZE_ONLY) 8627 return exact; 8628 8629 /* Find last node. */ 8630 8631 scan = p; 8632 for (;;) { 8633 regnode * const temp = regnext(scan); 8634 #ifdef EXPERIMENTAL_INPLACESCAN 8635 if (PL_regkind[OP(scan)] == EXACT) 8636 if (join_exact(pRExC_state,scan,&min,1,val,depth+1)) 8637 return EXACT; 8638 #endif 8639 if ( exact ) { 8640 switch (OP(scan)) { 8641 case EXACT: 8642 case EXACTF: 8643 case EXACTFL: 8644 if( exact == PSEUDO ) 8645 exact= OP(scan); 8646 else if ( exact != OP(scan) ) 8647 exact= 0; 8648 case NOTHING: 8649 break; 8650 default: 8651 exact= 0; 8652 } 8653 } 8654 DEBUG_PARSE_r({ 8655 SV * const mysv=sv_newmortal(); 8656 DEBUG_PARSE_MSG((scan==p ? "tsdy" : "")); 8657 regprop(RExC_rx, mysv, scan); 8658 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n", 8659 SvPV_nolen_const(mysv), 8660 REG_NODE_NUM(scan), 8661 PL_reg_name[exact]); 8662 }); 8663 if (temp == NULL) 8664 break; 8665 scan = temp; 8666 } 8667 DEBUG_PARSE_r({ 8668 SV * const mysv_val=sv_newmortal(); 8669 DEBUG_PARSE_MSG(""); 8670 regprop(RExC_rx, mysv_val, val); 8671 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n", 8672 SvPV_nolen_const(mysv_val), 8673 (IV)REG_NODE_NUM(val), 8674 (IV)(val - scan) 8675 ); 8676 }); 8677 if (reg_off_by_arg[OP(scan)]) { 8678 ARG_SET(scan, val - scan); 8679 } 8680 else { 8681 NEXT_OFF(scan) = val - scan; 8682 } 8683 8684 return exact; 8685 } 8686 #endif 8687 8688 /* 8689 - regcurly - a little FSA that accepts {\d+,?\d*} 8690 */ 8691 STATIC I32 8692 S_regcurly(register const char *s) 8693 { 8694 if (*s++ != '{') 8695 return FALSE; 8696 if (!isDIGIT(*s)) 8697 return FALSE; 8698 while (isDIGIT(*s)) 8699 s++; 8700 if (*s == ',') 8701 s++; 8702 while (isDIGIT(*s)) 8703 s++; 8704 if (*s != '}') 8705 return FALSE; 8706 return TRUE; 8707 } 8708 8709 8710 /* 8711 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form 8712 */ 8713 #ifdef DEBUGGING 8714 void 8715 S_regdump_extflags(pTHX_ const char *lead, const U32 flags) { 8716 int bit; 8717 int set=0; 8718 for (bit=0; bit<32; bit++) { 8719 if (flags & (1<<bit)) { 8720 if (!set++ && lead) 8721 PerlIO_printf(Perl_debug_log, "%s",lead); 8722 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]); 8723 } 8724 } 8725 if (lead) { 8726 if (set) 8727 PerlIO_printf(Perl_debug_log, "\n"); 8728 else 8729 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead); 8730 } 8731 } 8732 #endif 8733 8734 void 8735 Perl_regdump(pTHX_ const regexp *r) 8736 { 8737 #ifdef DEBUGGING 8738 dVAR; 8739 SV * const sv = sv_newmortal(); 8740 SV *dsv= sv_newmortal(); 8741 RXi_GET_DECL(r,ri); 8742 GET_RE_DEBUG_FLAGS_DECL; 8743 8744 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0); 8745 8746 /* Header fields of interest. */ 8747 if (r->anchored_substr) { 8748 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr), 8749 RE_SV_DUMPLEN(r->anchored_substr), 30); 8750 PerlIO_printf(Perl_debug_log, 8751 "anchored %s%s at %"IVdf" ", 8752 s, RE_SV_TAIL(r->anchored_substr), 8753 (IV)r->anchored_offset); 8754 } else if (r->anchored_utf8) { 8755 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8), 8756 RE_SV_DUMPLEN(r->anchored_utf8), 30); 8757 PerlIO_printf(Perl_debug_log, 8758 "anchored utf8 %s%s at %"IVdf" ", 8759 s, RE_SV_TAIL(r->anchored_utf8), 8760 (IV)r->anchored_offset); 8761 } 8762 if (r->float_substr) { 8763 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr), 8764 RE_SV_DUMPLEN(r->float_substr), 30); 8765 PerlIO_printf(Perl_debug_log, 8766 "floating %s%s at %"IVdf"..%"UVuf" ", 8767 s, RE_SV_TAIL(r->float_substr), 8768 (IV)r->float_min_offset, (UV)r->float_max_offset); 8769 } else if (r->float_utf8) { 8770 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8), 8771 RE_SV_DUMPLEN(r->float_utf8), 30); 8772 PerlIO_printf(Perl_debug_log, 8773 "floating utf8 %s%s at %"IVdf"..%"UVuf" ", 8774 s, RE_SV_TAIL(r->float_utf8), 8775 (IV)r->float_min_offset, (UV)r->float_max_offset); 8776 } 8777 if (r->check_substr || r->check_utf8) 8778 PerlIO_printf(Perl_debug_log, 8779 (const char *) 8780 (r->check_substr == r->float_substr 8781 && r->check_utf8 == r->float_utf8 8782 ? "(checking floating" : "(checking anchored")); 8783 if (r->extflags & RXf_NOSCAN) 8784 PerlIO_printf(Perl_debug_log, " noscan"); 8785 if (r->extflags & RXf_CHECK_ALL) 8786 PerlIO_printf(Perl_debug_log, " isall"); 8787 if (r->check_substr || r->check_utf8) 8788 PerlIO_printf(Perl_debug_log, ") "); 8789 8790 if (ri->regstclass) { 8791 regprop(r, sv, ri->regstclass); 8792 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv)); 8793 } 8794 if (r->extflags & RXf_ANCH) { 8795 PerlIO_printf(Perl_debug_log, "anchored"); 8796 if (r->extflags & RXf_ANCH_BOL) 8797 PerlIO_printf(Perl_debug_log, "(BOL)"); 8798 if (r->extflags & RXf_ANCH_MBOL) 8799 PerlIO_printf(Perl_debug_log, "(MBOL)"); 8800 if (r->extflags & RXf_ANCH_SBOL) 8801 PerlIO_printf(Perl_debug_log, "(SBOL)"); 8802 if (r->extflags & RXf_ANCH_GPOS) 8803 PerlIO_printf(Perl_debug_log, "(GPOS)"); 8804 PerlIO_putc(Perl_debug_log, ' '); 8805 } 8806 if (r->extflags & RXf_GPOS_SEEN) 8807 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs); 8808 if (r->intflags & PREGf_SKIP) 8809 PerlIO_printf(Perl_debug_log, "plus "); 8810 if (r->intflags & PREGf_IMPLICIT) 8811 PerlIO_printf(Perl_debug_log, "implicit "); 8812 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen); 8813 if (r->extflags & RXf_EVAL_SEEN) 8814 PerlIO_printf(Perl_debug_log, "with eval "); 8815 PerlIO_printf(Perl_debug_log, "\n"); 8816 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags)); 8817 #else 8818 PERL_UNUSED_CONTEXT; 8819 PERL_UNUSED_ARG(r); 8820 #endif /* DEBUGGING */ 8821 } 8822 8823 /* 8824 - regprop - printable representation of opcode 8825 */ 8826 void 8827 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) 8828 { 8829 #ifdef DEBUGGING 8830 dVAR; 8831 register int k; 8832 RXi_GET_DECL(prog,progi); 8833 GET_RE_DEBUG_FLAGS_DECL; 8834 8835 8836 sv_setpvn(sv, "", 0); 8837 8838 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */ 8839 /* It would be nice to FAIL() here, but this may be called from 8840 regexec.c, and it would be hard to supply pRExC_state. */ 8841 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX); 8842 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */ 8843 8844 k = PL_regkind[OP(o)]; 8845 8846 if (k == EXACT) { 8847 sv_catpvs(sv, " "); 8848 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT) 8849 * is a crude hack but it may be the best for now since 8850 * we have no flag "this EXACTish node was UTF-8" 8851 * --jhi */ 8852 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1], 8853 PERL_PV_ESCAPE_UNI_DETECT | 8854 PERL_PV_PRETTY_ELLIPSES | 8855 PERL_PV_PRETTY_LTGT | 8856 PERL_PV_PRETTY_NOCLEAR 8857 ); 8858 } else if (k == TRIE) { 8859 /* print the details of the trie in dumpuntil instead, as 8860 * progi->data isn't available here */ 8861 const char op = OP(o); 8862 const U32 n = ARG(o); 8863 const reg_ac_data * const ac = IS_TRIE_AC(op) ? 8864 (reg_ac_data *)progi->data->data[n] : 8865 NULL; 8866 const reg_trie_data * const trie 8867 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie]; 8868 8869 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]); 8870 DEBUG_TRIE_COMPILE_r( 8871 Perl_sv_catpvf(aTHX_ sv, 8872 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">", 8873 (UV)trie->startstate, 8874 (IV)trie->statecount-1, /* -1 because of the unused 0 element */ 8875 (UV)trie->wordcount, 8876 (UV)trie->minlen, 8877 (UV)trie->maxlen, 8878 (UV)TRIE_CHARCOUNT(trie), 8879 (UV)trie->uniquecharcount 8880 ) 8881 ); 8882 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) { 8883 int i; 8884 int rangestart = -1; 8885 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie); 8886 sv_catpvs(sv, "["); 8887 for (i = 0; i <= 256; i++) { 8888 if (i < 256 && BITMAP_TEST(bitmap,i)) { 8889 if (rangestart == -1) 8890 rangestart = i; 8891 } else if (rangestart != -1) { 8892 if (i <= rangestart + 3) 8893 for (; rangestart < i; rangestart++) 8894 put_byte(sv, rangestart); 8895 else { 8896 put_byte(sv, rangestart); 8897 sv_catpvs(sv, "-"); 8898 put_byte(sv, i - 1); 8899 } 8900 rangestart = -1; 8901 } 8902 } 8903 sv_catpvs(sv, "]"); 8904 } 8905 8906 } else if (k == CURLY) { 8907 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX) 8908 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */ 8909 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o)); 8910 } 8911 else if (k == WHILEM && o->flags) /* Ordinal/of */ 8912 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4); 8913 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) { 8914 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */ 8915 if ( prog->paren_names ) { 8916 if ( k != REF || OP(o) < NREF) { 8917 AV *list= (AV *)progi->data->data[progi->name_list_idx]; 8918 SV **name= av_fetch(list, ARG(o), 0 ); 8919 if (name) 8920 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name)); 8921 } 8922 else { 8923 AV *list= (AV *)progi->data->data[ progi->name_list_idx ]; 8924 SV *sv_dat=(SV*)progi->data->data[ ARG( o ) ]; 8925 I32 *nums=(I32*)SvPVX(sv_dat); 8926 SV **name= av_fetch(list, nums[0], 0 ); 8927 I32 n; 8928 if (name) { 8929 for ( n=0; n<SvIVX(sv_dat); n++ ) { 8930 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf, 8931 (n ? "," : ""), (IV)nums[n]); 8932 } 8933 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name)); 8934 } 8935 } 8936 } 8937 } else if (k == GOSUB) 8938 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */ 8939 else if (k == VERB) { 8940 if (!o->flags) 8941 Perl_sv_catpvf(aTHX_ sv, ":%"SVf, 8942 SVfARG((SV*)progi->data->data[ ARG( o ) ])); 8943 } else if (k == LOGICAL) 8944 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */ 8945 else if (k == FOLDCHAR) 8946 Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) ); 8947 else if (k == ANYOF) { 8948 int i, rangestart = -1; 8949 const U8 flags = ANYOF_FLAGS(o); 8950 8951 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */ 8952 static const char * const anyofs[] = { 8953 "\\w", 8954 "\\W", 8955 "\\s", 8956 "\\S", 8957 "\\d", 8958 "\\D", 8959 "[:alnum:]", 8960 "[:^alnum:]", 8961 "[:alpha:]", 8962 "[:^alpha:]", 8963 "[:ascii:]", 8964 "[:^ascii:]", 8965 "[:ctrl:]", 8966 "[:^ctrl:]", 8967 "[:graph:]", 8968 "[:^graph:]", 8969 "[:lower:]", 8970 "[:^lower:]", 8971 "[:print:]", 8972 "[:^print:]", 8973 "[:punct:]", 8974 "[:^punct:]", 8975 "[:upper:]", 8976 "[:^upper:]", 8977 "[:xdigit:]", 8978 "[:^xdigit:]", 8979 "[:space:]", 8980 "[:^space:]", 8981 "[:blank:]", 8982 "[:^blank:]" 8983 }; 8984 8985 if (flags & ANYOF_LOCALE) 8986 sv_catpvs(sv, "{loc}"); 8987 if (flags & ANYOF_FOLD) 8988 sv_catpvs(sv, "{i}"); 8989 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); 8990 if (flags & ANYOF_INVERT) 8991 sv_catpvs(sv, "^"); 8992 for (i = 0; i <= 256; i++) { 8993 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) { 8994 if (rangestart == -1) 8995 rangestart = i; 8996 } else if (rangestart != -1) { 8997 if (i <= rangestart + 3) 8998 for (; rangestart < i; rangestart++) 8999 put_byte(sv, rangestart); 9000 else { 9001 put_byte(sv, rangestart); 9002 sv_catpvs(sv, "-"); 9003 put_byte(sv, i - 1); 9004 } 9005 rangestart = -1; 9006 } 9007 } 9008 9009 if (o->flags & ANYOF_CLASS) 9010 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++) 9011 if (ANYOF_CLASS_TEST(o,i)) 9012 sv_catpv(sv, anyofs[i]); 9013 9014 if (flags & ANYOF_UNICODE) 9015 sv_catpvs(sv, "{unicode}"); 9016 else if (flags & ANYOF_UNICODE_ALL) 9017 sv_catpvs(sv, "{unicode_all}"); 9018 9019 { 9020 SV *lv; 9021 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0); 9022 9023 if (lv) { 9024 if (sw) { 9025 U8 s[UTF8_MAXBYTES_CASE+1]; 9026 9027 for (i = 0; i <= 256; i++) { /* just the first 256 */ 9028 uvchr_to_utf8(s, i); 9029 9030 if (i < 256 && swash_fetch(sw, s, TRUE)) { 9031 if (rangestart == -1) 9032 rangestart = i; 9033 } else if (rangestart != -1) { 9034 if (i <= rangestart + 3) 9035 for (; rangestart < i; rangestart++) { 9036 const U8 * const e = uvchr_to_utf8(s,rangestart); 9037 U8 *p; 9038 for(p = s; p < e; p++) 9039 put_byte(sv, *p); 9040 } 9041 else { 9042 const U8 *e = uvchr_to_utf8(s,rangestart); 9043 U8 *p; 9044 for (p = s; p < e; p++) 9045 put_byte(sv, *p); 9046 sv_catpvs(sv, "-"); 9047 e = uvchr_to_utf8(s, i-1); 9048 for (p = s; p < e; p++) 9049 put_byte(sv, *p); 9050 } 9051 rangestart = -1; 9052 } 9053 } 9054 9055 sv_catpvs(sv, "..."); /* et cetera */ 9056 } 9057 9058 { 9059 char *s = savesvpv(lv); 9060 char * const origs = s; 9061 9062 while (*s && *s != '\n') 9063 s++; 9064 9065 if (*s == '\n') { 9066 const char * const t = ++s; 9067 9068 while (*s) { 9069 if (*s == '\n') 9070 *s = ' '; 9071 s++; 9072 } 9073 if (s[-1] == ' ') 9074 s[-1] = 0; 9075 9076 sv_catpv(sv, t); 9077 } 9078 9079 Safefree(origs); 9080 } 9081 } 9082 } 9083 9084 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]); 9085 } 9086 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) 9087 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags)); 9088 #else 9089 PERL_UNUSED_CONTEXT; 9090 PERL_UNUSED_ARG(sv); 9091 PERL_UNUSED_ARG(o); 9092 PERL_UNUSED_ARG(prog); 9093 #endif /* DEBUGGING */ 9094 } 9095 9096 SV * 9097 Perl_re_intuit_string(pTHX_ REGEXP * const prog) 9098 { /* Assume that RE_INTUIT is set */ 9099 dVAR; 9100 GET_RE_DEBUG_FLAGS_DECL; 9101 PERL_UNUSED_CONTEXT; 9102 9103 DEBUG_COMPILE_r( 9104 { 9105 const char * const s = SvPV_nolen_const(prog->check_substr 9106 ? prog->check_substr : prog->check_utf8); 9107 9108 if (!PL_colorset) reginitcolors(); 9109 PerlIO_printf(Perl_debug_log, 9110 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n", 9111 PL_colors[4], 9112 prog->check_substr ? "" : "utf8 ", 9113 PL_colors[5],PL_colors[0], 9114 s, 9115 PL_colors[1], 9116 (strlen(s) > 60 ? "..." : "")); 9117 } ); 9118 9119 return prog->check_substr ? prog->check_substr : prog->check_utf8; 9120 } 9121 9122 /* 9123 pregfree() 9124 9125 handles refcounting and freeing the perl core regexp structure. When 9126 it is necessary to actually free the structure the first thing it 9127 does is call the 'free' method of the regexp_engine associated to to 9128 the regexp, allowing the handling of the void *pprivate; member 9129 first. (This routine is not overridable by extensions, which is why 9130 the extensions free is called first.) 9131 9132 See regdupe and regdupe_internal if you change anything here. 9133 */ 9134 #ifndef PERL_IN_XSUB_RE 9135 void 9136 Perl_pregfree(pTHX_ struct regexp *r) 9137 { 9138 dVAR; 9139 GET_RE_DEBUG_FLAGS_DECL; 9140 9141 if (!r || (--r->refcnt > 0)) 9142 return; 9143 if (r->mother_re) { 9144 ReREFCNT_dec(r->mother_re); 9145 } else { 9146 CALLREGFREE_PVT(r); /* free the private data */ 9147 if (r->paren_names) 9148 SvREFCNT_dec(r->paren_names); 9149 Safefree(r->wrapped); 9150 } 9151 if (r->substrs) { 9152 if (r->anchored_substr) 9153 SvREFCNT_dec(r->anchored_substr); 9154 if (r->anchored_utf8) 9155 SvREFCNT_dec(r->anchored_utf8); 9156 if (r->float_substr) 9157 SvREFCNT_dec(r->float_substr); 9158 if (r->float_utf8) 9159 SvREFCNT_dec(r->float_utf8); 9160 Safefree(r->substrs); 9161 } 9162 RX_MATCH_COPY_FREE(r); 9163 #ifdef PERL_OLD_COPY_ON_WRITE 9164 if (r->saved_copy) 9165 SvREFCNT_dec(r->saved_copy); 9166 #endif 9167 Safefree(r->swap); 9168 Safefree(r->offs); 9169 Safefree(r); 9170 } 9171 9172 /* reg_temp_copy() 9173 9174 This is a hacky workaround to the structural issue of match results 9175 being stored in the regexp structure which is in turn stored in 9176 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern 9177 could be PL_curpm in multiple contexts, and could require multiple 9178 result sets being associated with the pattern simultaneously, such 9179 as when doing a recursive match with (??{$qr}) 9180 9181 The solution is to make a lightweight copy of the regexp structure 9182 when a qr// is returned from the code executed by (??{$qr}) this 9183 lightweight copy doesnt actually own any of its data except for 9184 the starp/end and the actual regexp structure itself. 9185 9186 */ 9187 9188 9189 regexp * 9190 Perl_reg_temp_copy (pTHX_ struct regexp *r) { 9191 regexp *ret; 9192 register const I32 npar = r->nparens+1; 9193 (void)ReREFCNT_inc(r); 9194 Newx(ret, 1, regexp); 9195 StructCopy(r, ret, regexp); 9196 Newx(ret->offs, npar, regexp_paren_pair); 9197 Copy(r->offs, ret->offs, npar, regexp_paren_pair); 9198 ret->refcnt = 1; 9199 if (r->substrs) { 9200 Newx(ret->substrs, 1, struct reg_substr_data); 9201 StructCopy(r->substrs, ret->substrs, struct reg_substr_data); 9202 9203 SvREFCNT_inc_void(ret->anchored_substr); 9204 SvREFCNT_inc_void(ret->anchored_utf8); 9205 SvREFCNT_inc_void(ret->float_substr); 9206 SvREFCNT_inc_void(ret->float_utf8); 9207 9208 /* check_substr and check_utf8, if non-NULL, point to either their 9209 anchored or float namesakes, and don't hold a second reference. */ 9210 } 9211 RX_MATCH_COPIED_off(ret); 9212 #ifdef PERL_OLD_COPY_ON_WRITE 9213 ret->saved_copy = NULL; 9214 #endif 9215 ret->mother_re = r; 9216 ret->swap = NULL; 9217 9218 return ret; 9219 } 9220 #endif 9221 9222 /* regfree_internal() 9223 9224 Free the private data in a regexp. This is overloadable by 9225 extensions. Perl takes care of the regexp structure in pregfree(), 9226 this covers the *pprivate pointer which technically perldoesnt 9227 know about, however of course we have to handle the 9228 regexp_internal structure when no extension is in use. 9229 9230 Note this is called before freeing anything in the regexp 9231 structure. 9232 */ 9233 9234 void 9235 Perl_regfree_internal(pTHX_ REGEXP * const r) 9236 { 9237 dVAR; 9238 RXi_GET_DECL(r,ri); 9239 GET_RE_DEBUG_FLAGS_DECL; 9240 9241 DEBUG_COMPILE_r({ 9242 if (!PL_colorset) 9243 reginitcolors(); 9244 { 9245 SV *dsv= sv_newmortal(); 9246 RE_PV_QUOTED_DECL(s, (r->extflags & RXf_UTF8), 9247 dsv, r->precomp, r->prelen, 60); 9248 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n", 9249 PL_colors[4],PL_colors[5],s); 9250 } 9251 }); 9252 #ifdef RE_TRACK_PATTERN_OFFSETS 9253 if (ri->u.offsets) 9254 Safefree(ri->u.offsets); /* 20010421 MJD */ 9255 #endif 9256 if (ri->data) { 9257 int n = ri->data->count; 9258 PAD* new_comppad = NULL; 9259 PAD* old_comppad; 9260 PADOFFSET refcnt; 9261 9262 while (--n >= 0) { 9263 /* If you add a ->what type here, update the comment in regcomp.h */ 9264 switch (ri->data->what[n]) { 9265 case 's': 9266 case 'S': 9267 case 'u': 9268 SvREFCNT_dec((SV*)ri->data->data[n]); 9269 break; 9270 case 'f': 9271 Safefree(ri->data->data[n]); 9272 break; 9273 case 'p': 9274 new_comppad = (AV*)ri->data->data[n]; 9275 break; 9276 case 'o': 9277 if (new_comppad == NULL) 9278 Perl_croak(aTHX_ "panic: pregfree comppad"); 9279 PAD_SAVE_LOCAL(old_comppad, 9280 /* Watch out for global destruction's random ordering. */ 9281 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL 9282 ); 9283 OP_REFCNT_LOCK; 9284 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]); 9285 OP_REFCNT_UNLOCK; 9286 if (!refcnt) 9287 op_free((OP_4tree*)ri->data->data[n]); 9288 9289 PAD_RESTORE_LOCAL(old_comppad); 9290 SvREFCNT_dec((SV*)new_comppad); 9291 new_comppad = NULL; 9292 break; 9293 case 'n': 9294 break; 9295 case 'T': 9296 { /* Aho Corasick add-on structure for a trie node. 9297 Used in stclass optimization only */ 9298 U32 refcount; 9299 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n]; 9300 OP_REFCNT_LOCK; 9301 refcount = --aho->refcount; 9302 OP_REFCNT_UNLOCK; 9303 if ( !refcount ) { 9304 PerlMemShared_free(aho->states); 9305 PerlMemShared_free(aho->fail); 9306 /* do this last!!!! */ 9307 PerlMemShared_free(ri->data->data[n]); 9308 PerlMemShared_free(ri->regstclass); 9309 } 9310 } 9311 break; 9312 case 't': 9313 { 9314 /* trie structure. */ 9315 U32 refcount; 9316 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n]; 9317 OP_REFCNT_LOCK; 9318 refcount = --trie->refcount; 9319 OP_REFCNT_UNLOCK; 9320 if ( !refcount ) { 9321 PerlMemShared_free(trie->charmap); 9322 PerlMemShared_free(trie->states); 9323 PerlMemShared_free(trie->trans); 9324 if (trie->bitmap) 9325 PerlMemShared_free(trie->bitmap); 9326 if (trie->wordlen) 9327 PerlMemShared_free(trie->wordlen); 9328 if (trie->jump) 9329 PerlMemShared_free(trie->jump); 9330 if (trie->nextword) 9331 PerlMemShared_free(trie->nextword); 9332 /* do this last!!!! */ 9333 PerlMemShared_free(ri->data->data[n]); 9334 } 9335 } 9336 break; 9337 default: 9338 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]); 9339 } 9340 } 9341 Safefree(ri->data->what); 9342 Safefree(ri->data); 9343 } 9344 9345 Safefree(ri); 9346 } 9347 9348 #define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t)) 9349 #define av_dup_inc(s,t) (AV*)SvREFCNT_inc(sv_dup((SV*)s,t)) 9350 #define hv_dup_inc(s,t) (HV*)SvREFCNT_inc(sv_dup((SV*)s,t)) 9351 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL) 9352 9353 /* 9354 re_dup - duplicate a regexp. 9355 9356 This routine is expected to clone a given regexp structure. It is not 9357 compiler under USE_ITHREADS. 9358 9359 After all of the core data stored in struct regexp is duplicated 9360 the regexp_engine.dupe method is used to copy any private data 9361 stored in the *pprivate pointer. This allows extensions to handle 9362 any duplication it needs to do. 9363 9364 See pregfree() and regfree_internal() if you change anything here. 9365 */ 9366 #if defined(USE_ITHREADS) 9367 #ifndef PERL_IN_XSUB_RE 9368 regexp * 9369 Perl_re_dup(pTHX_ const regexp *r, CLONE_PARAMS *param) 9370 { 9371 dVAR; 9372 regexp *ret; 9373 I32 npar; 9374 9375 if (!r) 9376 return (REGEXP *)NULL; 9377 9378 if ((ret = (REGEXP *)ptr_table_fetch(PL_ptr_table, r))) 9379 return ret; 9380 9381 9382 npar = r->nparens+1; 9383 Newx(ret, 1, regexp); 9384 StructCopy(r, ret, regexp); 9385 Newx(ret->offs, npar, regexp_paren_pair); 9386 Copy(r->offs, ret->offs, npar, regexp_paren_pair); 9387 if(ret->swap) { 9388 /* no need to copy these */ 9389 Newx(ret->swap, npar, regexp_paren_pair); 9390 } 9391 9392 if (ret->substrs) { 9393 /* Do it this way to avoid reading from *r after the StructCopy(). 9394 That way, if any of the sv_dup_inc()s dislodge *r from the L1 9395 cache, it doesn't matter. */ 9396 const bool anchored = r->check_substr == r->anchored_substr; 9397 Newx(ret->substrs, 1, struct reg_substr_data); 9398 StructCopy(r->substrs, ret->substrs, struct reg_substr_data); 9399 9400 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param); 9401 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param); 9402 ret->float_substr = sv_dup_inc(ret->float_substr, param); 9403 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param); 9404 9405 /* check_substr and check_utf8, if non-NULL, point to either their 9406 anchored or float namesakes, and don't hold a second reference. */ 9407 9408 if (ret->check_substr) { 9409 if (anchored) { 9410 assert(r->check_utf8 == r->anchored_utf8); 9411 ret->check_substr = ret->anchored_substr; 9412 ret->check_utf8 = ret->anchored_utf8; 9413 } else { 9414 assert(r->check_substr == r->float_substr); 9415 assert(r->check_utf8 == r->float_utf8); 9416 ret->check_substr = ret->float_substr; 9417 ret->check_utf8 = ret->float_utf8; 9418 } 9419 } 9420 } 9421 9422 ret->wrapped = SAVEPVN(ret->wrapped, ret->wraplen+1); 9423 ret->precomp = ret->wrapped + (ret->precomp - ret->wrapped); 9424 ret->paren_names = hv_dup_inc(ret->paren_names, param); 9425 9426 if (ret->pprivate) 9427 RXi_SET(ret,CALLREGDUPE_PVT(ret,param)); 9428 9429 if (RX_MATCH_COPIED(ret)) 9430 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen); 9431 else 9432 ret->subbeg = NULL; 9433 #ifdef PERL_OLD_COPY_ON_WRITE 9434 ret->saved_copy = NULL; 9435 #endif 9436 9437 ret->mother_re = NULL; 9438 ret->gofs = 0; 9439 ret->seen_evals = 0; 9440 9441 ptr_table_store(PL_ptr_table, r, ret); 9442 return ret; 9443 } 9444 #endif /* PERL_IN_XSUB_RE */ 9445 9446 /* 9447 regdupe_internal() 9448 9449 This is the internal complement to regdupe() which is used to copy 9450 the structure pointed to by the *pprivate pointer in the regexp. 9451 This is the core version of the extension overridable cloning hook. 9452 The regexp structure being duplicated will be copied by perl prior 9453 to this and will be provided as the regexp *r argument, however 9454 with the /old/ structures pprivate pointer value. Thus this routine 9455 may override any copying normally done by perl. 9456 9457 It returns a pointer to the new regexp_internal structure. 9458 */ 9459 9460 void * 9461 Perl_regdupe_internal(pTHX_ REGEXP * const r, CLONE_PARAMS *param) 9462 { 9463 dVAR; 9464 regexp_internal *reti; 9465 int len, npar; 9466 RXi_GET_DECL(r,ri); 9467 9468 npar = r->nparens+1; 9469 len = ProgLen(ri); 9470 9471 Newxc(reti, sizeof(regexp_internal) + (len+1)*sizeof(regnode), char, regexp_internal); 9472 Copy(ri->program, reti->program, len+1, regnode); 9473 9474 9475 reti->regstclass = NULL; 9476 9477 if (ri->data) { 9478 struct reg_data *d; 9479 const int count = ri->data->count; 9480 int i; 9481 9482 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *), 9483 char, struct reg_data); 9484 Newx(d->what, count, U8); 9485 9486 d->count = count; 9487 for (i = 0; i < count; i++) { 9488 d->what[i] = ri->data->what[i]; 9489 switch (d->what[i]) { 9490 /* legal options are one of: sSfpontTu 9491 see also regcomp.h and pregfree() */ 9492 case 's': 9493 case 'S': 9494 case 'p': /* actually an AV, but the dup function is identical. */ 9495 case 'u': /* actually an HV, but the dup function is identical. */ 9496 d->data[i] = sv_dup_inc((SV *)ri->data->data[i], param); 9497 break; 9498 case 'f': 9499 /* This is cheating. */ 9500 Newx(d->data[i], 1, struct regnode_charclass_class); 9501 StructCopy(ri->data->data[i], d->data[i], 9502 struct regnode_charclass_class); 9503 reti->regstclass = (regnode*)d->data[i]; 9504 break; 9505 case 'o': 9506 /* Compiled op trees are readonly and in shared memory, 9507 and can thus be shared without duplication. */ 9508 OP_REFCNT_LOCK; 9509 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]); 9510 OP_REFCNT_UNLOCK; 9511 break; 9512 case 'T': 9513 /* Trie stclasses are readonly and can thus be shared 9514 * without duplication. We free the stclass in pregfree 9515 * when the corresponding reg_ac_data struct is freed. 9516 */ 9517 reti->regstclass= ri->regstclass; 9518 /* Fall through */ 9519 case 't': 9520 OP_REFCNT_LOCK; 9521 ((reg_trie_data*)ri->data->data[i])->refcount++; 9522 OP_REFCNT_UNLOCK; 9523 /* Fall through */ 9524 case 'n': 9525 d->data[i] = ri->data->data[i]; 9526 break; 9527 default: 9528 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]); 9529 } 9530 } 9531 9532 reti->data = d; 9533 } 9534 else 9535 reti->data = NULL; 9536 9537 reti->name_list_idx = ri->name_list_idx; 9538 9539 #ifdef RE_TRACK_PATTERN_OFFSETS 9540 if (ri->u.offsets) { 9541 Newx(reti->u.offsets, 2*len+1, U32); 9542 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32); 9543 } 9544 #else 9545 SetProgLen(reti,len); 9546 #endif 9547 9548 return (void*)reti; 9549 } 9550 9551 #endif /* USE_ITHREADS */ 9552 9553 /* 9554 reg_stringify() 9555 9556 converts a regexp embedded in a MAGIC struct to its stringified form, 9557 caching the converted form in the struct and returns the cached 9558 string. 9559 9560 If lp is nonnull then it is used to return the length of the 9561 resulting string 9562 9563 If flags is nonnull and the returned string contains UTF8 then 9564 (*flags & 1) will be true. 9565 9566 If haseval is nonnull then it is used to return whether the pattern 9567 contains evals. 9568 9569 Normally called via macro: 9570 9571 CALLREG_STRINGIFY(mg,&len,&utf8); 9572 9573 And internally with 9574 9575 CALLREG_AS_STR(mg,&lp,&flags,&haseval) 9576 9577 See sv_2pv_flags() in sv.c for an example of internal usage. 9578 9579 */ 9580 #ifndef PERL_IN_XSUB_RE 9581 9582 char * 9583 Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) { 9584 dVAR; 9585 const regexp * const re = (regexp *)mg->mg_obj; 9586 if (haseval) 9587 *haseval = re->seen_evals; 9588 if (flags) 9589 *flags = ((re->extflags & RXf_UTF8) ? 1 : 0); 9590 if (lp) 9591 *lp = re->wraplen; 9592 return re->wrapped; 9593 } 9594 9595 /* 9596 - regnext - dig the "next" pointer out of a node 9597 */ 9598 regnode * 9599 Perl_regnext(pTHX_ register regnode *p) 9600 { 9601 dVAR; 9602 register I32 offset; 9603 9604 if (!p) 9605 return(NULL); 9606 9607 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p)); 9608 if (offset == 0) 9609 return(NULL); 9610 9611 return(p+offset); 9612 } 9613 #endif 9614 9615 STATIC void 9616 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...) 9617 { 9618 va_list args; 9619 STRLEN l1 = strlen(pat1); 9620 STRLEN l2 = strlen(pat2); 9621 char buf[512]; 9622 SV *msv; 9623 const char *message; 9624 9625 if (l1 > 510) 9626 l1 = 510; 9627 if (l1 + l2 > 510) 9628 l2 = 510 - l1; 9629 Copy(pat1, buf, l1 , char); 9630 Copy(pat2, buf + l1, l2 , char); 9631 buf[l1 + l2] = '\n'; 9632 buf[l1 + l2 + 1] = '\0'; 9633 #ifdef I_STDARG 9634 /* ANSI variant takes additional second argument */ 9635 va_start(args, pat2); 9636 #else 9637 va_start(args); 9638 #endif 9639 msv = vmess(buf, &args); 9640 va_end(args); 9641 message = SvPV_const(msv,l1); 9642 if (l1 > 512) 9643 l1 = 512; 9644 Copy(message, buf, l1 , char); 9645 buf[l1-1] = '\0'; /* Overwrite \n */ 9646 Perl_croak(aTHX_ "%s", buf); 9647 } 9648 9649 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */ 9650 9651 #ifndef PERL_IN_XSUB_RE 9652 void 9653 Perl_save_re_context(pTHX) 9654 { 9655 dVAR; 9656 9657 struct re_save_state *state; 9658 9659 SAVEVPTR(PL_curcop); 9660 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1); 9661 9662 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix); 9663 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE; 9664 SSPUSHINT(SAVEt_RE_STATE); 9665 9666 Copy(&PL_reg_state, state, 1, struct re_save_state); 9667 9668 PL_reg_start_tmp = 0; 9669 PL_reg_start_tmpl = 0; 9670 PL_reg_oldsaved = NULL; 9671 PL_reg_oldsavedlen = 0; 9672 PL_reg_maxiter = 0; 9673 PL_reg_leftiter = 0; 9674 PL_reg_poscache = NULL; 9675 PL_reg_poscache_size = 0; 9676 #ifdef PERL_OLD_COPY_ON_WRITE 9677 PL_nrs = NULL; 9678 #endif 9679 9680 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */ 9681 if (PL_curpm) { 9682 const REGEXP * const rx = PM_GETRE(PL_curpm); 9683 if (rx) { 9684 U32 i; 9685 for (i = 1; i <= rx->nparens; i++) { 9686 char digits[TYPE_CHARS(long)]; 9687 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i); 9688 GV *const *const gvp 9689 = (GV**)hv_fetch(PL_defstash, digits, len, 0); 9690 9691 if (gvp) { 9692 GV * const gv = *gvp; 9693 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv)) 9694 save_scalar(gv); 9695 } 9696 } 9697 } 9698 } 9699 } 9700 #endif 9701 9702 static void 9703 clear_re(pTHX_ void *r) 9704 { 9705 dVAR; 9706 ReREFCNT_dec((regexp *)r); 9707 } 9708 9709 #ifdef DEBUGGING 9710 9711 STATIC void 9712 S_put_byte(pTHX_ SV *sv, int c) 9713 { 9714 /* Our definition of isPRINT() ignores locales, so only bytes that are 9715 not part of UTF-8 are considered printable. I assume that the same 9716 holds for UTF-EBCDIC. 9717 Also, code point 255 is not printable in either (it's E0 in EBCDIC, 9718 which Wikipedia says: 9719 9720 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all 9721 ones (binary 1111 1111, hexadecimal FF). It is similar, but not 9722 identical, to the ASCII delete (DEL) or rubout control character. 9723 ) So the old condition can be simplified to !isPRINT(c) */ 9724 if (!isPRINT(c)) 9725 Perl_sv_catpvf(aTHX_ sv, "\\%o", c); 9726 else { 9727 const char string = c; 9728 if (c == '-' || c == ']' || c == '\\' || c == '^') 9729 sv_catpvs(sv, "\\"); 9730 sv_catpvn(sv, &string, 1); 9731 } 9732 } 9733 9734 9735 #define CLEAR_OPTSTART \ 9736 if (optstart) STMT_START { \ 9737 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \ 9738 optstart=NULL; \ 9739 } STMT_END 9740 9741 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1); 9742 9743 STATIC const regnode * 9744 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, 9745 const regnode *last, const regnode *plast, 9746 SV* sv, I32 indent, U32 depth) 9747 { 9748 dVAR; 9749 register U8 op = PSEUDO; /* Arbitrary non-END op. */ 9750 register const regnode *next; 9751 const regnode *optstart= NULL; 9752 9753 RXi_GET_DECL(r,ri); 9754 GET_RE_DEBUG_FLAGS_DECL; 9755 9756 #ifdef DEBUG_DUMPUNTIL 9757 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start, 9758 last ? last-start : 0,plast ? plast-start : 0); 9759 #endif 9760 9761 if (plast && plast < last) 9762 last= plast; 9763 9764 while (PL_regkind[op] != END && (!last || node < last)) { 9765 /* While that wasn't END last time... */ 9766 NODE_ALIGN(node); 9767 op = OP(node); 9768 if (op == CLOSE || op == WHILEM) 9769 indent--; 9770 next = regnext((regnode *)node); 9771 9772 /* Where, what. */ 9773 if (OP(node) == OPTIMIZED) { 9774 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE)) 9775 optstart = node; 9776 else 9777 goto after_print; 9778 } else 9779 CLEAR_OPTSTART; 9780 9781 regprop(r, sv, node); 9782 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start), 9783 (int)(2*indent + 1), "", SvPVX_const(sv)); 9784 9785 if (OP(node) != OPTIMIZED) { 9786 if (next == NULL) /* Next ptr. */ 9787 PerlIO_printf(Perl_debug_log, " (0)"); 9788 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH ) 9789 PerlIO_printf(Perl_debug_log, " (FAIL)"); 9790 else 9791 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start)); 9792 (void)PerlIO_putc(Perl_debug_log, '\n'); 9793 } 9794 9795 after_print: 9796 if (PL_regkind[(U8)op] == BRANCHJ) { 9797 assert(next); 9798 { 9799 register const regnode *nnode = (OP(next) == LONGJMP 9800 ? regnext((regnode *)next) 9801 : next); 9802 if (last && nnode > last) 9803 nnode = last; 9804 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode); 9805 } 9806 } 9807 else if (PL_regkind[(U8)op] == BRANCH) { 9808 assert(next); 9809 DUMPUNTIL(NEXTOPER(node), next); 9810 } 9811 else if ( PL_regkind[(U8)op] == TRIE ) { 9812 const regnode *this_trie = node; 9813 const char op = OP(node); 9814 const U32 n = ARG(node); 9815 const reg_ac_data * const ac = op>=AHOCORASICK ? 9816 (reg_ac_data *)ri->data->data[n] : 9817 NULL; 9818 const reg_trie_data * const trie = 9819 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie]; 9820 #ifdef DEBUGGING 9821 AV *const trie_words = (AV *) ri->data->data[n + TRIE_WORDS_OFFSET]; 9822 #endif 9823 const regnode *nextbranch= NULL; 9824 I32 word_idx; 9825 sv_setpvn(sv, "", 0); 9826 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) { 9827 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0); 9828 9829 PerlIO_printf(Perl_debug_log, "%*s%s ", 9830 (int)(2*(indent+3)), "", 9831 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60, 9832 PL_colors[0], PL_colors[1], 9833 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) | 9834 PERL_PV_PRETTY_ELLIPSES | 9835 PERL_PV_PRETTY_LTGT 9836 ) 9837 : "???" 9838 ); 9839 if (trie->jump) { 9840 U16 dist= trie->jump[word_idx+1]; 9841 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n", 9842 (UV)((dist ? this_trie + dist : next) - start)); 9843 if (dist) { 9844 if (!nextbranch) 9845 nextbranch= this_trie + trie->jump[0]; 9846 DUMPUNTIL(this_trie + dist, nextbranch); 9847 } 9848 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH) 9849 nextbranch= regnext((regnode *)nextbranch); 9850 } else { 9851 PerlIO_printf(Perl_debug_log, "\n"); 9852 } 9853 } 9854 if (last && next > last) 9855 node= last; 9856 else 9857 node= next; 9858 } 9859 else if ( op == CURLY ) { /* "next" might be very big: optimizer */ 9860 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, 9861 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1); 9862 } 9863 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) { 9864 assert(next); 9865 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next); 9866 } 9867 else if ( op == PLUS || op == STAR) { 9868 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1); 9869 } 9870 else if (op == ANYOF) { 9871 /* arglen 1 + class block */ 9872 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE) 9873 ? ANYOF_CLASS_SKIP : ANYOF_SKIP); 9874 node = NEXTOPER(node); 9875 } 9876 else if (PL_regkind[(U8)op] == EXACT) { 9877 /* Literal string, where present. */ 9878 node += NODE_SZ_STR(node) - 1; 9879 node = NEXTOPER(node); 9880 } 9881 else { 9882 node = NEXTOPER(node); 9883 node += regarglen[(U8)op]; 9884 } 9885 if (op == CURLYX || op == OPEN) 9886 indent++; 9887 } 9888 CLEAR_OPTSTART; 9889 #ifdef DEBUG_DUMPUNTIL 9890 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent); 9891 #endif 9892 return node; 9893 } 9894 9895 #endif /* DEBUGGING */ 9896 9897 /* 9898 * Local variables: 9899 * c-indentation-style: bsd 9900 * c-basic-offset: 4 9901 * indent-tabs-mode: t 9902 * End: 9903 * 9904 * ex: set ts=8 sts=4 sw=4 noet: 9905 */ 9906