1 /* regcomp.c 2 */ 3 4 /* 5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee 6 */ 7 8 /* This file contains functions for compiling a regular expression. See 9 * also regexec.c which funnily enough, contains functions for executing 10 * a regular expression. 11 * 12 * This file is also copied at build time to ext/re/re_comp.c, where 13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT. 14 * This causes the main functions to be compiled under new names and with 15 * debugging support added, which makes "use re 'debug'" work. 16 */ 17 18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not 19 * confused with the original package (see point 3 below). Thanks, Henry! 20 */ 21 22 /* Additional note: this code is very heavily munged from Henry's version 23 * in places. In some spots I've traded clarity for efficiency, so don't 24 * blame Henry for some of the lack of readability. 25 */ 26 27 /* The names of the functions have been changed from regcomp and 28 * regexec to pregcomp and pregexec in order to avoid conflicts 29 * with the POSIX routines of the same names. 30 */ 31 32 #ifdef PERL_EXT_RE_BUILD 33 /* need to replace pregcomp et al, so enable that */ 34 # ifndef PERL_IN_XSUB_RE 35 # define PERL_IN_XSUB_RE 36 # endif 37 /* need access to debugger hooks */ 38 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING) 39 # define DEBUGGING 40 # endif 41 #endif 42 43 #ifdef PERL_IN_XSUB_RE 44 /* We *really* need to overwrite these symbols: */ 45 # define Perl_pregcomp my_regcomp 46 # define Perl_regdump my_regdump 47 # define Perl_regprop my_regprop 48 # define Perl_pregfree my_regfree 49 # define Perl_re_intuit_string my_re_intuit_string 50 /* *These* symbols are masked to allow static link. */ 51 # define Perl_regnext my_regnext 52 # define Perl_save_re_context my_save_re_context 53 # define Perl_reginitcolors my_reginitcolors 54 55 # define PERL_NO_GET_CONTEXT 56 #endif 57 58 /* 59 * pregcomp and pregexec -- regsub and regerror are not used in perl 60 * 61 * Copyright (c) 1986 by University of Toronto. 62 * Written by Henry Spencer. Not derived from licensed software. 63 * 64 * Permission is granted to anyone to use this software for any 65 * purpose on any computer system, and to redistribute it freely, 66 * subject to the following restrictions: 67 * 68 * 1. The author is not responsible for the consequences of use of 69 * this software, no matter how awful, even if they arise 70 * from defects in it. 71 * 72 * 2. The origin of this software must not be misrepresented, either 73 * by explicit claim or by omission. 74 * 75 * 3. Altered versions must be plainly marked as such, and must not 76 * be misrepresented as being the original software. 77 * 78 * 79 **** Alterations to Henry's code are... 80 **** 81 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 82 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others 83 **** 84 **** You may distribute under the terms of either the GNU General Public 85 **** License or the Artistic License, as specified in the README file. 86 87 * 88 * Beware that some of this code is subtly aware of the way operator 89 * precedence is structured in regular expressions. Serious changes in 90 * regular-expression syntax might require a total rethink. 91 */ 92 #include "EXTERN.h" 93 #define PERL_IN_REGCOMP_C 94 #include "perl.h" 95 96 #ifndef PERL_IN_XSUB_RE 97 # include "INTERN.h" 98 #endif 99 100 #define REG_COMP_C 101 #include "regcomp.h" 102 103 #ifdef op 104 #undef op 105 #endif /* op */ 106 107 #ifdef MSDOS 108 # if defined(BUGGY_MSC6) 109 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */ 110 # pragma optimize("a",off) 111 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/ 112 # pragma optimize("w",on ) 113 # endif /* BUGGY_MSC6 */ 114 #endif /* MSDOS */ 115 116 #ifndef STATIC 117 #define STATIC static 118 #endif 119 120 typedef struct RExC_state_t { 121 U32 flags; /* are we folding, multilining? */ 122 char *precomp; /* uncompiled string. */ 123 regexp *rx; 124 char *start; /* Start of input for compile */ 125 char *end; /* End of input for compile */ 126 char *parse; /* Input-scan pointer. */ 127 I32 whilem_seen; /* number of WHILEM in this expr */ 128 regnode *emit_start; /* Start of emitted-code area */ 129 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */ 130 I32 naughty; /* How bad is this pattern? */ 131 I32 sawback; /* Did we see \1, ...? */ 132 U32 seen; 133 I32 size; /* Code size. */ 134 I32 npar; /* () count. */ 135 I32 extralen; 136 I32 seen_zerolen; 137 I32 seen_evals; 138 I32 utf8; 139 I32 orig_utf8; 140 #if ADD_TO_REGEXEC 141 char *starttry; /* -Dr: where regtry was called. */ 142 #define RExC_starttry (pRExC_state->starttry) 143 #endif 144 } RExC_state_t; 145 146 #define RExC_flags (pRExC_state->flags) 147 #define RExC_precomp (pRExC_state->precomp) 148 #define RExC_rx (pRExC_state->rx) 149 #define RExC_start (pRExC_state->start) 150 #define RExC_end (pRExC_state->end) 151 #define RExC_parse (pRExC_state->parse) 152 #define RExC_whilem_seen (pRExC_state->whilem_seen) 153 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */ 154 #define RExC_emit (pRExC_state->emit) 155 #define RExC_emit_start (pRExC_state->emit_start) 156 #define RExC_naughty (pRExC_state->naughty) 157 #define RExC_sawback (pRExC_state->sawback) 158 #define RExC_seen (pRExC_state->seen) 159 #define RExC_size (pRExC_state->size) 160 #define RExC_npar (pRExC_state->npar) 161 #define RExC_extralen (pRExC_state->extralen) 162 #define RExC_seen_zerolen (pRExC_state->seen_zerolen) 163 #define RExC_seen_evals (pRExC_state->seen_evals) 164 #define RExC_utf8 (pRExC_state->utf8) 165 #define RExC_orig_utf8 (pRExC_state->orig_utf8) 166 167 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?') 168 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \ 169 ((*s) == '{' && regcurly(s))) 170 171 #ifdef SPSTART 172 #undef SPSTART /* dratted cpp namespace... */ 173 #endif 174 /* 175 * Flags to be passed up and down. 176 */ 177 #define WORST 0 /* Worst case. */ 178 #define HASWIDTH 0x1 /* Known to match non-null strings. */ 179 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */ 180 #define SPSTART 0x4 /* Starts with * or +. */ 181 #define TRYAGAIN 0x8 /* Weeded out a declaration. */ 182 183 /* Length of a variant. */ 184 185 typedef struct scan_data_t { 186 I32 len_min; 187 I32 len_delta; 188 I32 pos_min; 189 I32 pos_delta; 190 SV *last_found; 191 I32 last_end; /* min value, <0 unless valid. */ 192 I32 last_start_min; 193 I32 last_start_max; 194 SV **longest; /* Either &l_fixed, or &l_float. */ 195 SV *longest_fixed; 196 I32 offset_fixed; 197 SV *longest_float; 198 I32 offset_float_min; 199 I32 offset_float_max; 200 I32 flags; 201 I32 whilem_c; 202 I32 *last_closep; 203 struct regnode_charclass_class *start_class; 204 } scan_data_t; 205 206 /* 207 * Forward declarations for pregcomp()'s friends. 208 */ 209 210 static const scan_data_t zero_scan_data = 211 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 212 213 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL) 214 #define SF_BEFORE_SEOL 0x1 215 #define SF_BEFORE_MEOL 0x2 216 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL) 217 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL) 218 219 #ifdef NO_UNARY_PLUS 220 # define SF_FIX_SHIFT_EOL (0+2) 221 # define SF_FL_SHIFT_EOL (0+4) 222 #else 223 # define SF_FIX_SHIFT_EOL (+2) 224 # define SF_FL_SHIFT_EOL (+4) 225 #endif 226 227 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL) 228 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL) 229 230 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL) 231 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */ 232 #define SF_IS_INF 0x40 233 #define SF_HAS_PAR 0x80 234 #define SF_IN_PAR 0x100 235 #define SF_HAS_EVAL 0x200 236 #define SCF_DO_SUBSTR 0x400 237 #define SCF_DO_STCLASS_AND 0x0800 238 #define SCF_DO_STCLASS_OR 0x1000 239 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR) 240 #define SCF_WHILEM_VISITED_POS 0x2000 241 242 #define UTF (RExC_utf8 != 0) 243 #define LOC ((RExC_flags & PMf_LOCALE) != 0) 244 #define FOLD ((RExC_flags & PMf_FOLD) != 0) 245 246 #define OOB_UNICODE 12345678 247 #define OOB_NAMEDCLASS -1 248 249 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv)) 250 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b) 251 252 253 /* length of regex to show in messages that don't mark a position within */ 254 #define RegexLengthToShowInErrorMessages 127 255 256 /* 257 * If MARKER[12] are adjusted, be sure to adjust the constants at the top 258 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in 259 * op/pragma/warn/regcomp. 260 */ 261 #define MARKER1 "<-- HERE" /* marker as it appears in the description */ 262 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */ 263 264 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/" 265 266 /* 267 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given 268 * arg. Show regex, up to a maximum length. If it's too long, chop and add 269 * "...". 270 */ 271 #define FAIL(msg) STMT_START { \ 272 const char *ellipses = ""; \ 273 IV len = RExC_end - RExC_precomp; \ 274 \ 275 if (!SIZE_ONLY) \ 276 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \ 277 if (len > RegexLengthToShowInErrorMessages) { \ 278 /* chop 10 shorter than the max, to ensure meaning of "..." */ \ 279 len = RegexLengthToShowInErrorMessages - 10; \ 280 ellipses = "..."; \ 281 } \ 282 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \ 283 msg, (int)len, RExC_precomp, ellipses); \ 284 } STMT_END 285 286 /* 287 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given 288 * args. Show regex, up to a maximum length. If it's too long, chop and add 289 * "...". 290 */ 291 #define FAIL2(pat,msg) STMT_START { \ 292 const char *ellipses = ""; \ 293 IV len = RExC_end - RExC_precomp; \ 294 \ 295 if (!SIZE_ONLY) \ 296 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \ 297 if (len > RegexLengthToShowInErrorMessages) { \ 298 /* chop 10 shorter than the max, to ensure meaning of "..." */ \ 299 len = RegexLengthToShowInErrorMessages - 10; \ 300 ellipses = "..."; \ 301 } \ 302 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \ 303 msg, (int)len, RExC_precomp, ellipses); \ 304 } STMT_END 305 306 307 /* 308 * Simple_vFAIL -- like FAIL, but marks the current location in the scan 309 */ 310 #define Simple_vFAIL(m) STMT_START { \ 311 const IV offset = RExC_parse - RExC_precomp; \ 312 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \ 313 m, (int)offset, RExC_precomp, RExC_precomp + offset); \ 314 } STMT_END 315 316 /* 317 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL() 318 */ 319 #define vFAIL(m) STMT_START { \ 320 if (!SIZE_ONLY) \ 321 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \ 322 Simple_vFAIL(m); \ 323 } STMT_END 324 325 /* 326 * Like Simple_vFAIL(), but accepts two arguments. 327 */ 328 #define Simple_vFAIL2(m,a1) STMT_START { \ 329 const IV offset = RExC_parse - RExC_precomp; \ 330 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \ 331 (int)offset, RExC_precomp, RExC_precomp + offset); \ 332 } STMT_END 333 334 /* 335 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2(). 336 */ 337 #define vFAIL2(m,a1) STMT_START { \ 338 if (!SIZE_ONLY) \ 339 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \ 340 Simple_vFAIL2(m, a1); \ 341 } STMT_END 342 343 344 /* 345 * Like Simple_vFAIL(), but accepts three arguments. 346 */ 347 #define Simple_vFAIL3(m, a1, a2) STMT_START { \ 348 const IV offset = RExC_parse - RExC_precomp; \ 349 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \ 350 (int)offset, RExC_precomp, RExC_precomp + offset); \ 351 } STMT_END 352 353 /* 354 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3(). 355 */ 356 #define vFAIL3(m,a1,a2) STMT_START { \ 357 if (!SIZE_ONLY) \ 358 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \ 359 Simple_vFAIL3(m, a1, a2); \ 360 } STMT_END 361 362 /* 363 * Like Simple_vFAIL(), but accepts four arguments. 364 */ 365 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \ 366 const IV offset = RExC_parse - RExC_precomp; \ 367 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \ 368 (int)offset, RExC_precomp, RExC_precomp + offset); \ 369 } STMT_END 370 371 #define vWARN(loc,m) STMT_START { \ 372 const IV offset = loc - RExC_precomp; \ 373 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \ 374 m, (int)offset, RExC_precomp, RExC_precomp + offset); \ 375 } STMT_END 376 377 #define vWARNdep(loc,m) STMT_START { \ 378 const IV offset = loc - RExC_precomp; \ 379 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \ 380 "%s" REPORT_LOCATION, \ 381 m, (int)offset, RExC_precomp, RExC_precomp + offset); \ 382 } STMT_END 383 384 385 #define vWARN2(loc, m, a1) STMT_START { \ 386 const IV offset = loc - RExC_precomp; \ 387 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ 388 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \ 389 } STMT_END 390 391 #define vWARN3(loc, m, a1, a2) STMT_START { \ 392 const IV offset = loc - RExC_precomp; \ 393 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ 394 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \ 395 } STMT_END 396 397 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \ 398 const IV offset = loc - RExC_precomp; \ 399 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ 400 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \ 401 } STMT_END 402 403 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \ 404 const IV offset = loc - RExC_precomp; \ 405 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ 406 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \ 407 } STMT_END 408 409 410 /* Allow for side effects in s */ 411 #define REGC(c,s) STMT_START { \ 412 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \ 413 } STMT_END 414 415 /* Macros for recording node offsets. 20001227 mjd@plover.com 416 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in 417 * element 2*n-1 of the array. Element #2n holds the byte length node #n. 418 * Element 0 holds the number n. 419 */ 420 421 #define MJD_OFFSET_DEBUG(x) 422 /* #define MJD_OFFSET_DEBUG(x) Perl_warn_nocontext x */ 423 424 425 #define Set_Node_Offset_To_R(node,byte) STMT_START { \ 426 if (! SIZE_ONLY) { \ 427 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \ 428 __LINE__, (node), (byte))); \ 429 if((node) < 0) { \ 430 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \ 431 } else { \ 432 RExC_offsets[2*(node)-1] = (byte); \ 433 } \ 434 } \ 435 } STMT_END 436 437 #define Set_Node_Offset(node,byte) \ 438 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start) 439 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse) 440 441 #define Set_Node_Length_To_R(node,len) STMT_START { \ 442 if (! SIZE_ONLY) { \ 443 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \ 444 __LINE__, (int)(node), (int)(len))); \ 445 if((node) < 0) { \ 446 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \ 447 } else { \ 448 RExC_offsets[2*(node)] = (len); \ 449 } \ 450 } \ 451 } STMT_END 452 453 #define Set_Node_Length(node,len) \ 454 Set_Node_Length_To_R((node)-RExC_emit_start, len) 455 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len) 456 #define Set_Node_Cur_Length(node) \ 457 Set_Node_Length(node, RExC_parse - parse_start) 458 459 /* Get offsets and lengths */ 460 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1]) 461 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)]) 462 463 static void clear_re(pTHX_ void *r); 464 465 /* Mark that we cannot extend a found fixed substring at this point. 466 Updata the longest found anchored substring and the longest found 467 floating substrings if needed. */ 468 469 STATIC void 470 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data) 471 { 472 const STRLEN l = CHR_SVLEN(data->last_found); 473 const STRLEN old_l = CHR_SVLEN(*data->longest); 474 475 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) { 476 SvSetMagicSV(*data->longest, data->last_found); 477 if (*data->longest == data->longest_fixed) { 478 data->offset_fixed = l ? data->last_start_min : data->pos_min; 479 if (data->flags & SF_BEFORE_EOL) 480 data->flags 481 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL); 482 else 483 data->flags &= ~SF_FIX_BEFORE_EOL; 484 } 485 else { 486 data->offset_float_min = l ? data->last_start_min : data->pos_min; 487 data->offset_float_max = (l 488 ? data->last_start_max 489 : data->pos_min + data->pos_delta); 490 if ((U32)data->offset_float_max > (U32)I32_MAX) 491 data->offset_float_max = I32_MAX; 492 if (data->flags & SF_BEFORE_EOL) 493 data->flags 494 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL); 495 else 496 data->flags &= ~SF_FL_BEFORE_EOL; 497 } 498 } 499 SvCUR_set(data->last_found, 0); 500 { 501 SV * const sv = data->last_found; 502 MAGIC * const mg = 503 SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL; 504 if (mg && mg->mg_len > 0) 505 mg->mg_len = 0; 506 } 507 data->last_end = -1; 508 data->flags &= ~SF_BEFORE_EOL; 509 } 510 511 /* Can match anything (initialization) */ 512 STATIC void 513 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) 514 { 515 ANYOF_CLASS_ZERO(cl); 516 ANYOF_BITMAP_SETALL(cl); 517 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL; 518 if (LOC) 519 cl->flags |= ANYOF_LOCALE; 520 } 521 522 /* Can match anything (initialization) */ 523 STATIC int 524 S_cl_is_anything(pTHX_ const struct regnode_charclass_class *cl) 525 { 526 int value; 527 528 for (value = 0; value <= ANYOF_MAX; value += 2) 529 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1)) 530 return 1; 531 if (!(cl->flags & ANYOF_UNICODE_ALL)) 532 return 0; 533 if (!ANYOF_BITMAP_TESTALLSET(cl)) 534 return 0; 535 return 1; 536 } 537 538 /* Can match anything (initialization) */ 539 STATIC void 540 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) 541 { 542 Zero(cl, 1, struct regnode_charclass_class); 543 cl->type = ANYOF; 544 cl_anything(pRExC_state, cl); 545 } 546 547 STATIC void 548 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) 549 { 550 Zero(cl, 1, struct regnode_charclass_class); 551 cl->type = ANYOF; 552 cl_anything(pRExC_state, cl); 553 if (LOC) 554 cl->flags |= ANYOF_LOCALE; 555 } 556 557 /* 'And' a given class with another one. Can create false positives */ 558 /* We assume that cl is not inverted */ 559 STATIC void 560 S_cl_and(pTHX_ struct regnode_charclass_class *cl, 561 const struct regnode_charclass_class *and_with) 562 { 563 if (!(and_with->flags & ANYOF_CLASS) 564 && !(cl->flags & ANYOF_CLASS) 565 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) 566 && !(and_with->flags & ANYOF_FOLD) 567 && !(cl->flags & ANYOF_FOLD)) { 568 int i; 569 570 if (and_with->flags & ANYOF_INVERT) 571 for (i = 0; i < ANYOF_BITMAP_SIZE; i++) 572 cl->bitmap[i] &= ~and_with->bitmap[i]; 573 else 574 for (i = 0; i < ANYOF_BITMAP_SIZE; i++) 575 cl->bitmap[i] &= and_with->bitmap[i]; 576 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */ 577 if (!(and_with->flags & ANYOF_EOS)) 578 cl->flags &= ~ANYOF_EOS; 579 580 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE && 581 !(and_with->flags & ANYOF_INVERT)) { 582 cl->flags &= ~ANYOF_UNICODE_ALL; 583 cl->flags |= ANYOF_UNICODE; 584 ARG_SET(cl, ARG(and_with)); 585 } 586 if (!(and_with->flags & ANYOF_UNICODE_ALL) && 587 !(and_with->flags & ANYOF_INVERT)) 588 cl->flags &= ~ANYOF_UNICODE_ALL; 589 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) && 590 !(and_with->flags & ANYOF_INVERT)) 591 cl->flags &= ~ANYOF_UNICODE; 592 } 593 594 /* 'OR' a given class with another one. Can create false positives */ 595 /* We assume that cl is not inverted */ 596 STATIC void 597 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with) 598 { 599 if (or_with->flags & ANYOF_INVERT) { 600 /* We do not use 601 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2)) 602 * <= (B1 | !B2) | (CL1 | !CL2) 603 * which is wasteful if CL2 is small, but we ignore CL2: 604 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1 605 * XXXX Can we handle case-fold? Unclear: 606 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) = 607 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i')) 608 */ 609 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) 610 && !(or_with->flags & ANYOF_FOLD) 611 && !(cl->flags & ANYOF_FOLD) ) { 612 int i; 613 614 for (i = 0; i < ANYOF_BITMAP_SIZE; i++) 615 cl->bitmap[i] |= ~or_with->bitmap[i]; 616 } /* XXXX: logic is complicated otherwise */ 617 else { 618 cl_anything(pRExC_state, cl); 619 } 620 } else { 621 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */ 622 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) 623 && (!(or_with->flags & ANYOF_FOLD) 624 || (cl->flags & ANYOF_FOLD)) ) { 625 int i; 626 627 /* OR char bitmap and class bitmap separately */ 628 for (i = 0; i < ANYOF_BITMAP_SIZE; i++) 629 cl->bitmap[i] |= or_with->bitmap[i]; 630 if (or_with->flags & ANYOF_CLASS) { 631 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++) 632 cl->classflags[i] |= or_with->classflags[i]; 633 cl->flags |= ANYOF_CLASS; 634 } 635 } 636 else { /* XXXX: logic is complicated, leave it along for a moment. */ 637 cl_anything(pRExC_state, cl); 638 } 639 } 640 if (or_with->flags & ANYOF_EOS) 641 cl->flags |= ANYOF_EOS; 642 643 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE && 644 ARG(cl) != ARG(or_with)) { 645 cl->flags |= ANYOF_UNICODE_ALL; 646 cl->flags &= ~ANYOF_UNICODE; 647 } 648 if (or_with->flags & ANYOF_UNICODE_ALL) { 649 cl->flags |= ANYOF_UNICODE_ALL; 650 cl->flags &= ~ANYOF_UNICODE; 651 } 652 } 653 654 /* 655 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2. 656 * These need to be revisited when a newer toolchain becomes available. 657 */ 658 #if defined(__sparc64__) && defined(__GNUC__) 659 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96) 660 # undef SPARC64_GCC_WORKAROUND 661 # define SPARC64_GCC_WORKAROUND 1 662 # endif 663 #endif 664 665 /* REx optimizer. Converts nodes into quickier variants "in place". 666 Finds fixed substrings. */ 667 668 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set 669 to the position after last scanned or to NULL. */ 670 671 STATIC I32 672 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags) 673 /* scanp: Start here (read-write). */ 674 /* deltap: Write maxlen-minlen here. */ 675 /* last: Stop before this one. */ 676 { 677 I32 min = 0, pars = 0, code; 678 regnode *scan = *scanp, *next; 679 I32 delta = 0; 680 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF); 681 int is_inf_internal = 0; /* The studied chunk is infinite */ 682 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0; 683 scan_data_t data_fake; 684 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */ 685 686 while (scan && OP(scan) != END && scan < last) { 687 /* Peephole optimizer: */ 688 689 if (PL_regkind[(U8)OP(scan)] == EXACT) { 690 /* Merge several consecutive EXACTish nodes into one. */ 691 regnode *n = regnext(scan); 692 U32 stringok = 1; 693 #ifdef DEBUGGING 694 regnode *stop = scan; 695 #endif 696 697 next = scan + NODE_SZ_STR(scan); 698 /* Skip NOTHING, merge EXACT*. */ 699 while (n && 700 ( PL_regkind[(U8)OP(n)] == NOTHING || 701 (stringok && (OP(n) == OP(scan)))) 702 && NEXT_OFF(n) 703 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) { 704 if (OP(n) == TAIL || n > next) 705 stringok = 0; 706 if (PL_regkind[(U8)OP(n)] == NOTHING) { 707 NEXT_OFF(scan) += NEXT_OFF(n); 708 next = n + NODE_STEP_REGNODE; 709 #ifdef DEBUGGING 710 if (stringok) 711 stop = n; 712 #endif 713 n = regnext(n); 714 } 715 else if (stringok) { 716 const int oldl = STR_LEN(scan); 717 regnode *nnext = regnext(n); 718 719 if (oldl + STR_LEN(n) > U8_MAX) 720 break; 721 NEXT_OFF(scan) += NEXT_OFF(n); 722 STR_LEN(scan) += STR_LEN(n); 723 next = n + NODE_SZ_STR(n); 724 /* Now we can overwrite *n : */ 725 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char); 726 #ifdef DEBUGGING 727 stop = next - 1; 728 #endif 729 n = nnext; 730 } 731 } 732 733 if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) { 734 /* 735 Two problematic code points in Unicode casefolding of EXACT nodes: 736 737 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS 738 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS 739 740 which casefold to 741 742 Unicode UTF-8 743 744 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81 745 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81 746 747 This means that in case-insensitive matching (or "loose matching", 748 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte 749 length of the above casefolded versions) can match a target string 750 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0). 751 This would rather mess up the minimum length computation. 752 753 What we'll do is to look for the tail four bytes, and then peek 754 at the preceding two bytes to see whether we need to decrease 755 the minimum length by four (six minus two). 756 757 Thanks to the design of UTF-8, there cannot be false matches: 758 A sequence of valid UTF-8 bytes cannot be a subsequence of 759 another valid sequence of UTF-8 bytes. 760 761 */ 762 char *s0 = STRING(scan), *s, *t; 763 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4; 764 const char * const t0 = "\xcc\x88\xcc\x81"; 765 const char * const t1 = t0 + 3; 766 767 for (s = s0 + 2; 768 s < s2 && (t = ninstr(s, s1, t0, t1)); 769 s = t + 4) { 770 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) || 771 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF)) 772 min -= 4; 773 } 774 } 775 776 #ifdef DEBUGGING 777 /* Allow dumping */ 778 n = scan + NODE_SZ_STR(scan); 779 while (n <= stop) { 780 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) { 781 OP(n) = OPTIMIZED; 782 NEXT_OFF(n) = 0; 783 } 784 n++; 785 } 786 #endif 787 } 788 /* Follow the next-chain of the current node and optimize 789 away all the NOTHINGs from it. */ 790 if (OP(scan) != CURLYX) { 791 const int max = (reg_off_by_arg[OP(scan)] 792 ? I32_MAX 793 /* I32 may be smaller than U16 on CRAYs! */ 794 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX)); 795 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan)); 796 int noff; 797 regnode *n = scan; 798 799 /* Skip NOTHING and LONGJMP. */ 800 while ((n = regnext(n)) 801 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n))) 802 || ((OP(n) == LONGJMP) && (noff = ARG(n)))) 803 && off + noff < max) 804 off += noff; 805 if (reg_off_by_arg[OP(scan)]) 806 ARG(scan) = off; 807 else 808 NEXT_OFF(scan) = off; 809 } 810 /* The principal pseudo-switch. Cannot be a switch, since we 811 look into several different things. */ 812 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ 813 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) { 814 next = regnext(scan); 815 code = OP(scan); 816 817 if (OP(next) == code || code == IFTHEN || code == SUSPEND) { 818 I32 max1 = 0, min1 = I32_MAX, num = 0; 819 struct regnode_charclass_class accum; 820 821 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */ 822 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */ 823 if (flags & SCF_DO_STCLASS) 824 cl_init_zero(pRExC_state, &accum); 825 while (OP(scan) == code) { 826 I32 deltanext, minnext, f = 0, fake; 827 struct regnode_charclass_class this_class; 828 829 num++; 830 data_fake.flags = 0; 831 if (data) { 832 data_fake.whilem_c = data->whilem_c; 833 data_fake.last_closep = data->last_closep; 834 } 835 else 836 data_fake.last_closep = &fake; 837 next = regnext(scan); 838 scan = NEXTOPER(scan); 839 if (code != BRANCH) 840 scan = NEXTOPER(scan); 841 if (flags & SCF_DO_STCLASS) { 842 cl_init(pRExC_state, &this_class); 843 data_fake.start_class = &this_class; 844 f = SCF_DO_STCLASS_AND; 845 } 846 if (flags & SCF_WHILEM_VISITED_POS) 847 f |= SCF_WHILEM_VISITED_POS; 848 /* we suppose the run is continuous, last=next...*/ 849 minnext = study_chunk(pRExC_state, &scan, &deltanext, 850 next, &data_fake, f); 851 if (min1 > minnext) 852 min1 = minnext; 853 if (max1 < minnext + deltanext) 854 max1 = minnext + deltanext; 855 if (deltanext == I32_MAX) 856 is_inf = is_inf_internal = 1; 857 scan = next; 858 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) 859 pars++; 860 if (data && (data_fake.flags & SF_HAS_EVAL)) 861 data->flags |= SF_HAS_EVAL; 862 if (data) 863 data->whilem_c = data_fake.whilem_c; 864 if (flags & SCF_DO_STCLASS) 865 cl_or(pRExC_state, &accum, &this_class); 866 if (code == SUSPEND) 867 break; 868 } 869 if (code == IFTHEN && num < 2) /* Empty ELSE branch */ 870 min1 = 0; 871 if (flags & SCF_DO_SUBSTR) { 872 data->pos_min += min1; 873 data->pos_delta += max1 - min1; 874 if (max1 != min1 || is_inf) 875 data->longest = &(data->longest_float); 876 } 877 min += min1; 878 delta += max1 - min1; 879 if (flags & SCF_DO_STCLASS_OR) { 880 cl_or(pRExC_state, data->start_class, &accum); 881 if (min1) { 882 cl_and(data->start_class, &and_with); 883 flags &= ~SCF_DO_STCLASS; 884 } 885 } 886 else if (flags & SCF_DO_STCLASS_AND) { 887 if (min1) { 888 cl_and(data->start_class, &accum); 889 flags &= ~SCF_DO_STCLASS; 890 } 891 else { 892 /* Switch to OR mode: cache the old value of 893 * data->start_class */ 894 StructCopy(data->start_class, &and_with, 895 struct regnode_charclass_class); 896 flags &= ~SCF_DO_STCLASS_AND; 897 StructCopy(&accum, data->start_class, 898 struct regnode_charclass_class); 899 flags |= SCF_DO_STCLASS_OR; 900 data->start_class->flags |= ANYOF_EOS; 901 } 902 } 903 904 } 905 else if (code == BRANCHJ) /* single branch is optimized. */ 906 scan = NEXTOPER(NEXTOPER(scan)); 907 else /* single branch is optimized. */ 908 scan = NEXTOPER(scan); 909 continue; 910 } 911 else if (OP(scan) == EXACT) { 912 I32 l = STR_LEN(scan); 913 UV uc = *((U8*)STRING(scan)); 914 if (UTF) { 915 const U8 * const s = (U8*)STRING(scan); 916 l = utf8_length((U8 *)s, (U8 *)s + l); 917 uc = utf8_to_uvchr((U8 *)s, NULL); 918 } 919 min += l; 920 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */ 921 /* The code below prefers earlier match for fixed 922 offset, later match for variable offset. */ 923 if (data->last_end == -1) { /* Update the start info. */ 924 data->last_start_min = data->pos_min; 925 data->last_start_max = is_inf 926 ? I32_MAX : data->pos_min + data->pos_delta; 927 } 928 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan)); 929 { 930 SV * const sv = data->last_found; 931 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ? 932 mg_find(sv, PERL_MAGIC_utf8) : NULL; 933 if (mg && mg->mg_len >= 0) 934 mg->mg_len += utf8_length((U8*)STRING(scan), 935 (U8*)STRING(scan)+STR_LEN(scan)); 936 } 937 if (UTF) 938 SvUTF8_on(data->last_found); 939 data->last_end = data->pos_min + l; 940 data->pos_min += l; /* As in the first entry. */ 941 data->flags &= ~SF_BEFORE_EOL; 942 } 943 if (flags & SCF_DO_STCLASS_AND) { 944 /* Check whether it is compatible with what we know already! */ 945 int compat = 1; 946 947 if (uc >= 0x100 || 948 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE)) 949 && !ANYOF_BITMAP_TEST(data->start_class, uc) 950 && (!(data->start_class->flags & ANYOF_FOLD) 951 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc]))) 952 ) 953 compat = 0; 954 ANYOF_CLASS_ZERO(data->start_class); 955 ANYOF_BITMAP_ZERO(data->start_class); 956 if (compat) 957 ANYOF_BITMAP_SET(data->start_class, uc); 958 data->start_class->flags &= ~ANYOF_EOS; 959 if (uc < 0x100) 960 data->start_class->flags &= ~ANYOF_UNICODE_ALL; 961 } 962 else if (flags & SCF_DO_STCLASS_OR) { 963 /* false positive possible if the class is case-folded */ 964 if (uc < 0x100) 965 ANYOF_BITMAP_SET(data->start_class, uc); 966 else 967 data->start_class->flags |= ANYOF_UNICODE_ALL; 968 data->start_class->flags &= ~ANYOF_EOS; 969 cl_and(data->start_class, &and_with); 970 } 971 flags &= ~SCF_DO_STCLASS; 972 } 973 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */ 974 I32 l = STR_LEN(scan); 975 UV uc = *((U8*)STRING(scan)); 976 977 /* Search for fixed substrings supports EXACT only. */ 978 if (flags & SCF_DO_SUBSTR) 979 scan_commit(pRExC_state, data); 980 if (UTF) { 981 U8 *s = (U8 *)STRING(scan); 982 l = utf8_length(s, s + l); 983 uc = utf8_to_uvchr(s, NULL); 984 } 985 min += l; 986 if (data && (flags & SCF_DO_SUBSTR)) 987 data->pos_min += l; 988 if (flags & SCF_DO_STCLASS_AND) { 989 /* Check whether it is compatible with what we know already! */ 990 int compat = 1; 991 992 if (uc >= 0x100 || 993 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE)) 994 && !ANYOF_BITMAP_TEST(data->start_class, uc) 995 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc]))) 996 compat = 0; 997 ANYOF_CLASS_ZERO(data->start_class); 998 ANYOF_BITMAP_ZERO(data->start_class); 999 if (compat) { 1000 ANYOF_BITMAP_SET(data->start_class, uc); 1001 data->start_class->flags &= ~ANYOF_EOS; 1002 data->start_class->flags |= ANYOF_FOLD; 1003 if (OP(scan) == EXACTFL) 1004 data->start_class->flags |= ANYOF_LOCALE; 1005 } 1006 } 1007 else if (flags & SCF_DO_STCLASS_OR) { 1008 if (data->start_class->flags & ANYOF_FOLD) { 1009 /* false positive possible if the class is case-folded. 1010 Assume that the locale settings are the same... */ 1011 if (uc < 0x100) 1012 ANYOF_BITMAP_SET(data->start_class, uc); 1013 data->start_class->flags &= ~ANYOF_EOS; 1014 } 1015 cl_and(data->start_class, &and_with); 1016 } 1017 flags &= ~SCF_DO_STCLASS; 1018 } 1019 else if (strchr((const char*)PL_varies,OP(scan))) { 1020 I32 mincount, maxcount, minnext, deltanext, fl = 0; 1021 I32 f = flags, pos_before = 0; 1022 regnode *oscan = scan; 1023 struct regnode_charclass_class this_class; 1024 struct regnode_charclass_class *oclass = NULL; 1025 I32 next_is_eval = 0; 1026 1027 switch (PL_regkind[(U8)OP(scan)]) { 1028 case WHILEM: /* End of (?:...)* . */ 1029 scan = NEXTOPER(scan); 1030 goto finish; 1031 case PLUS: 1032 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) { 1033 next = NEXTOPER(scan); 1034 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) { 1035 mincount = 1; 1036 maxcount = REG_INFTY; 1037 next = regnext(scan); 1038 scan = NEXTOPER(scan); 1039 goto do_curly; 1040 } 1041 } 1042 if (flags & SCF_DO_SUBSTR) 1043 data->pos_min++; 1044 min++; 1045 /* Fall through. */ 1046 case STAR: 1047 if (flags & SCF_DO_STCLASS) { 1048 mincount = 0; 1049 maxcount = REG_INFTY; 1050 next = regnext(scan); 1051 scan = NEXTOPER(scan); 1052 goto do_curly; 1053 } 1054 is_inf = is_inf_internal = 1; 1055 scan = regnext(scan); 1056 if (flags & SCF_DO_SUBSTR) { 1057 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */ 1058 data->longest = &(data->longest_float); 1059 } 1060 goto optimize_curly_tail; 1061 case CURLY: 1062 mincount = ARG1(scan); 1063 maxcount = ARG2(scan); 1064 next = regnext(scan); 1065 if (OP(scan) == CURLYX) { 1066 I32 lp = (data ? *(data->last_closep) : 0); 1067 1068 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX); 1069 } 1070 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS; 1071 next_is_eval = (OP(scan) == EVAL); 1072 do_curly: 1073 if (flags & SCF_DO_SUBSTR) { 1074 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */ 1075 pos_before = data->pos_min; 1076 } 1077 if (data) { 1078 fl = data->flags; 1079 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL); 1080 if (is_inf) 1081 data->flags |= SF_IS_INF; 1082 } 1083 if (flags & SCF_DO_STCLASS) { 1084 cl_init(pRExC_state, &this_class); 1085 oclass = data->start_class; 1086 data->start_class = &this_class; 1087 f |= SCF_DO_STCLASS_AND; 1088 f &= ~SCF_DO_STCLASS_OR; 1089 } 1090 /* These are the cases when once a subexpression 1091 fails at a particular position, it cannot succeed 1092 even after backtracking at the enclosing scope. 1093 1094 XXXX what if minimal match and we are at the 1095 initial run of {n,m}? */ 1096 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY)) 1097 f &= ~SCF_WHILEM_VISITED_POS; 1098 1099 /* This will finish on WHILEM, setting scan, or on NULL: */ 1100 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data, 1101 mincount == 0 1102 ? (f & ~SCF_DO_SUBSTR) : f); 1103 1104 if (flags & SCF_DO_STCLASS) 1105 data->start_class = oclass; 1106 if (mincount == 0 || minnext == 0) { 1107 if (flags & SCF_DO_STCLASS_OR) { 1108 cl_or(pRExC_state, data->start_class, &this_class); 1109 } 1110 else if (flags & SCF_DO_STCLASS_AND) { 1111 /* Switch to OR mode: cache the old value of 1112 * data->start_class */ 1113 StructCopy(data->start_class, &and_with, 1114 struct regnode_charclass_class); 1115 flags &= ~SCF_DO_STCLASS_AND; 1116 StructCopy(&this_class, data->start_class, 1117 struct regnode_charclass_class); 1118 flags |= SCF_DO_STCLASS_OR; 1119 data->start_class->flags |= ANYOF_EOS; 1120 } 1121 } else { /* Non-zero len */ 1122 if (flags & SCF_DO_STCLASS_OR) { 1123 cl_or(pRExC_state, data->start_class, &this_class); 1124 cl_and(data->start_class, &and_with); 1125 } 1126 else if (flags & SCF_DO_STCLASS_AND) 1127 cl_and(data->start_class, &this_class); 1128 flags &= ~SCF_DO_STCLASS; 1129 } 1130 if (!scan) /* It was not CURLYX, but CURLY. */ 1131 scan = next; 1132 if ( /* ? quantifier ok, except for (?{ ... }) */ 1133 (next_is_eval || !(mincount == 0 && maxcount == 1)) 1134 && (minnext == 0) && (deltanext == 0) 1135 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR)) 1136 && maxcount <= REG_INFTY/3 /* Complement check for big count */ 1137 && ckWARN(WARN_REGEXP)) 1138 { 1139 vWARN(RExC_parse, 1140 "Quantifier unexpected on zero-length expression"); 1141 } 1142 1143 min += minnext * mincount; 1144 is_inf_internal |= ((maxcount == REG_INFTY 1145 && (minnext + deltanext) > 0) 1146 || deltanext == I32_MAX); 1147 is_inf |= is_inf_internal; 1148 delta += (minnext + deltanext) * maxcount - minnext * mincount; 1149 1150 /* Try powerful optimization CURLYX => CURLYN. */ 1151 if ( OP(oscan) == CURLYX && data 1152 && data->flags & SF_IN_PAR 1153 && !(data->flags & SF_HAS_EVAL) 1154 && !deltanext && minnext == 1 ) { 1155 /* Try to optimize to CURLYN. */ 1156 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; 1157 regnode *nxt1 = nxt; 1158 #ifdef DEBUGGING 1159 regnode *nxt2; 1160 #endif 1161 1162 /* Skip open. */ 1163 nxt = regnext(nxt); 1164 if (!strchr((const char*)PL_simple,OP(nxt)) 1165 && !(PL_regkind[(U8)OP(nxt)] == EXACT 1166 && STR_LEN(nxt) == 1)) 1167 goto nogo; 1168 #ifdef DEBUGGING 1169 nxt2 = nxt; 1170 #endif 1171 nxt = regnext(nxt); 1172 if (OP(nxt) != CLOSE) 1173 goto nogo; 1174 /* Now we know that nxt2 is the only contents: */ 1175 oscan->flags = (U8)ARG(nxt); 1176 OP(oscan) = CURLYN; 1177 OP(nxt1) = NOTHING; /* was OPEN. */ 1178 #ifdef DEBUGGING 1179 OP(nxt1 + 1) = OPTIMIZED; /* was count. */ 1180 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */ 1181 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */ 1182 OP(nxt) = OPTIMIZED; /* was CLOSE. */ 1183 OP(nxt + 1) = OPTIMIZED; /* was count. */ 1184 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */ 1185 #endif 1186 } 1187 nogo: 1188 1189 /* Try optimization CURLYX => CURLYM. */ 1190 if ( OP(oscan) == CURLYX && data 1191 && !(data->flags & SF_HAS_PAR) 1192 && !(data->flags & SF_HAS_EVAL) 1193 && !deltanext /* atom is fixed width */ 1194 && minnext != 0 /* CURLYM can't handle zero width */ 1195 ) { 1196 /* XXXX How to optimize if data == 0? */ 1197 /* Optimize to a simpler form. */ 1198 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */ 1199 regnode *nxt2; 1200 1201 OP(oscan) = CURLYM; 1202 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/ 1203 && (OP(nxt2) != WHILEM)) 1204 nxt = nxt2; 1205 OP(nxt2) = SUCCEED; /* Whas WHILEM */ 1206 /* Need to optimize away parenths. */ 1207 if (data->flags & SF_IN_PAR) { 1208 /* Set the parenth number. */ 1209 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/ 1210 1211 if (OP(nxt) != CLOSE) 1212 FAIL("Panic opt close"); 1213 oscan->flags = (U8)ARG(nxt); 1214 OP(nxt1) = OPTIMIZED; /* was OPEN. */ 1215 OP(nxt) = OPTIMIZED; /* was CLOSE. */ 1216 #ifdef DEBUGGING 1217 OP(nxt1 + 1) = OPTIMIZED; /* was count. */ 1218 OP(nxt + 1) = OPTIMIZED; /* was count. */ 1219 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */ 1220 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */ 1221 #endif 1222 #if 0 1223 while ( nxt1 && (OP(nxt1) != WHILEM)) { 1224 regnode *nnxt = regnext(nxt1); 1225 1226 if (nnxt == nxt) { 1227 if (reg_off_by_arg[OP(nxt1)]) 1228 ARG_SET(nxt1, nxt2 - nxt1); 1229 else if (nxt2 - nxt1 < U16_MAX) 1230 NEXT_OFF(nxt1) = nxt2 - nxt1; 1231 else 1232 OP(nxt) = NOTHING; /* Cannot beautify */ 1233 } 1234 nxt1 = nnxt; 1235 } 1236 #endif 1237 /* Optimize again: */ 1238 study_chunk(pRExC_state, &nxt1, &deltanext, nxt, 1239 NULL, 0); 1240 } 1241 else 1242 oscan->flags = 0; 1243 } 1244 else if ((OP(oscan) == CURLYX) 1245 && (flags & SCF_WHILEM_VISITED_POS) 1246 /* See the comment on a similar expression above. 1247 However, this time it not a subexpression 1248 we care about, but the expression itself. */ 1249 && (maxcount == REG_INFTY) 1250 && data && ++data->whilem_c < 16) { 1251 /* This stays as CURLYX, we can put the count/of pair. */ 1252 /* Find WHILEM (as in regexec.c) */ 1253 regnode *nxt = oscan + NEXT_OFF(oscan); 1254 1255 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */ 1256 nxt += ARG(nxt); 1257 PREVOPER(nxt)->flags = (U8)(data->whilem_c 1258 | (RExC_whilem_seen << 4)); /* On WHILEM */ 1259 } 1260 if (data && fl & (SF_HAS_PAR|SF_IN_PAR)) 1261 pars++; 1262 if (flags & SCF_DO_SUBSTR) { 1263 SV *last_str = Nullsv; 1264 int counted = mincount != 0; 1265 1266 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */ 1267 #if defined(SPARC64_GCC_WORKAROUND) 1268 I32 b = 0; 1269 STRLEN l = 0; 1270 const char *s = NULL; 1271 I32 old = 0; 1272 1273 if (pos_before >= data->last_start_min) 1274 b = pos_before; 1275 else 1276 b = data->last_start_min; 1277 1278 l = 0; 1279 s = SvPV_const(data->last_found, l); 1280 old = b - data->last_start_min; 1281 1282 #else 1283 I32 b = pos_before >= data->last_start_min 1284 ? pos_before : data->last_start_min; 1285 STRLEN l; 1286 const char *s = SvPV_const(data->last_found, l); 1287 I32 old = b - data->last_start_min; 1288 #endif 1289 1290 if (UTF) 1291 old = utf8_hop((U8*)s, old) - (U8*)s; 1292 1293 l -= old; 1294 /* Get the added string: */ 1295 last_str = newSVpvn(s + old, l); 1296 if (UTF) 1297 SvUTF8_on(last_str); 1298 if (deltanext == 0 && pos_before == b) { 1299 /* What was added is a constant string */ 1300 if (mincount > 1) { 1301 SvGROW(last_str, (mincount * l) + 1); 1302 repeatcpy(SvPVX(last_str) + l, 1303 SvPVX_const(last_str), l, mincount - 1); 1304 SvCUR_set(last_str, SvCUR(last_str) * mincount); 1305 /* Add additional parts. */ 1306 SvCUR_set(data->last_found, 1307 SvCUR(data->last_found) - l); 1308 sv_catsv(data->last_found, last_str); 1309 { 1310 SV * sv = data->last_found; 1311 MAGIC *mg = 1312 SvUTF8(sv) && SvMAGICAL(sv) ? 1313 mg_find(sv, PERL_MAGIC_utf8) : NULL; 1314 if (mg && mg->mg_len >= 0) 1315 mg->mg_len += CHR_SVLEN(last_str); 1316 } 1317 data->last_end += l * (mincount - 1); 1318 } 1319 } else { 1320 /* start offset must point into the last copy */ 1321 data->last_start_min += minnext * (mincount - 1); 1322 data->last_start_max += is_inf ? I32_MAX 1323 : (maxcount - 1) * (minnext + data->pos_delta); 1324 } 1325 } 1326 /* It is counted once already... */ 1327 data->pos_min += minnext * (mincount - counted); 1328 data->pos_delta += - counted * deltanext + 1329 (minnext + deltanext) * maxcount - minnext * mincount; 1330 if (mincount != maxcount) { 1331 /* Cannot extend fixed substrings found inside 1332 the group. */ 1333 scan_commit(pRExC_state,data); 1334 if (mincount && last_str) { 1335 sv_setsv(data->last_found, last_str); 1336 data->last_end = data->pos_min; 1337 data->last_start_min = 1338 data->pos_min - CHR_SVLEN(last_str); 1339 data->last_start_max = is_inf 1340 ? I32_MAX 1341 : data->pos_min + data->pos_delta 1342 - CHR_SVLEN(last_str); 1343 } 1344 data->longest = &(data->longest_float); 1345 } 1346 SvREFCNT_dec(last_str); 1347 } 1348 if (data && (fl & SF_HAS_EVAL)) 1349 data->flags |= SF_HAS_EVAL; 1350 optimize_curly_tail: 1351 if (OP(oscan) != CURLYX) { 1352 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING 1353 && NEXT_OFF(next)) 1354 NEXT_OFF(oscan) += NEXT_OFF(next); 1355 } 1356 continue; 1357 default: /* REF and CLUMP only? */ 1358 if (flags & SCF_DO_SUBSTR) { 1359 scan_commit(pRExC_state,data); /* Cannot expect anything... */ 1360 data->longest = &(data->longest_float); 1361 } 1362 is_inf = is_inf_internal = 1; 1363 if (flags & SCF_DO_STCLASS_OR) 1364 cl_anything(pRExC_state, data->start_class); 1365 flags &= ~SCF_DO_STCLASS; 1366 break; 1367 } 1368 } 1369 else if (strchr((const char*)PL_simple,OP(scan))) { 1370 int value = 0; 1371 1372 if (flags & SCF_DO_SUBSTR) { 1373 scan_commit(pRExC_state,data); 1374 data->pos_min++; 1375 } 1376 min++; 1377 if (flags & SCF_DO_STCLASS) { 1378 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */ 1379 1380 /* Some of the logic below assumes that switching 1381 locale on will only add false positives. */ 1382 switch (PL_regkind[(U8)OP(scan)]) { 1383 case SANY: 1384 default: 1385 do_default: 1386 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */ 1387 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ 1388 cl_anything(pRExC_state, data->start_class); 1389 break; 1390 case REG_ANY: 1391 if (OP(scan) == SANY) 1392 goto do_default; 1393 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */ 1394 value = (ANYOF_BITMAP_TEST(data->start_class,'\n') 1395 || (data->start_class->flags & ANYOF_CLASS)); 1396 cl_anything(pRExC_state, data->start_class); 1397 } 1398 if (flags & SCF_DO_STCLASS_AND || !value) 1399 ANYOF_BITMAP_CLEAR(data->start_class,'\n'); 1400 break; 1401 case ANYOF: 1402 if (flags & SCF_DO_STCLASS_AND) 1403 cl_and(data->start_class, 1404 (struct regnode_charclass_class*)scan); 1405 else 1406 cl_or(pRExC_state, data->start_class, 1407 (struct regnode_charclass_class*)scan); 1408 break; 1409 case ALNUM: 1410 if (flags & SCF_DO_STCLASS_AND) { 1411 if (!(data->start_class->flags & ANYOF_LOCALE)) { 1412 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM); 1413 for (value = 0; value < 256; value++) 1414 if (!isALNUM(value)) 1415 ANYOF_BITMAP_CLEAR(data->start_class, value); 1416 } 1417 } 1418 else { 1419 if (data->start_class->flags & ANYOF_LOCALE) 1420 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM); 1421 else { 1422 for (value = 0; value < 256; value++) 1423 if (isALNUM(value)) 1424 ANYOF_BITMAP_SET(data->start_class, value); 1425 } 1426 } 1427 break; 1428 case ALNUML: 1429 if (flags & SCF_DO_STCLASS_AND) { 1430 if (data->start_class->flags & ANYOF_LOCALE) 1431 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM); 1432 } 1433 else { 1434 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM); 1435 data->start_class->flags |= ANYOF_LOCALE; 1436 } 1437 break; 1438 case NALNUM: 1439 if (flags & SCF_DO_STCLASS_AND) { 1440 if (!(data->start_class->flags & ANYOF_LOCALE)) { 1441 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM); 1442 for (value = 0; value < 256; value++) 1443 if (isALNUM(value)) 1444 ANYOF_BITMAP_CLEAR(data->start_class, value); 1445 } 1446 } 1447 else { 1448 if (data->start_class->flags & ANYOF_LOCALE) 1449 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM); 1450 else { 1451 for (value = 0; value < 256; value++) 1452 if (!isALNUM(value)) 1453 ANYOF_BITMAP_SET(data->start_class, value); 1454 } 1455 } 1456 break; 1457 case NALNUML: 1458 if (flags & SCF_DO_STCLASS_AND) { 1459 if (data->start_class->flags & ANYOF_LOCALE) 1460 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM); 1461 } 1462 else { 1463 data->start_class->flags |= ANYOF_LOCALE; 1464 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM); 1465 } 1466 break; 1467 case SPACE: 1468 if (flags & SCF_DO_STCLASS_AND) { 1469 if (!(data->start_class->flags & ANYOF_LOCALE)) { 1470 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE); 1471 for (value = 0; value < 256; value++) 1472 if (!isSPACE(value)) 1473 ANYOF_BITMAP_CLEAR(data->start_class, value); 1474 } 1475 } 1476 else { 1477 if (data->start_class->flags & ANYOF_LOCALE) 1478 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE); 1479 else { 1480 for (value = 0; value < 256; value++) 1481 if (isSPACE(value)) 1482 ANYOF_BITMAP_SET(data->start_class, value); 1483 } 1484 } 1485 break; 1486 case SPACEL: 1487 if (flags & SCF_DO_STCLASS_AND) { 1488 if (data->start_class->flags & ANYOF_LOCALE) 1489 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE); 1490 } 1491 else { 1492 data->start_class->flags |= ANYOF_LOCALE; 1493 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE); 1494 } 1495 break; 1496 case NSPACE: 1497 if (flags & SCF_DO_STCLASS_AND) { 1498 if (!(data->start_class->flags & ANYOF_LOCALE)) { 1499 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE); 1500 for (value = 0; value < 256; value++) 1501 if (isSPACE(value)) 1502 ANYOF_BITMAP_CLEAR(data->start_class, value); 1503 } 1504 } 1505 else { 1506 if (data->start_class->flags & ANYOF_LOCALE) 1507 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE); 1508 else { 1509 for (value = 0; value < 256; value++) 1510 if (!isSPACE(value)) 1511 ANYOF_BITMAP_SET(data->start_class, value); 1512 } 1513 } 1514 break; 1515 case NSPACEL: 1516 if (flags & SCF_DO_STCLASS_AND) { 1517 if (data->start_class->flags & ANYOF_LOCALE) { 1518 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE); 1519 for (value = 0; value < 256; value++) 1520 if (!isSPACE(value)) 1521 ANYOF_BITMAP_CLEAR(data->start_class, value); 1522 } 1523 } 1524 else { 1525 data->start_class->flags |= ANYOF_LOCALE; 1526 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE); 1527 } 1528 break; 1529 case DIGIT: 1530 if (flags & SCF_DO_STCLASS_AND) { 1531 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT); 1532 for (value = 0; value < 256; value++) 1533 if (!isDIGIT(value)) 1534 ANYOF_BITMAP_CLEAR(data->start_class, value); 1535 } 1536 else { 1537 if (data->start_class->flags & ANYOF_LOCALE) 1538 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT); 1539 else { 1540 for (value = 0; value < 256; value++) 1541 if (isDIGIT(value)) 1542 ANYOF_BITMAP_SET(data->start_class, value); 1543 } 1544 } 1545 break; 1546 case NDIGIT: 1547 if (flags & SCF_DO_STCLASS_AND) { 1548 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT); 1549 for (value = 0; value < 256; value++) 1550 if (isDIGIT(value)) 1551 ANYOF_BITMAP_CLEAR(data->start_class, value); 1552 } 1553 else { 1554 if (data->start_class->flags & ANYOF_LOCALE) 1555 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT); 1556 else { 1557 for (value = 0; value < 256; value++) 1558 if (!isDIGIT(value)) 1559 ANYOF_BITMAP_SET(data->start_class, value); 1560 } 1561 } 1562 break; 1563 } 1564 if (flags & SCF_DO_STCLASS_OR) 1565 cl_and(data->start_class, &and_with); 1566 flags &= ~SCF_DO_STCLASS; 1567 } 1568 } 1569 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) { 1570 data->flags |= (OP(scan) == MEOL 1571 ? SF_BEFORE_MEOL 1572 : SF_BEFORE_SEOL); 1573 } 1574 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ 1575 /* Lookbehind, or need to calculate parens/evals/stclass: */ 1576 && (scan->flags || data || (flags & SCF_DO_STCLASS)) 1577 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) { 1578 /* Lookahead/lookbehind */ 1579 I32 deltanext, minnext, fake = 0; 1580 regnode *nscan; 1581 struct regnode_charclass_class intrnl; 1582 int f = 0; 1583 1584 data_fake.flags = 0; 1585 if (data) { 1586 data_fake.whilem_c = data->whilem_c; 1587 data_fake.last_closep = data->last_closep; 1588 } 1589 else 1590 data_fake.last_closep = &fake; 1591 if ( flags & SCF_DO_STCLASS && !scan->flags 1592 && OP(scan) == IFMATCH ) { /* Lookahead */ 1593 cl_init(pRExC_state, &intrnl); 1594 data_fake.start_class = &intrnl; 1595 f |= SCF_DO_STCLASS_AND; 1596 } 1597 if (flags & SCF_WHILEM_VISITED_POS) 1598 f |= SCF_WHILEM_VISITED_POS; 1599 next = regnext(scan); 1600 nscan = NEXTOPER(NEXTOPER(scan)); 1601 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f); 1602 if (scan->flags) { 1603 if (deltanext) { 1604 vFAIL("Variable length lookbehind not implemented"); 1605 } 1606 else if (minnext > U8_MAX) { 1607 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX); 1608 } 1609 scan->flags = (U8)minnext; 1610 } 1611 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) 1612 pars++; 1613 if (data && (data_fake.flags & SF_HAS_EVAL)) 1614 data->flags |= SF_HAS_EVAL; 1615 if (data) 1616 data->whilem_c = data_fake.whilem_c; 1617 if (f & SCF_DO_STCLASS_AND) { 1618 const int was = (data->start_class->flags & ANYOF_EOS); 1619 1620 cl_and(data->start_class, &intrnl); 1621 if (was) 1622 data->start_class->flags |= ANYOF_EOS; 1623 } 1624 } 1625 else if (OP(scan) == OPEN) { 1626 pars++; 1627 } 1628 else if (OP(scan) == CLOSE) { 1629 if ((I32)ARG(scan) == is_par) { 1630 next = regnext(scan); 1631 1632 if ( next && (OP(next) != WHILEM) && next < last) 1633 is_par = 0; /* Disable optimization */ 1634 } 1635 if (data) 1636 *(data->last_closep) = ARG(scan); 1637 } 1638 else if (OP(scan) == EVAL) { 1639 if (data) 1640 data->flags |= SF_HAS_EVAL; 1641 } 1642 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */ 1643 if (flags & SCF_DO_SUBSTR) { 1644 scan_commit(pRExC_state,data); 1645 data->longest = &(data->longest_float); 1646 } 1647 is_inf = is_inf_internal = 1; 1648 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ 1649 cl_anything(pRExC_state, data->start_class); 1650 flags &= ~SCF_DO_STCLASS; 1651 } 1652 /* Else: zero-length, ignore. */ 1653 scan = regnext(scan); 1654 } 1655 1656 finish: 1657 *scanp = scan; 1658 *deltap = is_inf_internal ? I32_MAX : delta; 1659 if (flags & SCF_DO_SUBSTR && is_inf) 1660 data->pos_delta = I32_MAX - data->pos_min; 1661 if (is_par > U8_MAX) 1662 is_par = 0; 1663 if (is_par && pars==1 && data) { 1664 data->flags |= SF_IN_PAR; 1665 data->flags &= ~SF_HAS_PAR; 1666 } 1667 else if (pars && data) { 1668 data->flags |= SF_HAS_PAR; 1669 data->flags &= ~SF_IN_PAR; 1670 } 1671 if (flags & SCF_DO_STCLASS_OR) 1672 cl_and(data->start_class, &and_with); 1673 return min; 1674 } 1675 1676 STATIC I32 1677 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, const char *s) 1678 { 1679 if (RExC_rx->data) { 1680 Renewc(RExC_rx->data, 1681 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1), 1682 char, struct reg_data); 1683 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8); 1684 RExC_rx->data->count += n; 1685 } 1686 else { 1687 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1), 1688 char, struct reg_data); 1689 Newx(RExC_rx->data->what, n, U8); 1690 RExC_rx->data->count = n; 1691 } 1692 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8); 1693 return RExC_rx->data->count - n; 1694 } 1695 1696 void 1697 Perl_reginitcolors(pTHX) 1698 { 1699 const char * const s = PerlEnv_getenv("PERL_RE_COLORS"); 1700 if (s) { 1701 char *t = savepv(s); 1702 int i = 0; 1703 PL_colors[0] = t; 1704 while (++i < 6) { 1705 t = strchr(t, '\t'); 1706 if (t) { 1707 *t = '\0'; 1708 PL_colors[i] = ++t; 1709 } 1710 else 1711 PL_colors[i] = t = (char *)""; 1712 } 1713 } else { 1714 int i = 0; 1715 while (i < 6) 1716 PL_colors[i++] = (char *)""; 1717 } 1718 PL_colorset = 1; 1719 } 1720 1721 1722 /* 1723 - pregcomp - compile a regular expression into internal code 1724 * 1725 * We can't allocate space until we know how big the compiled form will be, 1726 * but we can't compile it (and thus know how big it is) until we've got a 1727 * place to put the code. So we cheat: we compile it twice, once with code 1728 * generation turned off and size counting turned on, and once "for real". 1729 * This also means that we don't allocate space until we are sure that the 1730 * thing really will compile successfully, and we never have to move the 1731 * code and thus invalidate pointers into it. (Note that it has to be in 1732 * one piece because free() must be able to free it all.) [NB: not true in perl] 1733 * 1734 * Beware that the optimization-preparation code in here knows about some 1735 * of the structure of the compiled regexp. [I'll say.] 1736 */ 1737 regexp * 1738 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) 1739 { 1740 register regexp *r; 1741 regnode *scan; 1742 regnode *first; 1743 I32 flags; 1744 I32 minlen = 0; 1745 I32 sawplus = 0; 1746 I32 sawopen = 0; 1747 scan_data_t data; 1748 RExC_state_t RExC_state; 1749 RExC_state_t *pRExC_state = &RExC_state; 1750 1751 if (exp == NULL) 1752 FAIL("NULL regexp argument"); 1753 1754 RExC_orig_utf8 = RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8; 1755 1756 DEBUG_r({ 1757 if (!PL_colorset) reginitcolors(); 1758 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n", 1759 PL_colors[4],PL_colors[5],PL_colors[0], 1760 (int)(xend - exp), exp, PL_colors[1]); 1761 }); 1762 1763 redo_first_pass: 1764 RExC_precomp = exp; 1765 RExC_flags = pm->op_pmflags; 1766 RExC_sawback = 0; 1767 1768 RExC_seen = 0; 1769 RExC_seen_zerolen = *exp == '^' ? -1 : 0; 1770 RExC_seen_evals = 0; 1771 RExC_extralen = 0; 1772 1773 /* First pass: determine size, legality. */ 1774 RExC_parse = exp; 1775 RExC_start = exp; 1776 RExC_end = xend; 1777 RExC_naughty = 0; 1778 RExC_npar = 1; 1779 RExC_size = 0L; 1780 RExC_emit = &PL_regdummy; 1781 RExC_whilem_seen = 0; 1782 #if 0 /* REGC() is (currently) a NOP at the first pass. 1783 * Clever compilers notice this and complain. --jhi */ 1784 REGC((U8)REG_MAGIC, (char*)RExC_emit); 1785 #endif 1786 if (reg(pRExC_state, 0, &flags) == NULL) { 1787 RExC_precomp = Nullch; 1788 return(NULL); 1789 } 1790 if (RExC_utf8 && !RExC_orig_utf8) { 1791 STRLEN len = xend-exp; 1792 DEBUG_r(PerlIO_printf(Perl_debug_log, 1793 "UTF8 mismatch! Converting to utf8 for resizing and compile\n")); 1794 exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len); 1795 xend = exp + len; 1796 RExC_orig_utf8 = RExC_utf8; 1797 SAVEFREEPV(exp); 1798 goto redo_first_pass; 1799 } 1800 1801 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size)); 1802 1803 /* Small enough for pointer-storage convention? 1804 If extralen==0, this means that we will not need long jumps. */ 1805 if (RExC_size >= 0x10000L && RExC_extralen) 1806 RExC_size += RExC_extralen; 1807 else 1808 RExC_extralen = 0; 1809 if (RExC_whilem_seen > 15) 1810 RExC_whilem_seen = 15; 1811 1812 /* Allocate space and initialize. */ 1813 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), 1814 char, regexp); 1815 if (r == NULL) 1816 FAIL("Regexp out of space"); 1817 1818 #ifdef DEBUGGING 1819 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */ 1820 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char); 1821 #endif 1822 r->refcnt = 1; 1823 r->prelen = xend - exp; 1824 r->precomp = savepvn(RExC_precomp, r->prelen); 1825 r->subbeg = NULL; 1826 r->reganch = pm->op_pmflags & PMf_COMPILETIME; 1827 r->nparens = RExC_npar - 1; /* set early to validate backrefs */ 1828 1829 r->substrs = 0; /* Useful during FAIL. */ 1830 r->startp = 0; /* Useful during FAIL. */ 1831 r->endp = 0; /* Useful during FAIL. */ 1832 1833 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */ 1834 if (r->offsets) { 1835 r->offsets[0] = RExC_size; 1836 } 1837 DEBUG_r(PerlIO_printf(Perl_debug_log, 1838 "%s %"UVuf" bytes for offset annotations.\n", 1839 r->offsets ? "Got" : "Couldn't get", 1840 (UV)((2*RExC_size+1) * sizeof(U32)))); 1841 1842 RExC_rx = r; 1843 1844 /* Second pass: emit code. */ 1845 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */ 1846 RExC_parse = exp; 1847 RExC_end = xend; 1848 RExC_naughty = 0; 1849 RExC_npar = 1; 1850 RExC_emit_start = r->program; 1851 RExC_emit = r->program; 1852 /* Store the count of eval-groups for security checks: */ 1853 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals); 1854 REGC((U8)REG_MAGIC, (char*) RExC_emit++); 1855 r->data = 0; 1856 if (reg(pRExC_state, 0, &flags) == NULL) 1857 return(NULL); 1858 1859 /* Dig out information for optimizations. */ 1860 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */ 1861 pm->op_pmflags = RExC_flags; 1862 if (UTF) 1863 r->reganch |= ROPT_UTF8; /* Unicode in it? */ 1864 r->regstclass = NULL; 1865 if (RExC_naughty >= 10) /* Probably an expensive pattern. */ 1866 r->reganch |= ROPT_NAUGHTY; 1867 scan = r->program + 1; /* First BRANCH. */ 1868 1869 /* XXXX To minimize changes to RE engine we always allocate 1870 3-units-long substrs field. */ 1871 Newxz(r->substrs, 1, struct reg_substr_data); 1872 1873 StructCopy(&zero_scan_data, &data, scan_data_t); 1874 /* XXXX Should not we check for something else? Usually it is OPEN1... */ 1875 if (OP(scan) != BRANCH) { /* Only one top-level choice. */ 1876 I32 fake; 1877 STRLEN longest_float_length, longest_fixed_length; 1878 struct regnode_charclass_class ch_class; 1879 int stclass_flag; 1880 I32 last_close = 0; 1881 1882 first = scan; 1883 /* Skip introductions and multiplicators >= 1. */ 1884 while ((OP(first) == OPEN && (sawopen = 1)) || 1885 /* An OR of *one* alternative - should not happen now. */ 1886 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) || 1887 (OP(first) == PLUS) || 1888 (OP(first) == MINMOD) || 1889 /* An {n,m} with n>0 */ 1890 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) { 1891 if (OP(first) == PLUS) 1892 sawplus = 1; 1893 else 1894 first += regarglen[(U8)OP(first)]; 1895 first = NEXTOPER(first); 1896 } 1897 1898 /* Starting-point info. */ 1899 again: 1900 if (PL_regkind[(U8)OP(first)] == EXACT) { 1901 if (OP(first) == EXACT) 1902 ; /* Empty, get anchored substr later. */ 1903 else if ((OP(first) == EXACTF || OP(first) == EXACTFL)) 1904 r->regstclass = first; 1905 } 1906 else if (strchr((const char*)PL_simple,OP(first))) 1907 r->regstclass = first; 1908 else if (PL_regkind[(U8)OP(first)] == BOUND || 1909 PL_regkind[(U8)OP(first)] == NBOUND) 1910 r->regstclass = first; 1911 else if (PL_regkind[(U8)OP(first)] == BOL) { 1912 r->reganch |= (OP(first) == MBOL 1913 ? ROPT_ANCH_MBOL 1914 : (OP(first) == SBOL 1915 ? ROPT_ANCH_SBOL 1916 : ROPT_ANCH_BOL)); 1917 first = NEXTOPER(first); 1918 goto again; 1919 } 1920 else if (OP(first) == GPOS) { 1921 r->reganch |= ROPT_ANCH_GPOS; 1922 first = NEXTOPER(first); 1923 goto again; 1924 } 1925 else if (!sawopen && (OP(first) == STAR && 1926 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) && 1927 !(r->reganch & ROPT_ANCH) ) 1928 { 1929 /* turn .* into ^.* with an implied $*=1 */ 1930 const int type = 1931 (OP(NEXTOPER(first)) == REG_ANY) 1932 ? ROPT_ANCH_MBOL 1933 : ROPT_ANCH_SBOL; 1934 r->reganch |= type | ROPT_IMPLICIT; 1935 first = NEXTOPER(first); 1936 goto again; 1937 } 1938 if (sawplus && (!sawopen || !RExC_sawback) 1939 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */ 1940 /* x+ must match at the 1st pos of run of x's */ 1941 r->reganch |= ROPT_SKIP; 1942 1943 /* Scan is after the zeroth branch, first is atomic matcher. */ 1944 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n", 1945 (IV)(first - scan + 1))); 1946 /* 1947 * If there's something expensive in the r.e., find the 1948 * longest literal string that must appear and make it the 1949 * regmust. Resolve ties in favor of later strings, since 1950 * the regstart check works with the beginning of the r.e. 1951 * and avoiding duplication strengthens checking. Not a 1952 * strong reason, but sufficient in the absence of others. 1953 * [Now we resolve ties in favor of the earlier string if 1954 * it happens that c_offset_min has been invalidated, since the 1955 * earlier string may buy us something the later one won't.] 1956 */ 1957 minlen = 0; 1958 1959 data.longest_fixed = newSVpvn("",0); 1960 data.longest_float = newSVpvn("",0); 1961 data.last_found = newSVpvn("",0); 1962 data.longest = &(data.longest_fixed); 1963 first = scan; 1964 if (!r->regstclass) { 1965 cl_init(pRExC_state, &ch_class); 1966 data.start_class = &ch_class; 1967 stclass_flag = SCF_DO_STCLASS_AND; 1968 } else /* XXXX Check for BOUND? */ 1969 stclass_flag = 0; 1970 data.last_closep = &last_close; 1971 1972 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */ 1973 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag); 1974 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed) 1975 && data.last_start_min == 0 && data.last_end > 0 1976 && !RExC_seen_zerolen 1977 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS))) 1978 r->reganch |= ROPT_CHECK_ALL; 1979 scan_commit(pRExC_state, &data); 1980 SvREFCNT_dec(data.last_found); 1981 1982 longest_float_length = CHR_SVLEN(data.longest_float); 1983 if (longest_float_length 1984 || (data.flags & SF_FL_BEFORE_EOL 1985 && (!(data.flags & SF_FL_BEFORE_MEOL) 1986 || (RExC_flags & PMf_MULTILINE)))) { 1987 int t; 1988 1989 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */ 1990 && data.offset_fixed == data.offset_float_min 1991 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)) 1992 goto remove_float; /* As in (a)+. */ 1993 1994 if (SvUTF8(data.longest_float)) { 1995 r->float_utf8 = data.longest_float; 1996 r->float_substr = Nullsv; 1997 } else { 1998 r->float_substr = data.longest_float; 1999 r->float_utf8 = Nullsv; 2000 } 2001 r->float_min_offset = data.offset_float_min; 2002 r->float_max_offset = data.offset_float_max; 2003 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */ 2004 && (!(data.flags & SF_FL_BEFORE_MEOL) 2005 || (RExC_flags & PMf_MULTILINE))); 2006 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0); 2007 } 2008 else { 2009 remove_float: 2010 r->float_substr = r->float_utf8 = Nullsv; 2011 SvREFCNT_dec(data.longest_float); 2012 longest_float_length = 0; 2013 } 2014 2015 longest_fixed_length = CHR_SVLEN(data.longest_fixed); 2016 if (longest_fixed_length 2017 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */ 2018 && (!(data.flags & SF_FIX_BEFORE_MEOL) 2019 || (RExC_flags & PMf_MULTILINE)))) { 2020 int t; 2021 2022 if (SvUTF8(data.longest_fixed)) { 2023 r->anchored_utf8 = data.longest_fixed; 2024 r->anchored_substr = Nullsv; 2025 } else { 2026 r->anchored_substr = data.longest_fixed; 2027 r->anchored_utf8 = Nullsv; 2028 } 2029 r->anchored_offset = data.offset_fixed; 2030 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */ 2031 && (!(data.flags & SF_FIX_BEFORE_MEOL) 2032 || (RExC_flags & PMf_MULTILINE))); 2033 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0); 2034 } 2035 else { 2036 r->anchored_substr = r->anchored_utf8 = Nullsv; 2037 SvREFCNT_dec(data.longest_fixed); 2038 longest_fixed_length = 0; 2039 } 2040 if (r->regstclass 2041 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY)) 2042 r->regstclass = NULL; 2043 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset) 2044 && stclass_flag 2045 && !(data.start_class->flags & ANYOF_EOS) 2046 && !cl_is_anything(data.start_class)) 2047 { 2048 const I32 n = add_data(pRExC_state, 1, "f"); 2049 2050 Newx(RExC_rx->data->data[n], 1, 2051 struct regnode_charclass_class); 2052 StructCopy(data.start_class, 2053 (struct regnode_charclass_class*)RExC_rx->data->data[n], 2054 struct regnode_charclass_class); 2055 r->regstclass = (regnode*)RExC_rx->data->data[n]; 2056 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */ 2057 PL_regdata = r->data; /* for regprop() */ 2058 DEBUG_r({ SV *sv = sv_newmortal(); 2059 regprop(sv, (regnode*)data.start_class); 2060 PerlIO_printf(Perl_debug_log, 2061 "synthetic stclass \"%s\".\n", 2062 SvPVX_const(sv));}); 2063 } 2064 2065 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */ 2066 if (longest_fixed_length > longest_float_length) { 2067 r->check_substr = r->anchored_substr; 2068 r->check_utf8 = r->anchored_utf8; 2069 r->check_offset_min = r->check_offset_max = r->anchored_offset; 2070 if (r->reganch & ROPT_ANCH_SINGLE) 2071 r->reganch |= ROPT_NOSCAN; 2072 } 2073 else { 2074 r->check_substr = r->float_substr; 2075 r->check_utf8 = r->float_utf8; 2076 r->check_offset_min = data.offset_float_min; 2077 r->check_offset_max = data.offset_float_max; 2078 } 2079 /* XXXX Currently intuiting is not compatible with ANCH_GPOS. 2080 This should be changed ASAP! */ 2081 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) { 2082 r->reganch |= RE_USE_INTUIT; 2083 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8)) 2084 r->reganch |= RE_INTUIT_TAIL; 2085 } 2086 } 2087 else { 2088 /* Several toplevels. Best we can is to set minlen. */ 2089 I32 fake; 2090 struct regnode_charclass_class ch_class; 2091 I32 last_close = 0; 2092 2093 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n")); 2094 scan = r->program + 1; 2095 cl_init(pRExC_state, &ch_class); 2096 data.start_class = &ch_class; 2097 data.last_closep = &last_close; 2098 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS); 2099 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8 2100 = r->float_substr = r->float_utf8 = Nullsv; 2101 if (!(data.start_class->flags & ANYOF_EOS) 2102 && !cl_is_anything(data.start_class)) 2103 { 2104 const I32 n = add_data(pRExC_state, 1, "f"); 2105 2106 Newx(RExC_rx->data->data[n], 1, 2107 struct regnode_charclass_class); 2108 StructCopy(data.start_class, 2109 (struct regnode_charclass_class*)RExC_rx->data->data[n], 2110 struct regnode_charclass_class); 2111 r->regstclass = (regnode*)RExC_rx->data->data[n]; 2112 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */ 2113 DEBUG_r({ SV* sv = sv_newmortal(); 2114 regprop(sv, (regnode*)data.start_class); 2115 PerlIO_printf(Perl_debug_log, 2116 "synthetic stclass \"%s\".\n", 2117 SvPVX_const(sv));}); 2118 } 2119 } 2120 2121 r->minlen = minlen; 2122 if (RExC_seen & REG_SEEN_GPOS) 2123 r->reganch |= ROPT_GPOS_SEEN; 2124 if (RExC_seen & REG_SEEN_LOOKBEHIND) 2125 r->reganch |= ROPT_LOOKBEHIND_SEEN; 2126 if (RExC_seen & REG_SEEN_EVAL) 2127 r->reganch |= ROPT_EVAL_SEEN; 2128 if (RExC_seen & REG_SEEN_CANY) 2129 r->reganch |= ROPT_CANY_SEEN; 2130 Newxz(r->startp, RExC_npar, I32); 2131 Newxz(r->endp, RExC_npar, I32); 2132 PL_regdata = r->data; /* for regprop() */ 2133 DEBUG_r(regdump(r)); 2134 return(r); 2135 } 2136 2137 /* 2138 - reg - regular expression, i.e. main body or parenthesized thing 2139 * 2140 * Caller must absorb opening parenthesis. 2141 * 2142 * Combining parenthesis handling with the base level of regular expression 2143 * is a trifle forced, but the need to tie the tails of the branches to what 2144 * follows makes it hard to avoid. 2145 */ 2146 STATIC regnode * 2147 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp) 2148 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */ 2149 { 2150 register regnode *ret; /* Will be the head of the group. */ 2151 register regnode *br; 2152 register regnode *lastbr; 2153 register regnode *ender = 0; 2154 register I32 parno = 0; 2155 I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0; 2156 2157 /* for (?g), (?gc), and (?o) warnings; warning 2158 about (?c) will warn about (?g) -- japhy */ 2159 2160 I32 wastedflags = 0x00, 2161 wasted_o = 0x01, 2162 wasted_g = 0x02, 2163 wasted_gc = 0x02 | 0x04, 2164 wasted_c = 0x04; 2165 2166 char * parse_start = RExC_parse; /* MJD */ 2167 char * const oregcomp_parse = RExC_parse; 2168 char c; 2169 2170 *flagp = 0; /* Tentatively. */ 2171 2172 2173 /* Make an OPEN node, if parenthesized. */ 2174 if (paren) { 2175 if (*RExC_parse == '?') { /* (?...) */ 2176 U32 posflags = 0, negflags = 0; 2177 U32 *flagsp = &posflags; 2178 int logical = 0; 2179 const char * const seqstart = RExC_parse; 2180 2181 RExC_parse++; 2182 paren = *RExC_parse++; 2183 ret = NULL; /* For look-ahead/behind. */ 2184 switch (paren) { 2185 case '<': /* (?<...) */ 2186 RExC_seen |= REG_SEEN_LOOKBEHIND; 2187 if (*RExC_parse == '!') 2188 paren = ','; 2189 if (*RExC_parse != '=' && *RExC_parse != '!') 2190 goto unknown; 2191 RExC_parse++; 2192 case '=': /* (?=...) */ 2193 case '!': /* (?!...) */ 2194 RExC_seen_zerolen++; 2195 case ':': /* (?:...) */ 2196 case '>': /* (?>...) */ 2197 break; 2198 case '$': /* (?$...) */ 2199 case '@': /* (?@...) */ 2200 vFAIL2("Sequence (?%c...) not implemented", (int)paren); 2201 break; 2202 case '#': /* (?#...) */ 2203 while (*RExC_parse && *RExC_parse != ')') 2204 RExC_parse++; 2205 if (*RExC_parse != ')') 2206 FAIL("Sequence (?#... not terminated"); 2207 nextchar(pRExC_state); 2208 *flagp = TRYAGAIN; 2209 return NULL; 2210 case 'p': /* (?p...) */ 2211 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP)) 2212 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})"); 2213 /* FALL THROUGH*/ 2214 case '?': /* (??...) */ 2215 logical = 1; 2216 if (*RExC_parse != '{') 2217 goto unknown; 2218 paren = *RExC_parse++; 2219 /* FALL THROUGH */ 2220 case '{': /* (?{...}) */ 2221 { 2222 I32 count = 1, n = 0; 2223 char c; 2224 char *s = RExC_parse; 2225 SV *sv; 2226 OP_4tree *sop, *rop; 2227 2228 RExC_seen_zerolen++; 2229 RExC_seen |= REG_SEEN_EVAL; 2230 while (count && (c = *RExC_parse)) { 2231 if (c == '\\' && RExC_parse[1]) 2232 RExC_parse++; 2233 else if (c == '{') 2234 count++; 2235 else if (c == '}') 2236 count--; 2237 RExC_parse++; 2238 } 2239 if (*RExC_parse != ')') 2240 { 2241 RExC_parse = s; 2242 vFAIL("Sequence (?{...}) not terminated or not {}-balanced"); 2243 } 2244 if (!SIZE_ONLY) { 2245 PAD *pad; 2246 2247 if (RExC_parse - 1 - s) 2248 sv = newSVpvn(s, RExC_parse - 1 - s); 2249 else 2250 sv = newSVpvn("", 0); 2251 2252 ENTER; 2253 Perl_save_re_context(aTHX); 2254 rop = sv_compile_2op(sv, &sop, "re", &pad); 2255 sop->op_private |= OPpREFCOUNTED; 2256 /* re_dup will OpREFCNT_inc */ 2257 OpREFCNT_set(sop, 1); 2258 LEAVE; 2259 2260 n = add_data(pRExC_state, 3, "nop"); 2261 RExC_rx->data->data[n] = (void*)rop; 2262 RExC_rx->data->data[n+1] = (void*)sop; 2263 RExC_rx->data->data[n+2] = (void*)pad; 2264 SvREFCNT_dec(sv); 2265 } 2266 else { /* First pass */ 2267 if (PL_reginterp_cnt < ++RExC_seen_evals 2268 && IN_PERL_RUNTIME) 2269 /* No compiled RE interpolated, has runtime 2270 components ===> unsafe. */ 2271 FAIL("Eval-group not allowed at runtime, use re 'eval'"); 2272 if (PL_tainting && PL_tainted) 2273 FAIL("Eval-group in insecure regular expression"); 2274 } 2275 2276 nextchar(pRExC_state); 2277 if (logical) { 2278 ret = reg_node(pRExC_state, LOGICAL); 2279 if (!SIZE_ONLY) 2280 ret->flags = 2; 2281 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n)); 2282 /* deal with the length of this later - MJD */ 2283 return ret; 2284 } 2285 ret = reganode(pRExC_state, EVAL, n); 2286 Set_Node_Length(ret, RExC_parse - parse_start + 1); 2287 Set_Node_Offset(ret, parse_start); 2288 return ret; 2289 } 2290 case '(': /* (?(?{...})...) and (?(?=...)...) */ 2291 { 2292 if (RExC_parse[0] == '?') { /* (?(?...)) */ 2293 if (RExC_parse[1] == '=' || RExC_parse[1] == '!' 2294 || RExC_parse[1] == '<' 2295 || RExC_parse[1] == '{') { /* Lookahead or eval. */ 2296 I32 flag; 2297 2298 ret = reg_node(pRExC_state, LOGICAL); 2299 if (!SIZE_ONLY) 2300 ret->flags = 1; 2301 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag)); 2302 goto insert_if; 2303 } 2304 } 2305 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) { 2306 /* (?(1)...) */ 2307 parno = atoi(RExC_parse++); 2308 2309 while (isDIGIT(*RExC_parse)) 2310 RExC_parse++; 2311 ret = reganode(pRExC_state, GROUPP, parno); 2312 2313 if ((c = *nextchar(pRExC_state)) != ')') 2314 vFAIL("Switch condition not recognized"); 2315 insert_if: 2316 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0)); 2317 br = regbranch(pRExC_state, &flags, 1); 2318 if (br == NULL) 2319 br = reganode(pRExC_state, LONGJMP, 0); 2320 else 2321 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0)); 2322 c = *nextchar(pRExC_state); 2323 if (flags&HASWIDTH) 2324 *flagp |= HASWIDTH; 2325 if (c == '|') { 2326 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */ 2327 regbranch(pRExC_state, &flags, 1); 2328 regtail(pRExC_state, ret, lastbr); 2329 if (flags&HASWIDTH) 2330 *flagp |= HASWIDTH; 2331 c = *nextchar(pRExC_state); 2332 } 2333 else 2334 lastbr = NULL; 2335 if (c != ')') 2336 vFAIL("Switch (?(condition)... contains too many branches"); 2337 ender = reg_node(pRExC_state, TAIL); 2338 regtail(pRExC_state, br, ender); 2339 if (lastbr) { 2340 regtail(pRExC_state, lastbr, ender); 2341 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); 2342 } 2343 else 2344 regtail(pRExC_state, ret, ender); 2345 return ret; 2346 } 2347 else { 2348 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse); 2349 } 2350 } 2351 case 0: 2352 RExC_parse--; /* for vFAIL to print correctly */ 2353 vFAIL("Sequence (? incomplete"); 2354 break; 2355 default: 2356 --RExC_parse; 2357 parse_flags: /* (?i) */ 2358 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) { 2359 /* (?g), (?gc) and (?o) are useless here 2360 and must be globally applied -- japhy */ 2361 2362 if (*RExC_parse == 'o' || *RExC_parse == 'g') { 2363 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { 2364 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g; 2365 if (! (wastedflags & wflagbit) ) { 2366 wastedflags |= wflagbit; 2367 vWARN5( 2368 RExC_parse + 1, 2369 "Useless (%s%c) - %suse /%c modifier", 2370 flagsp == &negflags ? "?-" : "?", 2371 *RExC_parse, 2372 flagsp == &negflags ? "don't " : "", 2373 *RExC_parse 2374 ); 2375 } 2376 } 2377 } 2378 else if (*RExC_parse == 'c') { 2379 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { 2380 if (! (wastedflags & wasted_c) ) { 2381 wastedflags |= wasted_gc; 2382 vWARN3( 2383 RExC_parse + 1, 2384 "Useless (%sc) - %suse /gc modifier", 2385 flagsp == &negflags ? "?-" : "?", 2386 flagsp == &negflags ? "don't " : "" 2387 ); 2388 } 2389 } 2390 } 2391 else { pmflag(flagsp, *RExC_parse); } 2392 2393 ++RExC_parse; 2394 } 2395 if (*RExC_parse == '-') { 2396 flagsp = &negflags; 2397 wastedflags = 0; /* reset so (?g-c) warns twice */ 2398 ++RExC_parse; 2399 goto parse_flags; 2400 } 2401 RExC_flags |= posflags; 2402 RExC_flags &= ~negflags; 2403 if (*RExC_parse == ':') { 2404 RExC_parse++; 2405 paren = ':'; 2406 break; 2407 } 2408 unknown: 2409 if (*RExC_parse != ')') { 2410 RExC_parse++; 2411 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); 2412 } 2413 nextchar(pRExC_state); 2414 *flagp = TRYAGAIN; 2415 return NULL; 2416 } 2417 } 2418 else { /* (...) */ 2419 parno = RExC_npar; 2420 RExC_npar++; 2421 ret = reganode(pRExC_state, OPEN, parno); 2422 Set_Node_Length(ret, 1); /* MJD */ 2423 Set_Node_Offset(ret, RExC_parse); /* MJD */ 2424 open = 1; 2425 } 2426 } 2427 else /* ! paren */ 2428 ret = NULL; 2429 2430 /* Pick up the branches, linking them together. */ 2431 parse_start = RExC_parse; /* MJD */ 2432 br = regbranch(pRExC_state, &flags, 1); 2433 /* branch_len = (paren != 0); */ 2434 2435 if (br == NULL) 2436 return(NULL); 2437 if (*RExC_parse == '|') { 2438 if (!SIZE_ONLY && RExC_extralen) { 2439 reginsert(pRExC_state, BRANCHJ, br); 2440 } 2441 else { /* MJD */ 2442 reginsert(pRExC_state, BRANCH, br); 2443 Set_Node_Length(br, paren != 0); 2444 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start); 2445 } 2446 have_branch = 1; 2447 if (SIZE_ONLY) 2448 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */ 2449 } 2450 else if (paren == ':') { 2451 *flagp |= flags&SIMPLE; 2452 } 2453 if (open) { /* Starts with OPEN. */ 2454 regtail(pRExC_state, ret, br); /* OPEN -> first. */ 2455 } 2456 else if (paren != '?') /* Not Conditional */ 2457 ret = br; 2458 *flagp |= flags & (SPSTART | HASWIDTH); 2459 lastbr = br; 2460 while (*RExC_parse == '|') { 2461 if (!SIZE_ONLY && RExC_extralen) { 2462 ender = reganode(pRExC_state, LONGJMP,0); 2463 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */ 2464 } 2465 if (SIZE_ONLY) 2466 RExC_extralen += 2; /* Account for LONGJMP. */ 2467 nextchar(pRExC_state); 2468 br = regbranch(pRExC_state, &flags, 0); 2469 2470 if (br == NULL) 2471 return(NULL); 2472 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */ 2473 lastbr = br; 2474 if (flags&HASWIDTH) 2475 *flagp |= HASWIDTH; 2476 *flagp |= flags&SPSTART; 2477 } 2478 2479 if (have_branch || paren != ':') { 2480 /* Make a closing node, and hook it on the end. */ 2481 switch (paren) { 2482 case ':': 2483 ender = reg_node(pRExC_state, TAIL); 2484 break; 2485 case 1: 2486 ender = reganode(pRExC_state, CLOSE, parno); 2487 Set_Node_Offset(ender,RExC_parse+1); /* MJD */ 2488 Set_Node_Length(ender,1); /* MJD */ 2489 break; 2490 case '<': 2491 case ',': 2492 case '=': 2493 case '!': 2494 *flagp &= ~HASWIDTH; 2495 /* FALL THROUGH */ 2496 case '>': 2497 ender = reg_node(pRExC_state, SUCCEED); 2498 break; 2499 case 0: 2500 ender = reg_node(pRExC_state, END); 2501 break; 2502 } 2503 regtail(pRExC_state, lastbr, ender); 2504 2505 if (have_branch) { 2506 /* Hook the tails of the branches to the closing node. */ 2507 for (br = ret; br != NULL; br = regnext(br)) { 2508 regoptail(pRExC_state, br, ender); 2509 } 2510 } 2511 } 2512 2513 { 2514 const char *p; 2515 static const char parens[] = "=!<,>"; 2516 2517 if (paren && (p = strchr(parens, paren))) { 2518 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH; 2519 int flag = (p - parens) > 1; 2520 2521 if (paren == '>') 2522 node = SUSPEND, flag = 0; 2523 reginsert(pRExC_state, node,ret); 2524 Set_Node_Cur_Length(ret); 2525 Set_Node_Offset(ret, parse_start + 1); 2526 ret->flags = flag; 2527 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL)); 2528 } 2529 } 2530 2531 /* Check for proper termination. */ 2532 if (paren) { 2533 RExC_flags = oregflags; 2534 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') { 2535 RExC_parse = oregcomp_parse; 2536 vFAIL("Unmatched ("); 2537 } 2538 } 2539 else if (!paren && RExC_parse < RExC_end) { 2540 if (*RExC_parse == ')') { 2541 RExC_parse++; 2542 vFAIL("Unmatched )"); 2543 } 2544 else 2545 FAIL("Junk on end of regexp"); /* "Can't happen". */ 2546 /* NOTREACHED */ 2547 } 2548 2549 return(ret); 2550 } 2551 2552 /* 2553 - regbranch - one alternative of an | operator 2554 * 2555 * Implements the concatenation operator. 2556 */ 2557 STATIC regnode * 2558 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first) 2559 { 2560 register regnode *ret; 2561 register regnode *chain = NULL; 2562 register regnode *latest; 2563 I32 flags = 0, c = 0; 2564 2565 if (first) 2566 ret = NULL; 2567 else { 2568 if (!SIZE_ONLY && RExC_extralen) 2569 ret = reganode(pRExC_state, BRANCHJ,0); 2570 else { 2571 ret = reg_node(pRExC_state, BRANCH); 2572 Set_Node_Length(ret, 1); 2573 } 2574 } 2575 2576 if (!first && SIZE_ONLY) 2577 RExC_extralen += 1; /* BRANCHJ */ 2578 2579 *flagp = WORST; /* Tentatively. */ 2580 2581 RExC_parse--; 2582 nextchar(pRExC_state); 2583 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') { 2584 flags &= ~TRYAGAIN; 2585 latest = regpiece(pRExC_state, &flags); 2586 if (latest == NULL) { 2587 if (flags & TRYAGAIN) 2588 continue; 2589 return(NULL); 2590 } 2591 else if (ret == NULL) 2592 ret = latest; 2593 *flagp |= flags&HASWIDTH; 2594 if (chain == NULL) /* First piece. */ 2595 *flagp |= flags&SPSTART; 2596 else { 2597 RExC_naughty++; 2598 regtail(pRExC_state, chain, latest); 2599 } 2600 chain = latest; 2601 c++; 2602 } 2603 if (chain == NULL) { /* Loop ran zero times. */ 2604 chain = reg_node(pRExC_state, NOTHING); 2605 if (ret == NULL) 2606 ret = chain; 2607 } 2608 if (c == 1) { 2609 *flagp |= flags&SIMPLE; 2610 } 2611 2612 return(ret); 2613 } 2614 2615 /* 2616 - regpiece - something followed by possible [*+?] 2617 * 2618 * Note that the branching code sequences used for ? and the general cases 2619 * of * and + are somewhat optimized: they use the same NOTHING node as 2620 * both the endmarker for their branch list and the body of the last branch. 2621 * It might seem that this node could be dispensed with entirely, but the 2622 * endmarker role is not redundant. 2623 */ 2624 STATIC regnode * 2625 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp) 2626 { 2627 register regnode *ret; 2628 register char op; 2629 register char *next; 2630 I32 flags; 2631 const char * const origparse = RExC_parse; 2632 char *maxpos; 2633 I32 min; 2634 I32 max = REG_INFTY; 2635 char *parse_start; 2636 2637 ret = regatom(pRExC_state, &flags); 2638 if (ret == NULL) { 2639 if (flags & TRYAGAIN) 2640 *flagp |= TRYAGAIN; 2641 return(NULL); 2642 } 2643 2644 op = *RExC_parse; 2645 2646 if (op == '{' && regcurly(RExC_parse)) { 2647 parse_start = RExC_parse; /* MJD */ 2648 next = RExC_parse + 1; 2649 maxpos = Nullch; 2650 while (isDIGIT(*next) || *next == ',') { 2651 if (*next == ',') { 2652 if (maxpos) 2653 break; 2654 else 2655 maxpos = next; 2656 } 2657 next++; 2658 } 2659 if (*next == '}') { /* got one */ 2660 if (!maxpos) 2661 maxpos = next; 2662 RExC_parse++; 2663 min = atoi(RExC_parse); 2664 if (*maxpos == ',') 2665 maxpos++; 2666 else 2667 maxpos = RExC_parse; 2668 max = atoi(maxpos); 2669 if (!max && *maxpos != '0') 2670 max = REG_INFTY; /* meaning "infinity" */ 2671 else if (max >= REG_INFTY) 2672 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1); 2673 RExC_parse = next; 2674 nextchar(pRExC_state); 2675 2676 do_curly: 2677 if ((flags&SIMPLE)) { 2678 RExC_naughty += 2 + RExC_naughty / 2; 2679 reginsert(pRExC_state, CURLY, ret); 2680 Set_Node_Offset(ret, parse_start+1); /* MJD */ 2681 Set_Node_Cur_Length(ret); 2682 } 2683 else { 2684 regnode *w = reg_node(pRExC_state, WHILEM); 2685 2686 w->flags = 0; 2687 regtail(pRExC_state, ret, w); 2688 if (!SIZE_ONLY && RExC_extralen) { 2689 reginsert(pRExC_state, LONGJMP,ret); 2690 reginsert(pRExC_state, NOTHING,ret); 2691 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */ 2692 } 2693 reginsert(pRExC_state, CURLYX,ret); 2694 /* MJD hk */ 2695 Set_Node_Offset(ret, parse_start+1); 2696 Set_Node_Length(ret, 2697 op == '{' ? (RExC_parse - parse_start) : 1); 2698 2699 if (!SIZE_ONLY && RExC_extralen) 2700 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */ 2701 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING)); 2702 if (SIZE_ONLY) 2703 RExC_whilem_seen++, RExC_extralen += 3; 2704 RExC_naughty += 4 + RExC_naughty; /* compound interest */ 2705 } 2706 ret->flags = 0; 2707 2708 if (min > 0) 2709 *flagp = WORST; 2710 if (max > 0) 2711 *flagp |= HASWIDTH; 2712 if (max && max < min) 2713 vFAIL("Can't do {n,m} with n > m"); 2714 if (!SIZE_ONLY) { 2715 ARG1_SET(ret, (U16)min); 2716 ARG2_SET(ret, (U16)max); 2717 } 2718 2719 goto nest_check; 2720 } 2721 } 2722 2723 if (!ISMULT1(op)) { 2724 *flagp = flags; 2725 return(ret); 2726 } 2727 2728 #if 0 /* Now runtime fix should be reliable. */ 2729 2730 /* if this is reinstated, don't forget to put this back into perldiag: 2731 2732 =item Regexp *+ operand could be empty at {#} in regex m/%s/ 2733 2734 (F) The part of the regexp subject to either the * or + quantifier 2735 could match an empty string. The {#} shows in the regular 2736 expression about where the problem was discovered. 2737 2738 */ 2739 2740 if (!(flags&HASWIDTH) && op != '?') 2741 vFAIL("Regexp *+ operand could be empty"); 2742 #endif 2743 2744 parse_start = RExC_parse; 2745 nextchar(pRExC_state); 2746 2747 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH); 2748 2749 if (op == '*' && (flags&SIMPLE)) { 2750 reginsert(pRExC_state, STAR, ret); 2751 ret->flags = 0; 2752 RExC_naughty += 4; 2753 } 2754 else if (op == '*') { 2755 min = 0; 2756 goto do_curly; 2757 } 2758 else if (op == '+' && (flags&SIMPLE)) { 2759 reginsert(pRExC_state, PLUS, ret); 2760 ret->flags = 0; 2761 RExC_naughty += 3; 2762 } 2763 else if (op == '+') { 2764 min = 1; 2765 goto do_curly; 2766 } 2767 else if (op == '?') { 2768 min = 0; max = 1; 2769 goto do_curly; 2770 } 2771 nest_check: 2772 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) { 2773 vWARN3(RExC_parse, 2774 "%.*s matches null string many times", 2775 RExC_parse - origparse, 2776 origparse); 2777 } 2778 2779 if (*RExC_parse == '?') { 2780 nextchar(pRExC_state); 2781 reginsert(pRExC_state, MINMOD, ret); 2782 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE); 2783 } 2784 if (ISMULT2(RExC_parse)) { 2785 RExC_parse++; 2786 vFAIL("Nested quantifiers"); 2787 } 2788 2789 return(ret); 2790 } 2791 2792 /* 2793 - regatom - the lowest level 2794 * 2795 * Optimization: gobbles an entire sequence of ordinary characters so that 2796 * it can turn them into a single node, which is smaller to store and 2797 * faster to run. Backslashed characters are exceptions, each becoming a 2798 * separate node; the code is simpler that way and it's not worth fixing. 2799 * 2800 * [Yes, it is worth fixing, some scripts can run twice the speed.] */ 2801 STATIC regnode * 2802 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp) 2803 { 2804 register regnode *ret = 0; 2805 I32 flags; 2806 char *parse_start = RExC_parse; 2807 2808 *flagp = WORST; /* Tentatively. */ 2809 2810 tryagain: 2811 switch (*RExC_parse) { 2812 case '^': 2813 RExC_seen_zerolen++; 2814 nextchar(pRExC_state); 2815 if (RExC_flags & PMf_MULTILINE) 2816 ret = reg_node(pRExC_state, MBOL); 2817 else if (RExC_flags & PMf_SINGLELINE) 2818 ret = reg_node(pRExC_state, SBOL); 2819 else 2820 ret = reg_node(pRExC_state, BOL); 2821 Set_Node_Length(ret, 1); /* MJD */ 2822 break; 2823 case '$': 2824 nextchar(pRExC_state); 2825 if (*RExC_parse) 2826 RExC_seen_zerolen++; 2827 if (RExC_flags & PMf_MULTILINE) 2828 ret = reg_node(pRExC_state, MEOL); 2829 else if (RExC_flags & PMf_SINGLELINE) 2830 ret = reg_node(pRExC_state, SEOL); 2831 else 2832 ret = reg_node(pRExC_state, EOL); 2833 Set_Node_Length(ret, 1); /* MJD */ 2834 break; 2835 case '.': 2836 nextchar(pRExC_state); 2837 if (RExC_flags & PMf_SINGLELINE) 2838 ret = reg_node(pRExC_state, SANY); 2839 else 2840 ret = reg_node(pRExC_state, REG_ANY); 2841 *flagp |= HASWIDTH|SIMPLE; 2842 RExC_naughty++; 2843 Set_Node_Length(ret, 1); /* MJD */ 2844 break; 2845 case '[': 2846 { 2847 char *oregcomp_parse = ++RExC_parse; 2848 ret = regclass(pRExC_state); 2849 if (*RExC_parse != ']') { 2850 RExC_parse = oregcomp_parse; 2851 vFAIL("Unmatched ["); 2852 } 2853 nextchar(pRExC_state); 2854 *flagp |= HASWIDTH|SIMPLE; 2855 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */ 2856 break; 2857 } 2858 case '(': 2859 nextchar(pRExC_state); 2860 ret = reg(pRExC_state, 1, &flags); 2861 if (ret == NULL) { 2862 if (flags & TRYAGAIN) { 2863 if (RExC_parse == RExC_end) { 2864 /* Make parent create an empty node if needed. */ 2865 *flagp |= TRYAGAIN; 2866 return(NULL); 2867 } 2868 goto tryagain; 2869 } 2870 return(NULL); 2871 } 2872 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE); 2873 break; 2874 case '|': 2875 case ')': 2876 if (flags & TRYAGAIN) { 2877 *flagp |= TRYAGAIN; 2878 return NULL; 2879 } 2880 vFAIL("Internal urp"); 2881 /* Supposed to be caught earlier. */ 2882 break; 2883 case '{': 2884 if (!regcurly(RExC_parse)) { 2885 RExC_parse++; 2886 goto defchar; 2887 } 2888 /* FALL THROUGH */ 2889 case '?': 2890 case '+': 2891 case '*': 2892 RExC_parse++; 2893 vFAIL("Quantifier follows nothing"); 2894 break; 2895 case '\\': 2896 switch (*++RExC_parse) { 2897 case 'A': 2898 RExC_seen_zerolen++; 2899 ret = reg_node(pRExC_state, SBOL); 2900 *flagp |= SIMPLE; 2901 nextchar(pRExC_state); 2902 Set_Node_Length(ret, 2); /* MJD */ 2903 break; 2904 case 'G': 2905 ret = reg_node(pRExC_state, GPOS); 2906 RExC_seen |= REG_SEEN_GPOS; 2907 *flagp |= SIMPLE; 2908 nextchar(pRExC_state); 2909 Set_Node_Length(ret, 2); /* MJD */ 2910 break; 2911 case 'Z': 2912 ret = reg_node(pRExC_state, SEOL); 2913 *flagp |= SIMPLE; 2914 RExC_seen_zerolen++; /* Do not optimize RE away */ 2915 nextchar(pRExC_state); 2916 break; 2917 case 'z': 2918 ret = reg_node(pRExC_state, EOS); 2919 *flagp |= SIMPLE; 2920 RExC_seen_zerolen++; /* Do not optimize RE away */ 2921 nextchar(pRExC_state); 2922 Set_Node_Length(ret, 2); /* MJD */ 2923 break; 2924 case 'C': 2925 ret = reg_node(pRExC_state, CANY); 2926 RExC_seen |= REG_SEEN_CANY; 2927 *flagp |= HASWIDTH|SIMPLE; 2928 nextchar(pRExC_state); 2929 Set_Node_Length(ret, 2); /* MJD */ 2930 break; 2931 case 'X': 2932 ret = reg_node(pRExC_state, CLUMP); 2933 *flagp |= HASWIDTH; 2934 nextchar(pRExC_state); 2935 Set_Node_Length(ret, 2); /* MJD */ 2936 break; 2937 case 'w': 2938 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM)); 2939 *flagp |= HASWIDTH|SIMPLE; 2940 nextchar(pRExC_state); 2941 Set_Node_Length(ret, 2); /* MJD */ 2942 break; 2943 case 'W': 2944 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM)); 2945 *flagp |= HASWIDTH|SIMPLE; 2946 nextchar(pRExC_state); 2947 Set_Node_Length(ret, 2); /* MJD */ 2948 break; 2949 case 'b': 2950 RExC_seen_zerolen++; 2951 RExC_seen |= REG_SEEN_LOOKBEHIND; 2952 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND)); 2953 *flagp |= SIMPLE; 2954 nextchar(pRExC_state); 2955 Set_Node_Length(ret, 2); /* MJD */ 2956 break; 2957 case 'B': 2958 RExC_seen_zerolen++; 2959 RExC_seen |= REG_SEEN_LOOKBEHIND; 2960 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND)); 2961 *flagp |= SIMPLE; 2962 nextchar(pRExC_state); 2963 Set_Node_Length(ret, 2); /* MJD */ 2964 break; 2965 case 's': 2966 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE)); 2967 *flagp |= HASWIDTH|SIMPLE; 2968 nextchar(pRExC_state); 2969 Set_Node_Length(ret, 2); /* MJD */ 2970 break; 2971 case 'S': 2972 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE)); 2973 *flagp |= HASWIDTH|SIMPLE; 2974 nextchar(pRExC_state); 2975 Set_Node_Length(ret, 2); /* MJD */ 2976 break; 2977 case 'd': 2978 ret = reg_node(pRExC_state, DIGIT); 2979 *flagp |= HASWIDTH|SIMPLE; 2980 nextchar(pRExC_state); 2981 Set_Node_Length(ret, 2); /* MJD */ 2982 break; 2983 case 'D': 2984 ret = reg_node(pRExC_state, NDIGIT); 2985 *flagp |= HASWIDTH|SIMPLE; 2986 nextchar(pRExC_state); 2987 Set_Node_Length(ret, 2); /* MJD */ 2988 break; 2989 case 'p': 2990 case 'P': 2991 { 2992 char* oldregxend = RExC_end; 2993 char* parse_start = RExC_parse - 2; 2994 2995 if (RExC_parse[1] == '{') { 2996 /* a lovely hack--pretend we saw [\pX] instead */ 2997 RExC_end = strchr(RExC_parse, '}'); 2998 if (!RExC_end) { 2999 U8 c = (U8)*RExC_parse; 3000 RExC_parse += 2; 3001 RExC_end = oldregxend; 3002 vFAIL2("Missing right brace on \\%c{}", c); 3003 } 3004 RExC_end++; 3005 } 3006 else { 3007 RExC_end = RExC_parse + 2; 3008 if (RExC_end > oldregxend) 3009 RExC_end = oldregxend; 3010 } 3011 RExC_parse--; 3012 3013 ret = regclass(pRExC_state); 3014 3015 RExC_end = oldregxend; 3016 RExC_parse--; 3017 3018 Set_Node_Offset(ret, parse_start + 2); 3019 Set_Node_Cur_Length(ret); 3020 nextchar(pRExC_state); 3021 *flagp |= HASWIDTH|SIMPLE; 3022 } 3023 break; 3024 case 'n': 3025 case 'r': 3026 case 't': 3027 case 'f': 3028 case 'e': 3029 case 'a': 3030 case 'x': 3031 case 'c': 3032 case '0': 3033 goto defchar; 3034 case '1': case '2': case '3': case '4': 3035 case '5': case '6': case '7': case '8': case '9': 3036 { 3037 const I32 num = atoi(RExC_parse); 3038 3039 if (num > 9 && num >= RExC_npar) 3040 goto defchar; 3041 else { 3042 char * parse_start = RExC_parse - 1; /* MJD */ 3043 while (isDIGIT(*RExC_parse)) 3044 RExC_parse++; 3045 3046 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens) 3047 vFAIL("Reference to nonexistent group"); 3048 RExC_sawback = 1; 3049 ret = reganode(pRExC_state, 3050 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF), 3051 num); 3052 *flagp |= HASWIDTH; 3053 3054 /* override incorrect value set in reganode MJD */ 3055 Set_Node_Offset(ret, parse_start+1); 3056 Set_Node_Cur_Length(ret); /* MJD */ 3057 RExC_parse--; 3058 nextchar(pRExC_state); 3059 } 3060 } 3061 break; 3062 case '\0': 3063 if (RExC_parse >= RExC_end) 3064 FAIL("Trailing \\"); 3065 /* FALL THROUGH */ 3066 default: 3067 /* Do not generate "unrecognized" warnings here, we fall 3068 back into the quick-grab loop below */ 3069 parse_start--; 3070 goto defchar; 3071 } 3072 break; 3073 3074 case '#': 3075 if (RExC_flags & PMf_EXTENDED) { 3076 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++; 3077 if (RExC_parse < RExC_end) 3078 goto tryagain; 3079 } 3080 /* FALL THROUGH */ 3081 3082 default: { 3083 register STRLEN len; 3084 register UV ender; 3085 register char *p; 3086 char *oldp, *s; 3087 STRLEN foldlen; 3088 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf; 3089 3090 parse_start = RExC_parse - 1; 3091 3092 RExC_parse++; 3093 3094 defchar: 3095 ender = 0; 3096 ret = reg_node(pRExC_state, 3097 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT)); 3098 s = STRING(ret); 3099 for (len = 0, p = RExC_parse - 1; 3100 len < 127 && p < RExC_end; 3101 len++) 3102 { 3103 oldp = p; 3104 3105 if (RExC_flags & PMf_EXTENDED) 3106 p = regwhite(p, RExC_end); 3107 switch (*p) { 3108 case '^': 3109 case '$': 3110 case '.': 3111 case '[': 3112 case '(': 3113 case ')': 3114 case '|': 3115 goto loopdone; 3116 case '\\': 3117 switch (*++p) { 3118 case 'A': 3119 case 'C': 3120 case 'X': 3121 case 'G': 3122 case 'Z': 3123 case 'z': 3124 case 'w': 3125 case 'W': 3126 case 'b': 3127 case 'B': 3128 case 's': 3129 case 'S': 3130 case 'd': 3131 case 'D': 3132 case 'p': 3133 case 'P': 3134 --p; 3135 goto loopdone; 3136 case 'n': 3137 ender = '\n'; 3138 p++; 3139 break; 3140 case 'r': 3141 ender = '\r'; 3142 p++; 3143 break; 3144 case 't': 3145 ender = '\t'; 3146 p++; 3147 break; 3148 case 'f': 3149 ender = '\f'; 3150 p++; 3151 break; 3152 case 'e': 3153 ender = ASCII_TO_NATIVE('\033'); 3154 p++; 3155 break; 3156 case 'a': 3157 ender = ASCII_TO_NATIVE('\007'); 3158 p++; 3159 break; 3160 case 'x': 3161 if (*++p == '{') { 3162 char* const e = strchr(p, '}'); 3163 3164 if (!e) { 3165 RExC_parse = p + 1; 3166 vFAIL("Missing right brace on \\x{}"); 3167 } 3168 else { 3169 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES 3170 | PERL_SCAN_DISALLOW_PREFIX; 3171 STRLEN numlen = e - p - 1; 3172 ender = grok_hex(p + 1, &numlen, &flags, NULL); 3173 if (ender > 0xff) 3174 RExC_utf8 = 1; 3175 p = e + 1; 3176 } 3177 } 3178 else { 3179 I32 flags = PERL_SCAN_DISALLOW_PREFIX; 3180 STRLEN numlen = 2; 3181 ender = grok_hex(p, &numlen, &flags, NULL); 3182 p += numlen; 3183 } 3184 break; 3185 case 'c': 3186 p++; 3187 ender = UCHARAT(p++); 3188 ender = toCTRL(ender); 3189 break; 3190 case '0': case '1': case '2': case '3':case '4': 3191 case '5': case '6': case '7': case '8':case '9': 3192 if (*p == '0' || 3193 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) { 3194 I32 flags = 0; 3195 STRLEN numlen = 3; 3196 ender = grok_oct(p, &numlen, &flags, NULL); 3197 p += numlen; 3198 } 3199 else { 3200 --p; 3201 goto loopdone; 3202 } 3203 break; 3204 case '\0': 3205 if (p >= RExC_end) 3206 FAIL("Trailing \\"); 3207 /* FALL THROUGH */ 3208 default: 3209 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP)) 3210 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p)); 3211 goto normal_default; 3212 } 3213 break; 3214 default: 3215 normal_default: 3216 if (UTF8_IS_START(*p) && UTF) { 3217 STRLEN numlen; 3218 ender = utf8n_to_uvchr((U8*)p, RExC_end - p, 3219 &numlen, 0); 3220 p += numlen; 3221 } 3222 else 3223 ender = *p++; 3224 break; 3225 } 3226 if (RExC_flags & PMf_EXTENDED) 3227 p = regwhite(p, RExC_end); 3228 if (UTF && FOLD) { 3229 /* Prime the casefolded buffer. */ 3230 ender = toFOLD_uni(ender, tmpbuf, &foldlen); 3231 } 3232 if (ISMULT2(p)) { /* Back off on ?+*. */ 3233 if (len) 3234 p = oldp; 3235 else if (UTF) { 3236 STRLEN unilen; 3237 3238 if (FOLD) { 3239 /* Emit all the Unicode characters. */ 3240 STRLEN numlen; 3241 for (foldbuf = tmpbuf; 3242 foldlen; 3243 foldlen -= numlen) { 3244 ender = utf8_to_uvchr(foldbuf, &numlen); 3245 if (numlen > 0) { 3246 reguni(pRExC_state, ender, s, &unilen); 3247 s += unilen; 3248 len += unilen; 3249 /* In EBCDIC the numlen 3250 * and unilen can differ. */ 3251 foldbuf += numlen; 3252 if (numlen >= foldlen) 3253 break; 3254 } 3255 else 3256 break; /* "Can't happen." */ 3257 } 3258 } 3259 else { 3260 reguni(pRExC_state, ender, s, &unilen); 3261 if (unilen > 0) { 3262 s += unilen; 3263 len += unilen; 3264 } 3265 } 3266 } 3267 else { 3268 len++; 3269 REGC((char)ender, s++); 3270 } 3271 break; 3272 } 3273 if (UTF) { 3274 STRLEN unilen; 3275 3276 if (FOLD) { 3277 /* Emit all the Unicode characters. */ 3278 STRLEN numlen; 3279 for (foldbuf = tmpbuf; 3280 foldlen; 3281 foldlen -= numlen) { 3282 ender = utf8_to_uvchr(foldbuf, &numlen); 3283 if (numlen > 0) { 3284 reguni(pRExC_state, ender, s, &unilen); 3285 len += unilen; 3286 s += unilen; 3287 /* In EBCDIC the numlen 3288 * and unilen can differ. */ 3289 foldbuf += numlen; 3290 if (numlen >= foldlen) 3291 break; 3292 } 3293 else 3294 break; 3295 } 3296 } 3297 else { 3298 reguni(pRExC_state, ender, s, &unilen); 3299 if (unilen > 0) { 3300 s += unilen; 3301 len += unilen; 3302 } 3303 } 3304 len--; 3305 } 3306 else 3307 REGC((char)ender, s++); 3308 } 3309 loopdone: 3310 RExC_parse = p - 1; 3311 Set_Node_Cur_Length(ret); /* MJD */ 3312 nextchar(pRExC_state); 3313 { 3314 /* len is STRLEN which is unsigned, need to copy to signed */ 3315 IV iv = len; 3316 if (iv < 0) 3317 vFAIL("Internal disaster"); 3318 } 3319 if (len > 0) 3320 *flagp |= HASWIDTH; 3321 if (len == 1 && UNI_IS_INVARIANT(ender)) 3322 *flagp |= SIMPLE; 3323 if (!SIZE_ONLY) 3324 STR_LEN(ret) = len; 3325 if (SIZE_ONLY) 3326 RExC_size += STR_SZ(len); 3327 else 3328 RExC_emit += STR_SZ(len); 3329 } 3330 break; 3331 } 3332 3333 /* If the encoding pragma is in effect recode the text of 3334 * any EXACT-kind nodes. */ 3335 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) { 3336 STRLEN oldlen = STR_LEN(ret); 3337 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen)); 3338 3339 if (RExC_utf8) 3340 SvUTF8_on(sv); 3341 if (sv_utf8_downgrade(sv, TRUE)) { 3342 const char * const s = sv_recode_to_utf8(sv, PL_encoding); 3343 const STRLEN newlen = SvCUR(sv); 3344 3345 if (SvUTF8(sv)) 3346 RExC_utf8 = 1; 3347 if (!SIZE_ONLY) { 3348 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n", 3349 (int)oldlen, STRING(ret), 3350 (int)newlen, s)); 3351 Copy(s, STRING(ret), newlen, char); 3352 STR_LEN(ret) += newlen - oldlen; 3353 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen); 3354 } else 3355 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen); 3356 } 3357 } 3358 3359 return(ret); 3360 } 3361 3362 STATIC char * 3363 S_regwhite(pTHX_ char *p, const char *e) 3364 { 3365 while (p < e) { 3366 if (isSPACE(*p)) 3367 ++p; 3368 else if (*p == '#') { 3369 do { 3370 p++; 3371 } while (p < e && *p != '\n'); 3372 } 3373 else 3374 break; 3375 } 3376 return p; 3377 } 3378 3379 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]]. 3380 Character classes ([:foo:]) can also be negated ([:^foo:]). 3381 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise. 3382 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed, 3383 but trigger failures because they are currently unimplemented. */ 3384 3385 #define POSIXCC_DONE(c) ((c) == ':') 3386 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.') 3387 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c)) 3388 3389 STATIC I32 3390 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value) 3391 { 3392 I32 namedclass = OOB_NAMEDCLASS; 3393 3394 if (value == '[' && RExC_parse + 1 < RExC_end && 3395 /* I smell either [: or [= or [. -- POSIX has been here, right? */ 3396 POSIXCC(UCHARAT(RExC_parse))) { 3397 const char c = UCHARAT(RExC_parse); 3398 char* s = RExC_parse++; 3399 3400 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c) 3401 RExC_parse++; 3402 if (RExC_parse == RExC_end) 3403 /* Grandfather lone [:, [=, [. */ 3404 RExC_parse = s; 3405 else { 3406 const char* t = RExC_parse++; /* skip over the c */ 3407 const char *posixcc; 3408 3409 assert(*t == c); 3410 3411 if (UCHARAT(RExC_parse) == ']') { 3412 RExC_parse++; /* skip over the ending ] */ 3413 posixcc = s + 1; 3414 if (*s == ':') { 3415 const I32 complement = *posixcc == '^' ? *posixcc++ : 0; 3416 const I32 skip = t - posixcc; 3417 3418 /* Initially switch on the length of the name. */ 3419 switch (skip) { 3420 case 4: 3421 if (memEQ(posixcc, "word", 4)) { 3422 /* this is not POSIX, this is the Perl \w */; 3423 namedclass 3424 = complement ? ANYOF_NALNUM : ANYOF_ALNUM; 3425 } 3426 break; 3427 case 5: 3428 /* Names all of length 5. */ 3429 /* alnum alpha ascii blank cntrl digit graph lower 3430 print punct space upper */ 3431 /* Offset 4 gives the best switch position. */ 3432 switch (posixcc[4]) { 3433 case 'a': 3434 if (memEQ(posixcc, "alph", 4)) { 3435 /* a */ 3436 namedclass 3437 = complement ? ANYOF_NALPHA : ANYOF_ALPHA; 3438 } 3439 break; 3440 case 'e': 3441 if (memEQ(posixcc, "spac", 4)) { 3442 /* e */ 3443 namedclass 3444 = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC; 3445 } 3446 break; 3447 case 'h': 3448 if (memEQ(posixcc, "grap", 4)) { 3449 /* h */ 3450 namedclass 3451 = complement ? ANYOF_NGRAPH : ANYOF_GRAPH; 3452 } 3453 break; 3454 case 'i': 3455 if (memEQ(posixcc, "asci", 4)) { 3456 /* i */ 3457 namedclass 3458 = complement ? ANYOF_NASCII : ANYOF_ASCII; 3459 } 3460 break; 3461 case 'k': 3462 if (memEQ(posixcc, "blan", 4)) { 3463 /* k */ 3464 namedclass 3465 = complement ? ANYOF_NBLANK : ANYOF_BLANK; 3466 } 3467 break; 3468 case 'l': 3469 if (memEQ(posixcc, "cntr", 4)) { 3470 /* l */ 3471 namedclass 3472 = complement ? ANYOF_NCNTRL : ANYOF_CNTRL; 3473 } 3474 break; 3475 case 'm': 3476 if (memEQ(posixcc, "alnu", 4)) { 3477 /* m */ 3478 namedclass 3479 = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC; 3480 } 3481 break; 3482 case 'r': 3483 if (memEQ(posixcc, "lowe", 4)) { 3484 /* r */ 3485 namedclass 3486 = complement ? ANYOF_NLOWER : ANYOF_LOWER; 3487 } 3488 if (memEQ(posixcc, "uppe", 4)) { 3489 /* r */ 3490 namedclass 3491 = complement ? ANYOF_NUPPER : ANYOF_UPPER; 3492 } 3493 break; 3494 case 't': 3495 if (memEQ(posixcc, "digi", 4)) { 3496 /* t */ 3497 namedclass 3498 = complement ? ANYOF_NDIGIT : ANYOF_DIGIT; 3499 } 3500 if (memEQ(posixcc, "prin", 4)) { 3501 /* t */ 3502 namedclass 3503 = complement ? ANYOF_NPRINT : ANYOF_PRINT; 3504 } 3505 if (memEQ(posixcc, "punc", 4)) { 3506 /* t */ 3507 namedclass 3508 = complement ? ANYOF_NPUNCT : ANYOF_PUNCT; 3509 } 3510 break; 3511 } 3512 break; 3513 case 6: 3514 if (memEQ(posixcc, "xdigit", 6)) { 3515 namedclass 3516 = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT; 3517 } 3518 break; 3519 } 3520 3521 if (namedclass == OOB_NAMEDCLASS) 3522 { 3523 Simple_vFAIL3("POSIX class [:%.*s:] unknown", 3524 t - s - 1, s + 1); 3525 } 3526 assert (posixcc[skip] == ':'); 3527 assert (posixcc[skip+1] == ']'); 3528 } else if (!SIZE_ONLY) { 3529 /* [[=foo=]] and [[.foo.]] are still future. */ 3530 3531 /* adjust RExC_parse so the warning shows after 3532 the class closes */ 3533 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']') 3534 RExC_parse++; 3535 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c); 3536 } 3537 } else { 3538 /* Maternal grandfather: 3539 * "[:" ending in ":" but not in ":]" */ 3540 RExC_parse = s; 3541 } 3542 } 3543 } 3544 3545 return namedclass; 3546 } 3547 3548 STATIC void 3549 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state) 3550 { 3551 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) { 3552 const char *s = RExC_parse; 3553 const char c = *s++; 3554 3555 while(*s && isALNUM(*s)) 3556 s++; 3557 if (*s && c == *s && s[1] == ']') { 3558 if (ckWARN(WARN_REGEXP)) 3559 vWARN3(s+2, 3560 "POSIX syntax [%c %c] belongs inside character classes", 3561 c, c); 3562 3563 /* [[=foo=]] and [[.foo.]] are still future. */ 3564 if (POSIXCC_NOTYET(c)) { 3565 /* adjust RExC_parse so the error shows after 3566 the class closes */ 3567 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']') 3568 ; 3569 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c); 3570 } 3571 } 3572 } 3573 } 3574 3575 STATIC regnode * 3576 S_regclass(pTHX_ RExC_state_t *pRExC_state) 3577 { 3578 register UV value; 3579 register UV nextvalue; 3580 register IV prevvalue = OOB_UNICODE; 3581 register IV range = 0; 3582 register regnode *ret; 3583 STRLEN numlen; 3584 IV namedclass; 3585 char *rangebegin = 0; 3586 bool need_class = 0; 3587 SV *listsv = Nullsv; 3588 register char *e; 3589 UV n; 3590 bool optimize_invert = TRUE; 3591 AV* unicode_alternate = 0; 3592 #ifdef EBCDIC 3593 UV literal_endpoint = 0; 3594 #endif 3595 3596 ret = reganode(pRExC_state, ANYOF, 0); 3597 3598 if (!SIZE_ONLY) 3599 ANYOF_FLAGS(ret) = 0; 3600 3601 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */ 3602 RExC_naughty++; 3603 RExC_parse++; 3604 if (!SIZE_ONLY) 3605 ANYOF_FLAGS(ret) |= ANYOF_INVERT; 3606 } 3607 3608 if (SIZE_ONLY) 3609 RExC_size += ANYOF_SKIP; 3610 else { 3611 RExC_emit += ANYOF_SKIP; 3612 if (FOLD) 3613 ANYOF_FLAGS(ret) |= ANYOF_FOLD; 3614 if (LOC) 3615 ANYOF_FLAGS(ret) |= ANYOF_LOCALE; 3616 ANYOF_BITMAP_ZERO(ret); 3617 listsv = newSVpvn("# comment\n", 10); 3618 } 3619 3620 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0; 3621 3622 if (!SIZE_ONLY && POSIXCC(nextvalue)) 3623 checkposixcc(pRExC_state); 3624 3625 /* allow 1st char to be ] (allowing it to be - is dealt with later) */ 3626 if (UCHARAT(RExC_parse) == ']') 3627 goto charclassloop; 3628 3629 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') { 3630 3631 charclassloop: 3632 3633 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */ 3634 3635 if (!range) 3636 rangebegin = RExC_parse; 3637 if (UTF) { 3638 value = utf8n_to_uvchr((U8*)RExC_parse, 3639 RExC_end - RExC_parse, 3640 &numlen, 0); 3641 RExC_parse += numlen; 3642 } 3643 else 3644 value = UCHARAT(RExC_parse++); 3645 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0; 3646 if (value == '[' && POSIXCC(nextvalue)) 3647 namedclass = regpposixcc(pRExC_state, value); 3648 else if (value == '\\') { 3649 if (UTF) { 3650 value = utf8n_to_uvchr((U8*)RExC_parse, 3651 RExC_end - RExC_parse, 3652 &numlen, 0); 3653 RExC_parse += numlen; 3654 } 3655 else 3656 value = UCHARAT(RExC_parse++); 3657 /* Some compilers cannot handle switching on 64-bit integer 3658 * values, therefore value cannot be an UV. Yes, this will 3659 * be a problem later if we want switch on Unicode. 3660 * A similar issue a little bit later when switching on 3661 * namedclass. --jhi */ 3662 switch ((I32)value) { 3663 case 'w': namedclass = ANYOF_ALNUM; break; 3664 case 'W': namedclass = ANYOF_NALNUM; break; 3665 case 's': namedclass = ANYOF_SPACE; break; 3666 case 'S': namedclass = ANYOF_NSPACE; break; 3667 case 'd': namedclass = ANYOF_DIGIT; break; 3668 case 'D': namedclass = ANYOF_NDIGIT; break; 3669 case 'p': 3670 case 'P': 3671 if (RExC_parse >= RExC_end) 3672 vFAIL2("Empty \\%c{}", (U8)value); 3673 if (*RExC_parse == '{') { 3674 const U8 c = (U8)value; 3675 e = strchr(RExC_parse++, '}'); 3676 if (!e) 3677 vFAIL2("Missing right brace on \\%c{}", c); 3678 while (isSPACE(UCHARAT(RExC_parse))) 3679 RExC_parse++; 3680 if (e == RExC_parse) 3681 vFAIL2("Empty \\%c{}", c); 3682 n = e - RExC_parse; 3683 while (isSPACE(UCHARAT(RExC_parse + n - 1))) 3684 n--; 3685 } 3686 else { 3687 e = RExC_parse; 3688 n = 1; 3689 } 3690 if (!SIZE_ONLY) { 3691 if (UCHARAT(RExC_parse) == '^') { 3692 RExC_parse++; 3693 n--; 3694 value = value == 'p' ? 'P' : 'p'; /* toggle */ 3695 while (isSPACE(UCHARAT(RExC_parse))) { 3696 RExC_parse++; 3697 n--; 3698 } 3699 } 3700 if (value == 'p') 3701 Perl_sv_catpvf(aTHX_ listsv, 3702 "+utf8::%.*s\n", (int)n, RExC_parse); 3703 else 3704 Perl_sv_catpvf(aTHX_ listsv, 3705 "!utf8::%.*s\n", (int)n, RExC_parse); 3706 } 3707 RExC_parse = e + 1; 3708 ANYOF_FLAGS(ret) |= ANYOF_UNICODE; 3709 namedclass = ANYOF_MAX; /* no official name, but it's named */ 3710 break; 3711 case 'n': value = '\n'; break; 3712 case 'r': value = '\r'; break; 3713 case 't': value = '\t'; break; 3714 case 'f': value = '\f'; break; 3715 case 'b': value = '\b'; break; 3716 case 'e': value = ASCII_TO_NATIVE('\033');break; 3717 case 'a': value = ASCII_TO_NATIVE('\007');break; 3718 case 'x': 3719 if (*RExC_parse == '{') { 3720 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES 3721 | PERL_SCAN_DISALLOW_PREFIX; 3722 e = strchr(RExC_parse++, '}'); 3723 if (!e) 3724 vFAIL("Missing right brace on \\x{}"); 3725 3726 numlen = e - RExC_parse; 3727 value = grok_hex(RExC_parse, &numlen, &flags, NULL); 3728 RExC_parse = e + 1; 3729 } 3730 else { 3731 I32 flags = PERL_SCAN_DISALLOW_PREFIX; 3732 numlen = 2; 3733 value = grok_hex(RExC_parse, &numlen, &flags, NULL); 3734 RExC_parse += numlen; 3735 } 3736 break; 3737 case 'c': 3738 value = UCHARAT(RExC_parse++); 3739 value = toCTRL(value); 3740 break; 3741 case '0': case '1': case '2': case '3': case '4': 3742 case '5': case '6': case '7': case '8': case '9': 3743 { 3744 I32 flags = 0; 3745 numlen = 3; 3746 value = grok_oct(--RExC_parse, &numlen, &flags, NULL); 3747 RExC_parse += numlen; 3748 break; 3749 } 3750 default: 3751 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP)) 3752 vWARN2(RExC_parse, 3753 "Unrecognized escape \\%c in character class passed through", 3754 (int)value); 3755 break; 3756 } 3757 } /* end of \blah */ 3758 #ifdef EBCDIC 3759 else 3760 literal_endpoint++; 3761 #endif 3762 3763 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */ 3764 3765 if (!SIZE_ONLY && !need_class) 3766 ANYOF_CLASS_ZERO(ret); 3767 3768 need_class = 1; 3769 3770 /* a bad range like a-\d, a-[:digit:] ? */ 3771 if (range) { 3772 if (!SIZE_ONLY) { 3773 if (ckWARN(WARN_REGEXP)) 3774 vWARN4(RExC_parse, 3775 "False [] range \"%*.*s\"", 3776 RExC_parse - rangebegin, 3777 RExC_parse - rangebegin, 3778 rangebegin); 3779 if (prevvalue < 256) { 3780 ANYOF_BITMAP_SET(ret, prevvalue); 3781 ANYOF_BITMAP_SET(ret, '-'); 3782 } 3783 else { 3784 ANYOF_FLAGS(ret) |= ANYOF_UNICODE; 3785 Perl_sv_catpvf(aTHX_ listsv, 3786 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-'); 3787 } 3788 } 3789 3790 range = 0; /* this was not a true range */ 3791 } 3792 3793 if (!SIZE_ONLY) { 3794 const char *what = NULL; 3795 char yesno = 0; 3796 3797 if (namedclass > OOB_NAMEDCLASS) 3798 optimize_invert = FALSE; 3799 /* Possible truncation here but in some 64-bit environments 3800 * the compiler gets heartburn about switch on 64-bit values. 3801 * A similar issue a little earlier when switching on value. 3802 * --jhi */ 3803 switch ((I32)namedclass) { 3804 case ANYOF_ALNUM: 3805 if (LOC) 3806 ANYOF_CLASS_SET(ret, ANYOF_ALNUM); 3807 else { 3808 for (value = 0; value < 256; value++) 3809 if (isALNUM(value)) 3810 ANYOF_BITMAP_SET(ret, value); 3811 } 3812 yesno = '+'; 3813 what = "Word"; 3814 break; 3815 case ANYOF_NALNUM: 3816 if (LOC) 3817 ANYOF_CLASS_SET(ret, ANYOF_NALNUM); 3818 else { 3819 for (value = 0; value < 256; value++) 3820 if (!isALNUM(value)) 3821 ANYOF_BITMAP_SET(ret, value); 3822 } 3823 yesno = '!'; 3824 what = "Word"; 3825 break; 3826 case ANYOF_ALNUMC: 3827 if (LOC) 3828 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC); 3829 else { 3830 for (value = 0; value < 256; value++) 3831 if (isALNUMC(value)) 3832 ANYOF_BITMAP_SET(ret, value); 3833 } 3834 yesno = '+'; 3835 what = "Alnum"; 3836 break; 3837 case ANYOF_NALNUMC: 3838 if (LOC) 3839 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC); 3840 else { 3841 for (value = 0; value < 256; value++) 3842 if (!isALNUMC(value)) 3843 ANYOF_BITMAP_SET(ret, value); 3844 } 3845 yesno = '!'; 3846 what = "Alnum"; 3847 break; 3848 case ANYOF_ALPHA: 3849 if (LOC) 3850 ANYOF_CLASS_SET(ret, ANYOF_ALPHA); 3851 else { 3852 for (value = 0; value < 256; value++) 3853 if (isALPHA(value)) 3854 ANYOF_BITMAP_SET(ret, value); 3855 } 3856 yesno = '+'; 3857 what = "Alpha"; 3858 break; 3859 case ANYOF_NALPHA: 3860 if (LOC) 3861 ANYOF_CLASS_SET(ret, ANYOF_NALPHA); 3862 else { 3863 for (value = 0; value < 256; value++) 3864 if (!isALPHA(value)) 3865 ANYOF_BITMAP_SET(ret, value); 3866 } 3867 yesno = '!'; 3868 what = "Alpha"; 3869 break; 3870 case ANYOF_ASCII: 3871 if (LOC) 3872 ANYOF_CLASS_SET(ret, ANYOF_ASCII); 3873 else { 3874 #ifndef EBCDIC 3875 for (value = 0; value < 128; value++) 3876 ANYOF_BITMAP_SET(ret, value); 3877 #else /* EBCDIC */ 3878 for (value = 0; value < 256; value++) { 3879 if (isASCII(value)) 3880 ANYOF_BITMAP_SET(ret, value); 3881 } 3882 #endif /* EBCDIC */ 3883 } 3884 yesno = '+'; 3885 what = "ASCII"; 3886 break; 3887 case ANYOF_NASCII: 3888 if (LOC) 3889 ANYOF_CLASS_SET(ret, ANYOF_NASCII); 3890 else { 3891 #ifndef EBCDIC 3892 for (value = 128; value < 256; value++) 3893 ANYOF_BITMAP_SET(ret, value); 3894 #else /* EBCDIC */ 3895 for (value = 0; value < 256; value++) { 3896 if (!isASCII(value)) 3897 ANYOF_BITMAP_SET(ret, value); 3898 } 3899 #endif /* EBCDIC */ 3900 } 3901 yesno = '!'; 3902 what = "ASCII"; 3903 break; 3904 case ANYOF_BLANK: 3905 if (LOC) 3906 ANYOF_CLASS_SET(ret, ANYOF_BLANK); 3907 else { 3908 for (value = 0; value < 256; value++) 3909 if (isBLANK(value)) 3910 ANYOF_BITMAP_SET(ret, value); 3911 } 3912 yesno = '+'; 3913 what = "Blank"; 3914 break; 3915 case ANYOF_NBLANK: 3916 if (LOC) 3917 ANYOF_CLASS_SET(ret, ANYOF_NBLANK); 3918 else { 3919 for (value = 0; value < 256; value++) 3920 if (!isBLANK(value)) 3921 ANYOF_BITMAP_SET(ret, value); 3922 } 3923 yesno = '!'; 3924 what = "Blank"; 3925 break; 3926 case ANYOF_CNTRL: 3927 if (LOC) 3928 ANYOF_CLASS_SET(ret, ANYOF_CNTRL); 3929 else { 3930 for (value = 0; value < 256; value++) 3931 if (isCNTRL(value)) 3932 ANYOF_BITMAP_SET(ret, value); 3933 } 3934 yesno = '+'; 3935 what = "Cntrl"; 3936 break; 3937 case ANYOF_NCNTRL: 3938 if (LOC) 3939 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL); 3940 else { 3941 for (value = 0; value < 256; value++) 3942 if (!isCNTRL(value)) 3943 ANYOF_BITMAP_SET(ret, value); 3944 } 3945 yesno = '!'; 3946 what = "Cntrl"; 3947 break; 3948 case ANYOF_DIGIT: 3949 if (LOC) 3950 ANYOF_CLASS_SET(ret, ANYOF_DIGIT); 3951 else { 3952 /* consecutive digits assumed */ 3953 for (value = '0'; value <= '9'; value++) 3954 ANYOF_BITMAP_SET(ret, value); 3955 } 3956 yesno = '+'; 3957 what = "Digit"; 3958 break; 3959 case ANYOF_NDIGIT: 3960 if (LOC) 3961 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT); 3962 else { 3963 /* consecutive digits assumed */ 3964 for (value = 0; value < '0'; value++) 3965 ANYOF_BITMAP_SET(ret, value); 3966 for (value = '9' + 1; value < 256; value++) 3967 ANYOF_BITMAP_SET(ret, value); 3968 } 3969 yesno = '!'; 3970 what = "Digit"; 3971 break; 3972 case ANYOF_GRAPH: 3973 if (LOC) 3974 ANYOF_CLASS_SET(ret, ANYOF_GRAPH); 3975 else { 3976 for (value = 0; value < 256; value++) 3977 if (isGRAPH(value)) 3978 ANYOF_BITMAP_SET(ret, value); 3979 } 3980 yesno = '+'; 3981 what = "Graph"; 3982 break; 3983 case ANYOF_NGRAPH: 3984 if (LOC) 3985 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH); 3986 else { 3987 for (value = 0; value < 256; value++) 3988 if (!isGRAPH(value)) 3989 ANYOF_BITMAP_SET(ret, value); 3990 } 3991 yesno = '!'; 3992 what = "Graph"; 3993 break; 3994 case ANYOF_LOWER: 3995 if (LOC) 3996 ANYOF_CLASS_SET(ret, ANYOF_LOWER); 3997 else { 3998 for (value = 0; value < 256; value++) 3999 if (isLOWER(value)) 4000 ANYOF_BITMAP_SET(ret, value); 4001 } 4002 yesno = '+'; 4003 what = "Lower"; 4004 break; 4005 case ANYOF_NLOWER: 4006 if (LOC) 4007 ANYOF_CLASS_SET(ret, ANYOF_NLOWER); 4008 else { 4009 for (value = 0; value < 256; value++) 4010 if (!isLOWER(value)) 4011 ANYOF_BITMAP_SET(ret, value); 4012 } 4013 yesno = '!'; 4014 what = "Lower"; 4015 break; 4016 case ANYOF_PRINT: 4017 if (LOC) 4018 ANYOF_CLASS_SET(ret, ANYOF_PRINT); 4019 else { 4020 for (value = 0; value < 256; value++) 4021 if (isPRINT(value)) 4022 ANYOF_BITMAP_SET(ret, value); 4023 } 4024 yesno = '+'; 4025 what = "Print"; 4026 break; 4027 case ANYOF_NPRINT: 4028 if (LOC) 4029 ANYOF_CLASS_SET(ret, ANYOF_NPRINT); 4030 else { 4031 for (value = 0; value < 256; value++) 4032 if (!isPRINT(value)) 4033 ANYOF_BITMAP_SET(ret, value); 4034 } 4035 yesno = '!'; 4036 what = "Print"; 4037 break; 4038 case ANYOF_PSXSPC: 4039 if (LOC) 4040 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC); 4041 else { 4042 for (value = 0; value < 256; value++) 4043 if (isPSXSPC(value)) 4044 ANYOF_BITMAP_SET(ret, value); 4045 } 4046 yesno = '+'; 4047 what = "Space"; 4048 break; 4049 case ANYOF_NPSXSPC: 4050 if (LOC) 4051 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC); 4052 else { 4053 for (value = 0; value < 256; value++) 4054 if (!isPSXSPC(value)) 4055 ANYOF_BITMAP_SET(ret, value); 4056 } 4057 yesno = '!'; 4058 what = "Space"; 4059 break; 4060 case ANYOF_PUNCT: 4061 if (LOC) 4062 ANYOF_CLASS_SET(ret, ANYOF_PUNCT); 4063 else { 4064 for (value = 0; value < 256; value++) 4065 if (isPUNCT(value)) 4066 ANYOF_BITMAP_SET(ret, value); 4067 } 4068 yesno = '+'; 4069 what = "Punct"; 4070 break; 4071 case ANYOF_NPUNCT: 4072 if (LOC) 4073 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT); 4074 else { 4075 for (value = 0; value < 256; value++) 4076 if (!isPUNCT(value)) 4077 ANYOF_BITMAP_SET(ret, value); 4078 } 4079 yesno = '!'; 4080 what = "Punct"; 4081 break; 4082 case ANYOF_SPACE: 4083 if (LOC) 4084 ANYOF_CLASS_SET(ret, ANYOF_SPACE); 4085 else { 4086 for (value = 0; value < 256; value++) 4087 if (isSPACE(value)) 4088 ANYOF_BITMAP_SET(ret, value); 4089 } 4090 yesno = '+'; 4091 what = "SpacePerl"; 4092 break; 4093 case ANYOF_NSPACE: 4094 if (LOC) 4095 ANYOF_CLASS_SET(ret, ANYOF_NSPACE); 4096 else { 4097 for (value = 0; value < 256; value++) 4098 if (!isSPACE(value)) 4099 ANYOF_BITMAP_SET(ret, value); 4100 } 4101 yesno = '!'; 4102 what = "SpacePerl"; 4103 break; 4104 case ANYOF_UPPER: 4105 if (LOC) 4106 ANYOF_CLASS_SET(ret, ANYOF_UPPER); 4107 else { 4108 for (value = 0; value < 256; value++) 4109 if (isUPPER(value)) 4110 ANYOF_BITMAP_SET(ret, value); 4111 } 4112 yesno = '+'; 4113 what = "Upper"; 4114 break; 4115 case ANYOF_NUPPER: 4116 if (LOC) 4117 ANYOF_CLASS_SET(ret, ANYOF_NUPPER); 4118 else { 4119 for (value = 0; value < 256; value++) 4120 if (!isUPPER(value)) 4121 ANYOF_BITMAP_SET(ret, value); 4122 } 4123 yesno = '!'; 4124 what = "Upper"; 4125 break; 4126 case ANYOF_XDIGIT: 4127 if (LOC) 4128 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT); 4129 else { 4130 for (value = 0; value < 256; value++) 4131 if (isXDIGIT(value)) 4132 ANYOF_BITMAP_SET(ret, value); 4133 } 4134 yesno = '+'; 4135 what = "XDigit"; 4136 break; 4137 case ANYOF_NXDIGIT: 4138 if (LOC) 4139 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT); 4140 else { 4141 for (value = 0; value < 256; value++) 4142 if (!isXDIGIT(value)) 4143 ANYOF_BITMAP_SET(ret, value); 4144 } 4145 yesno = '!'; 4146 what = "XDigit"; 4147 break; 4148 case ANYOF_MAX: 4149 /* this is to handle \p and \P */ 4150 break; 4151 default: 4152 vFAIL("Invalid [::] class"); 4153 break; 4154 } 4155 if (what) { 4156 /* Strings such as "+utf8::isWord\n" */ 4157 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what); 4158 } 4159 if (LOC) 4160 ANYOF_FLAGS(ret) |= ANYOF_CLASS; 4161 continue; 4162 } 4163 } /* end of namedclass \blah */ 4164 4165 if (range) { 4166 if (prevvalue > (IV)value) /* b-a */ { 4167 Simple_vFAIL4("Invalid [] range \"%*.*s\"", 4168 RExC_parse - rangebegin, 4169 RExC_parse - rangebegin, 4170 rangebegin); 4171 range = 0; /* not a valid range */ 4172 } 4173 } 4174 else { 4175 prevvalue = value; /* save the beginning of the range */ 4176 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end && 4177 RExC_parse[1] != ']') { 4178 RExC_parse++; 4179 4180 /* a bad range like \w-, [:word:]- ? */ 4181 if (namedclass > OOB_NAMEDCLASS) { 4182 if (ckWARN(WARN_REGEXP)) 4183 vWARN4(RExC_parse, 4184 "False [] range \"%*.*s\"", 4185 RExC_parse - rangebegin, 4186 RExC_parse - rangebegin, 4187 rangebegin); 4188 if (!SIZE_ONLY) 4189 ANYOF_BITMAP_SET(ret, '-'); 4190 } else 4191 range = 1; /* yeah, it's a range! */ 4192 continue; /* but do it the next time */ 4193 } 4194 } 4195 4196 /* now is the next time */ 4197 if (!SIZE_ONLY) { 4198 IV i; 4199 4200 if (prevvalue < 256) { 4201 const IV ceilvalue = value < 256 ? value : 255; 4202 4203 #ifdef EBCDIC 4204 /* In EBCDIC [\x89-\x91] should include 4205 * the \x8e but [i-j] should not. */ 4206 if (literal_endpoint == 2 && 4207 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) || 4208 (isUPPER(prevvalue) && isUPPER(ceilvalue)))) 4209 { 4210 if (isLOWER(prevvalue)) { 4211 for (i = prevvalue; i <= ceilvalue; i++) 4212 if (isLOWER(i)) 4213 ANYOF_BITMAP_SET(ret, i); 4214 } else { 4215 for (i = prevvalue; i <= ceilvalue; i++) 4216 if (isUPPER(i)) 4217 ANYOF_BITMAP_SET(ret, i); 4218 } 4219 } 4220 else 4221 #endif 4222 for (i = prevvalue; i <= ceilvalue; i++) 4223 ANYOF_BITMAP_SET(ret, i); 4224 } 4225 if (value > 255 || UTF) { 4226 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue); 4227 const UV natvalue = NATIVE_TO_UNI(value); 4228 4229 ANYOF_FLAGS(ret) |= ANYOF_UNICODE; 4230 if (prevnatvalue < natvalue) { /* what about > ? */ 4231 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n", 4232 prevnatvalue, natvalue); 4233 } 4234 else if (prevnatvalue == natvalue) { 4235 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue); 4236 if (FOLD) { 4237 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; 4238 STRLEN foldlen; 4239 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen); 4240 4241 /* If folding and foldable and a single 4242 * character, insert also the folded version 4243 * to the charclass. */ 4244 if (f != value) { 4245 if (foldlen == (STRLEN)UNISKIP(f)) 4246 Perl_sv_catpvf(aTHX_ listsv, 4247 "%04"UVxf"\n", f); 4248 else { 4249 /* Any multicharacter foldings 4250 * require the following transform: 4251 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst) 4252 * where E folds into "pq" and F folds 4253 * into "rst", all other characters 4254 * fold to single characters. We save 4255 * away these multicharacter foldings, 4256 * to be later saved as part of the 4257 * additional "s" data. */ 4258 SV *sv; 4259 4260 if (!unicode_alternate) 4261 unicode_alternate = newAV(); 4262 sv = newSVpvn((char*)foldbuf, foldlen); 4263 SvUTF8_on(sv); 4264 av_push(unicode_alternate, sv); 4265 } 4266 } 4267 4268 /* If folding and the value is one of the Greek 4269 * sigmas insert a few more sigmas to make the 4270 * folding rules of the sigmas to work right. 4271 * Note that not all the possible combinations 4272 * are handled here: some of them are handled 4273 * by the standard folding rules, and some of 4274 * them (literal or EXACTF cases) are handled 4275 * during runtime in regexec.c:S_find_byclass(). */ 4276 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) { 4277 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", 4278 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA); 4279 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", 4280 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA); 4281 } 4282 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA) 4283 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", 4284 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA); 4285 } 4286 } 4287 } 4288 #ifdef EBCDIC 4289 literal_endpoint = 0; 4290 #endif 4291 } 4292 4293 range = 0; /* this range (if it was one) is done now */ 4294 } 4295 4296 if (need_class) { 4297 ANYOF_FLAGS(ret) |= ANYOF_LARGE; 4298 if (SIZE_ONLY) 4299 RExC_size += ANYOF_CLASS_ADD_SKIP; 4300 else 4301 RExC_emit += ANYOF_CLASS_ADD_SKIP; 4302 } 4303 4304 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */ 4305 if (!SIZE_ONLY && 4306 /* If the only flag is folding (plus possibly inversion). */ 4307 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD) 4308 ) { 4309 for (value = 0; value < 256; ++value) { 4310 if (ANYOF_BITMAP_TEST(ret, value)) { 4311 UV fold = PL_fold[value]; 4312 4313 if (fold != value) 4314 ANYOF_BITMAP_SET(ret, fold); 4315 } 4316 } 4317 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD; 4318 } 4319 4320 /* optimize inverted simple patterns (e.g. [^a-z]) */ 4321 if (!SIZE_ONLY && optimize_invert && 4322 /* If the only flag is inversion. */ 4323 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) { 4324 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value) 4325 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL; 4326 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL; 4327 } 4328 4329 if (!SIZE_ONLY) { 4330 AV *av = newAV(); 4331 SV *rv; 4332 4333 /* The 0th element stores the character class description 4334 * in its textual form: used later (regexec.c:Perl_regclass_swash()) 4335 * to initialize the appropriate swash (which gets stored in 4336 * the 1st element), and also useful for dumping the regnode. 4337 * The 2nd element stores the multicharacter foldings, 4338 * used later (regexec.c:S_reginclass()). */ 4339 av_store(av, 0, listsv); 4340 av_store(av, 1, NULL); 4341 av_store(av, 2, (SV*)unicode_alternate); 4342 rv = newRV_noinc((SV*)av); 4343 n = add_data(pRExC_state, 1, "s"); 4344 RExC_rx->data->data[n] = (void*)rv; 4345 ARG_SET(ret, n); 4346 } 4347 4348 return ret; 4349 } 4350 4351 STATIC char* 4352 S_nextchar(pTHX_ RExC_state_t *pRExC_state) 4353 { 4354 char* retval = RExC_parse++; 4355 4356 for (;;) { 4357 if (*RExC_parse == '(' && RExC_parse[1] == '?' && 4358 RExC_parse[2] == '#') { 4359 while (*RExC_parse != ')') { 4360 if (RExC_parse == RExC_end) 4361 FAIL("Sequence (?#... not terminated"); 4362 RExC_parse++; 4363 } 4364 RExC_parse++; 4365 continue; 4366 } 4367 if (RExC_flags & PMf_EXTENDED) { 4368 if (isSPACE(*RExC_parse)) { 4369 RExC_parse++; 4370 continue; 4371 } 4372 else if (*RExC_parse == '#') { 4373 while (RExC_parse < RExC_end) 4374 if (*RExC_parse++ == '\n') break; 4375 continue; 4376 } 4377 } 4378 return retval; 4379 } 4380 } 4381 4382 /* 4383 - reg_node - emit a node 4384 */ 4385 STATIC regnode * /* Location. */ 4386 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) 4387 { 4388 register regnode *ptr; 4389 regnode * const ret = RExC_emit; 4390 4391 if (SIZE_ONLY) { 4392 SIZE_ALIGN(RExC_size); 4393 RExC_size += 1; 4394 return(ret); 4395 } 4396 4397 NODE_ALIGN_FILL(ret); 4398 ptr = ret; 4399 FILL_ADVANCE_NODE(ptr, op); 4400 if (RExC_offsets) { /* MJD */ 4401 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n", 4402 "reg_node", __LINE__, 4403 reg_name[op], 4404 RExC_emit - RExC_emit_start > RExC_offsets[0] 4405 ? "Overwriting end of array!\n" : "OK", 4406 RExC_emit - RExC_emit_start, 4407 RExC_parse - RExC_start, 4408 RExC_offsets[0])); 4409 Set_Node_Offset(RExC_emit, RExC_parse + (op == END)); 4410 } 4411 4412 RExC_emit = ptr; 4413 4414 return(ret); 4415 } 4416 4417 /* 4418 - reganode - emit a node with an argument 4419 */ 4420 STATIC regnode * /* Location. */ 4421 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) 4422 { 4423 register regnode *ptr; 4424 regnode * const ret = RExC_emit; 4425 4426 if (SIZE_ONLY) { 4427 SIZE_ALIGN(RExC_size); 4428 RExC_size += 2; 4429 return(ret); 4430 } 4431 4432 NODE_ALIGN_FILL(ret); 4433 ptr = ret; 4434 FILL_ADVANCE_NODE_ARG(ptr, op, arg); 4435 if (RExC_offsets) { /* MJD */ 4436 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n", 4437 "reganode", 4438 __LINE__, 4439 reg_name[op], 4440 RExC_emit - RExC_emit_start > RExC_offsets[0] ? 4441 "Overwriting end of array!\n" : "OK", 4442 RExC_emit - RExC_emit_start, 4443 RExC_parse - RExC_start, 4444 RExC_offsets[0])); 4445 Set_Cur_Node_Offset; 4446 } 4447 4448 RExC_emit = ptr; 4449 4450 return(ret); 4451 } 4452 4453 /* 4454 - reguni - emit (if appropriate) a Unicode character 4455 */ 4456 STATIC void 4457 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp) 4458 { 4459 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s); 4460 } 4461 4462 /* 4463 - reginsert - insert an operator in front of already-emitted operand 4464 * 4465 * Means relocating the operand. 4466 */ 4467 STATIC void 4468 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd) 4469 { 4470 register regnode *src; 4471 register regnode *dst; 4472 register regnode *place; 4473 const int offset = regarglen[(U8)op]; 4474 4475 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */ 4476 4477 if (SIZE_ONLY) { 4478 RExC_size += NODE_STEP_REGNODE + offset; 4479 return; 4480 } 4481 4482 src = RExC_emit; 4483 RExC_emit += NODE_STEP_REGNODE + offset; 4484 dst = RExC_emit; 4485 while (src > opnd) { 4486 StructCopy(--src, --dst, regnode); 4487 if (RExC_offsets) { /* MJD 20010112 */ 4488 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n", 4489 "reg_insert", 4490 __LINE__, 4491 reg_name[op], 4492 dst - RExC_emit_start > RExC_offsets[0] 4493 ? "Overwriting end of array!\n" : "OK", 4494 src - RExC_emit_start, 4495 dst - RExC_emit_start, 4496 RExC_offsets[0])); 4497 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src)); 4498 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src)); 4499 } 4500 } 4501 4502 4503 place = opnd; /* Op node, where operand used to be. */ 4504 if (RExC_offsets) { /* MJD */ 4505 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n", 4506 "reginsert", 4507 __LINE__, 4508 reg_name[op], 4509 place - RExC_emit_start > RExC_offsets[0] 4510 ? "Overwriting end of array!\n" : "OK", 4511 place - RExC_emit_start, 4512 RExC_parse - RExC_start, 4513 RExC_offsets[0])); 4514 Set_Node_Offset(place, RExC_parse); 4515 Set_Node_Length(place, 1); 4516 } 4517 src = NEXTOPER(place); 4518 FILL_ADVANCE_NODE(place, op); 4519 Zero(src, offset, regnode); 4520 } 4521 4522 /* 4523 - regtail - set the next-pointer at the end of a node chain of p to val. 4524 */ 4525 STATIC void 4526 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val) 4527 { 4528 register regnode *scan; 4529 4530 if (SIZE_ONLY) 4531 return; 4532 4533 /* Find last node. */ 4534 scan = p; 4535 for (;;) { 4536 regnode * const temp = regnext(scan); 4537 if (temp == NULL) 4538 break; 4539 scan = temp; 4540 } 4541 4542 if (reg_off_by_arg[OP(scan)]) { 4543 ARG_SET(scan, val - scan); 4544 } 4545 else { 4546 NEXT_OFF(scan) = val - scan; 4547 } 4548 } 4549 4550 /* 4551 - regoptail - regtail on operand of first argument; nop if operandless 4552 */ 4553 STATIC void 4554 S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val) 4555 { 4556 /* "Operandless" and "op != BRANCH" are synonymous in practice. */ 4557 if (p == NULL || SIZE_ONLY) 4558 return; 4559 if (PL_regkind[(U8)OP(p)] == BRANCH) { 4560 regtail(pRExC_state, NEXTOPER(p), val); 4561 } 4562 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) { 4563 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val); 4564 } 4565 else 4566 return; 4567 } 4568 4569 /* 4570 - regcurly - a little FSA that accepts {\d+,?\d*} 4571 */ 4572 STATIC I32 4573 S_regcurly(pTHX_ register const char *s) 4574 { 4575 if (*s++ != '{') 4576 return FALSE; 4577 if (!isDIGIT(*s)) 4578 return FALSE; 4579 while (isDIGIT(*s)) 4580 s++; 4581 if (*s == ',') 4582 s++; 4583 while (isDIGIT(*s)) 4584 s++; 4585 if (*s != '}') 4586 return FALSE; 4587 return TRUE; 4588 } 4589 4590 /* 4591 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form 4592 */ 4593 void 4594 Perl_regdump(pTHX_ regexp *r) 4595 { 4596 #ifdef DEBUGGING 4597 SV *sv = sv_newmortal(); 4598 4599 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0); 4600 4601 /* Header fields of interest. */ 4602 if (r->anchored_substr) 4603 PerlIO_printf(Perl_debug_log, 4604 "anchored \"%s%.*s%s\"%s at %"IVdf" ", 4605 PL_colors[0], 4606 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)), 4607 SvPVX_const(r->anchored_substr), 4608 PL_colors[1], 4609 SvTAIL(r->anchored_substr) ? "$" : "", 4610 (IV)r->anchored_offset); 4611 else if (r->anchored_utf8) 4612 PerlIO_printf(Perl_debug_log, 4613 "anchored utf8 \"%s%.*s%s\"%s at %"IVdf" ", 4614 PL_colors[0], 4615 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)), 4616 SvPVX_const(r->anchored_utf8), 4617 PL_colors[1], 4618 SvTAIL(r->anchored_utf8) ? "$" : "", 4619 (IV)r->anchored_offset); 4620 if (r->float_substr) 4621 PerlIO_printf(Perl_debug_log, 4622 "floating \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ", 4623 PL_colors[0], 4624 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)), 4625 SvPVX_const(r->float_substr), 4626 PL_colors[1], 4627 SvTAIL(r->float_substr) ? "$" : "", 4628 (IV)r->float_min_offset, (UV)r->float_max_offset); 4629 else if (r->float_utf8) 4630 PerlIO_printf(Perl_debug_log, 4631 "floating utf8 \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ", 4632 PL_colors[0], 4633 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)), 4634 SvPVX_const(r->float_utf8), 4635 PL_colors[1], 4636 SvTAIL(r->float_utf8) ? "$" : "", 4637 (IV)r->float_min_offset, (UV)r->float_max_offset); 4638 if (r->check_substr || r->check_utf8) 4639 PerlIO_printf(Perl_debug_log, 4640 r->check_substr == r->float_substr 4641 && r->check_utf8 == r->float_utf8 4642 ? "(checking floating" : "(checking anchored"); 4643 if (r->reganch & ROPT_NOSCAN) 4644 PerlIO_printf(Perl_debug_log, " noscan"); 4645 if (r->reganch & ROPT_CHECK_ALL) 4646 PerlIO_printf(Perl_debug_log, " isall"); 4647 if (r->check_substr || r->check_utf8) 4648 PerlIO_printf(Perl_debug_log, ") "); 4649 4650 if (r->regstclass) { 4651 regprop(sv, r->regstclass); 4652 PerlIO_printf(Perl_debug_log, "stclass \"%s\" ", SvPVX_const(sv)); 4653 } 4654 if (r->reganch & ROPT_ANCH) { 4655 PerlIO_printf(Perl_debug_log, "anchored"); 4656 if (r->reganch & ROPT_ANCH_BOL) 4657 PerlIO_printf(Perl_debug_log, "(BOL)"); 4658 if (r->reganch & ROPT_ANCH_MBOL) 4659 PerlIO_printf(Perl_debug_log, "(MBOL)"); 4660 if (r->reganch & ROPT_ANCH_SBOL) 4661 PerlIO_printf(Perl_debug_log, "(SBOL)"); 4662 if (r->reganch & ROPT_ANCH_GPOS) 4663 PerlIO_printf(Perl_debug_log, "(GPOS)"); 4664 PerlIO_putc(Perl_debug_log, ' '); 4665 } 4666 if (r->reganch & ROPT_GPOS_SEEN) 4667 PerlIO_printf(Perl_debug_log, "GPOS "); 4668 if (r->reganch & ROPT_SKIP) 4669 PerlIO_printf(Perl_debug_log, "plus "); 4670 if (r->reganch & ROPT_IMPLICIT) 4671 PerlIO_printf(Perl_debug_log, "implicit "); 4672 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen); 4673 if (r->reganch & ROPT_EVAL_SEEN) 4674 PerlIO_printf(Perl_debug_log, "with eval "); 4675 PerlIO_printf(Perl_debug_log, "\n"); 4676 if (r->offsets) { 4677 U32 i; 4678 const U32 len = r->offsets[0]; 4679 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]); 4680 for (i = 1; i <= len; i++) 4681 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ", 4682 (UV)r->offsets[i*2-1], 4683 (UV)r->offsets[i*2]); 4684 PerlIO_printf(Perl_debug_log, "\n"); 4685 } 4686 #endif /* DEBUGGING */ 4687 } 4688 4689 /* 4690 - regprop - printable representation of opcode 4691 */ 4692 void 4693 Perl_regprop(pTHX_ SV *sv, regnode *o) 4694 { 4695 #ifdef DEBUGGING 4696 register int k; 4697 4698 sv_setpvn(sv, "", 0); 4699 if (OP(o) >= reg_num) /* regnode.type is unsigned */ 4700 /* It would be nice to FAIL() here, but this may be called from 4701 regexec.c, and it would be hard to supply pRExC_state. */ 4702 Perl_croak(aTHX_ "Corrupted regexp opcode"); 4703 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */ 4704 4705 k = PL_regkind[(U8)OP(o)]; 4706 4707 if (k == EXACT) { 4708 SV * const dsv = sv_2mortal(newSVpvn("", 0)); 4709 /* Using is_utf8_string() is a crude hack but it may 4710 * be the best for now since we have no flag "this EXACTish 4711 * node was UTF-8" --jhi */ 4712 const bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o)); 4713 const char * const s = do_utf8 ? 4714 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60, 4715 UNI_DISPLAY_REGEX) : 4716 STRING(o); 4717 const int len = do_utf8 ? 4718 strlen(s) : 4719 STR_LEN(o); 4720 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>", 4721 PL_colors[0], 4722 len, s, 4723 PL_colors[1]); 4724 } 4725 else if (k == CURLY) { 4726 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX) 4727 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */ 4728 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o)); 4729 } 4730 else if (k == WHILEM && o->flags) /* Ordinal/of */ 4731 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4); 4732 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP ) 4733 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */ 4734 else if (k == LOGICAL) 4735 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */ 4736 else if (k == ANYOF) { 4737 int i, rangestart = -1; 4738 const U8 flags = ANYOF_FLAGS(o); 4739 const char * const anyofs[] = { /* Should be synchronized with 4740 * ANYOF_ #xdefines in regcomp.h */ 4741 "\\w", 4742 "\\W", 4743 "\\s", 4744 "\\S", 4745 "\\d", 4746 "\\D", 4747 "[:alnum:]", 4748 "[:^alnum:]", 4749 "[:alpha:]", 4750 "[:^alpha:]", 4751 "[:ascii:]", 4752 "[:^ascii:]", 4753 "[:ctrl:]", 4754 "[:^ctrl:]", 4755 "[:graph:]", 4756 "[:^graph:]", 4757 "[:lower:]", 4758 "[:^lower:]", 4759 "[:print:]", 4760 "[:^print:]", 4761 "[:punct:]", 4762 "[:^punct:]", 4763 "[:upper:]", 4764 "[:^upper:]", 4765 "[:xdigit:]", 4766 "[:^xdigit:]", 4767 "[:space:]", 4768 "[:^space:]", 4769 "[:blank:]", 4770 "[:^blank:]" 4771 }; 4772 4773 if (flags & ANYOF_LOCALE) 4774 sv_catpv(sv, "{loc}"); 4775 if (flags & ANYOF_FOLD) 4776 sv_catpv(sv, "{i}"); 4777 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); 4778 if (flags & ANYOF_INVERT) 4779 sv_catpv(sv, "^"); 4780 for (i = 0; i <= 256; i++) { 4781 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) { 4782 if (rangestart == -1) 4783 rangestart = i; 4784 } else if (rangestart != -1) { 4785 if (i <= rangestart + 3) 4786 for (; rangestart < i; rangestart++) 4787 put_byte(sv, rangestart); 4788 else { 4789 put_byte(sv, rangestart); 4790 sv_catpv(sv, "-"); 4791 put_byte(sv, i - 1); 4792 } 4793 rangestart = -1; 4794 } 4795 } 4796 4797 if (o->flags & ANYOF_CLASS) 4798 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++) 4799 if (ANYOF_CLASS_TEST(o,i)) 4800 sv_catpv(sv, anyofs[i]); 4801 4802 if (flags & ANYOF_UNICODE) 4803 sv_catpv(sv, "{unicode}"); 4804 else if (flags & ANYOF_UNICODE_ALL) 4805 sv_catpv(sv, "{unicode_all}"); 4806 4807 { 4808 SV *lv; 4809 SV * const sw = regclass_swash(o, FALSE, &lv, 0); 4810 4811 if (lv) { 4812 if (sw) { 4813 U8 s[UTF8_MAXBYTES_CASE+1]; 4814 4815 for (i = 0; i <= 256; i++) { /* just the first 256 */ 4816 uvchr_to_utf8(s, i); 4817 4818 if (i < 256 && swash_fetch(sw, s, TRUE)) { 4819 if (rangestart == -1) 4820 rangestart = i; 4821 } else if (rangestart != -1) { 4822 if (i <= rangestart + 3) 4823 for (; rangestart < i; rangestart++) { 4824 const U8 * const e = uvchr_to_utf8(s,rangestart); 4825 U8 *p; 4826 for(p = s; p < e; p++) 4827 put_byte(sv, *p); 4828 } 4829 else { 4830 const U8 *e = uvchr_to_utf8(s,rangestart); 4831 U8 *p; 4832 for (p = s; p < e; p++) 4833 put_byte(sv, *p); 4834 sv_catpvn(sv, "-", 1); 4835 e = uvchr_to_utf8(s, i-1); 4836 for (p = s; p < e; p++) 4837 put_byte(sv, *p); 4838 } 4839 rangestart = -1; 4840 } 4841 } 4842 4843 sv_catpv(sv, "..."); /* et cetera */ 4844 } 4845 4846 { 4847 char *s = savesvpv(lv); 4848 char *origs = s; 4849 4850 while(*s && *s != '\n') s++; 4851 4852 if (*s == '\n') { 4853 const char * const t = ++s; 4854 4855 while (*s) { 4856 if (*s == '\n') 4857 *s = ' '; 4858 s++; 4859 } 4860 if (s[-1] == ' ') 4861 s[-1] = 0; 4862 4863 sv_catpv(sv, t); 4864 } 4865 4866 Safefree(origs); 4867 } 4868 } 4869 } 4870 4871 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]); 4872 } 4873 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) 4874 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags); 4875 #endif /* DEBUGGING */ 4876 } 4877 4878 SV * 4879 Perl_re_intuit_string(pTHX_ regexp *prog) 4880 { /* Assume that RE_INTUIT is set */ 4881 DEBUG_r( 4882 { 4883 const char * const s = SvPV_nolen_const(prog->check_substr 4884 ? prog->check_substr : prog->check_utf8); 4885 4886 if (!PL_colorset) reginitcolors(); 4887 PerlIO_printf(Perl_debug_log, 4888 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n", 4889 PL_colors[4], 4890 prog->check_substr ? "" : "utf8 ", 4891 PL_colors[5],PL_colors[0], 4892 s, 4893 PL_colors[1], 4894 (strlen(s) > 60 ? "..." : "")); 4895 } ); 4896 4897 return prog->check_substr ? prog->check_substr : prog->check_utf8; 4898 } 4899 4900 void 4901 Perl_pregfree(pTHX_ struct regexp *r) 4902 { 4903 #ifdef DEBUGGING 4904 SV *dsv = PERL_DEBUG_PAD_ZERO(0); 4905 #endif 4906 4907 if (!r || (--r->refcnt > 0)) 4908 return; 4909 DEBUG_r({ 4910 const char *s = (r->reganch & ROPT_UTF8) 4911 ? pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60, UNI_DISPLAY_REGEX) 4912 : pv_display(dsv, r->precomp, r->prelen, 0, 60); 4913 const int len = SvCUR(dsv); 4914 if (!PL_colorset) 4915 reginitcolors(); 4916 PerlIO_printf(Perl_debug_log, 4917 "%sFreeing REx:%s `%s%*.*s%s%s'\n", 4918 PL_colors[4],PL_colors[5],PL_colors[0], 4919 len, len, s, 4920 PL_colors[1], 4921 len > 60 ? "..." : ""); 4922 }); 4923 4924 /* gcov results gave these as non-null 100% of the time, so there's no 4925 optimisation in checking them before calling Safefree */ 4926 Safefree(r->precomp); 4927 Safefree(r->offsets); /* 20010421 MJD */ 4928 if (RX_MATCH_COPIED(r)) 4929 Safefree(r->subbeg); 4930 if (r->substrs) { 4931 if (r->anchored_substr) 4932 SvREFCNT_dec(r->anchored_substr); 4933 if (r->anchored_utf8) 4934 SvREFCNT_dec(r->anchored_utf8); 4935 if (r->float_substr) 4936 SvREFCNT_dec(r->float_substr); 4937 if (r->float_utf8) 4938 SvREFCNT_dec(r->float_utf8); 4939 Safefree(r->substrs); 4940 } 4941 if (r->data) { 4942 int n = r->data->count; 4943 PAD* new_comppad = NULL; 4944 PAD* old_comppad; 4945 PADOFFSET refcnt; 4946 4947 while (--n >= 0) { 4948 /* If you add a ->what type here, update the comment in regcomp.h */ 4949 switch (r->data->what[n]) { 4950 case 's': 4951 SvREFCNT_dec((SV*)r->data->data[n]); 4952 break; 4953 case 'f': 4954 Safefree(r->data->data[n]); 4955 break; 4956 case 'p': 4957 new_comppad = (AV*)r->data->data[n]; 4958 break; 4959 case 'o': 4960 if (new_comppad == NULL) 4961 Perl_croak(aTHX_ "panic: pregfree comppad"); 4962 PAD_SAVE_LOCAL(old_comppad, 4963 /* Watch out for global destruction's random ordering. */ 4964 (SvTYPE(new_comppad) == SVt_PVAV) ? 4965 new_comppad : Null(PAD *) 4966 ); 4967 OP_REFCNT_LOCK; 4968 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]); 4969 OP_REFCNT_UNLOCK; 4970 if (!refcnt) 4971 op_free((OP_4tree*)r->data->data[n]); 4972 4973 PAD_RESTORE_LOCAL(old_comppad); 4974 SvREFCNT_dec((SV*)new_comppad); 4975 new_comppad = NULL; 4976 break; 4977 case 'n': 4978 break; 4979 default: 4980 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]); 4981 } 4982 } 4983 Safefree(r->data->what); 4984 Safefree(r->data); 4985 } 4986 Safefree(r->startp); 4987 Safefree(r->endp); 4988 Safefree(r); 4989 } 4990 4991 /* 4992 - regnext - dig the "next" pointer out of a node 4993 * 4994 * [Note, when REGALIGN is defined there are two places in regmatch() 4995 * that bypass this code for speed.] 4996 */ 4997 regnode * 4998 Perl_regnext(pTHX_ register regnode *p) 4999 { 5000 register I32 offset; 5001 5002 if (p == &PL_regdummy) 5003 return(NULL); 5004 5005 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p)); 5006 if (offset == 0) 5007 return(NULL); 5008 5009 return(p+offset); 5010 } 5011 5012 STATIC void 5013 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...) 5014 { 5015 va_list args; 5016 STRLEN l1 = strlen(pat1); 5017 STRLEN l2 = strlen(pat2); 5018 char buf[512]; 5019 SV *msv; 5020 const char *message; 5021 5022 if (l1 > 510) 5023 l1 = 510; 5024 if (l1 + l2 > 510) 5025 l2 = 510 - l1; 5026 Copy(pat1, buf, l1 , char); 5027 Copy(pat2, buf + l1, l2 , char); 5028 buf[l1 + l2] = '\n'; 5029 buf[l1 + l2 + 1] = '\0'; 5030 #ifdef I_STDARG 5031 /* ANSI variant takes additional second argument */ 5032 va_start(args, pat2); 5033 #else 5034 va_start(args); 5035 #endif 5036 msv = vmess(buf, &args); 5037 va_end(args); 5038 message = SvPV_const(msv,l1); 5039 if (l1 > 512) 5040 l1 = 512; 5041 Copy(message, buf, l1 , char); 5042 buf[l1-1] = '\0'; /* Overwrite \n */ 5043 Perl_croak(aTHX_ "%s", buf); 5044 } 5045 5046 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */ 5047 5048 void 5049 Perl_save_re_context(pTHX) 5050 { 5051 SAVEI32(PL_reg_flags); /* from regexec.c */ 5052 SAVEPPTR(PL_bostr); 5053 SAVEPPTR(PL_reginput); /* String-input pointer. */ 5054 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */ 5055 SAVEPPTR(PL_regeol); /* End of input, for $ check. */ 5056 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */ 5057 SAVEVPTR(PL_regendp); /* Ditto for endp. */ 5058 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */ 5059 SAVEVPTR(PL_reglastcloseparen); /* Similarly for lastcloseparen. */ 5060 SAVEPPTR(PL_regtill); /* How far we are required to go. */ 5061 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */ 5062 PL_reg_start_tmp = 0; 5063 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */ 5064 PL_reg_start_tmpl = 0; 5065 SAVEVPTR(PL_regdata); 5066 SAVEI32(PL_reg_eval_set); /* from regexec.c */ 5067 SAVEI32(PL_regnarrate); /* from regexec.c */ 5068 SAVEVPTR(PL_regprogram); /* from regexec.c */ 5069 SAVEINT(PL_regindent); /* from regexec.c */ 5070 SAVEVPTR(PL_regcc); /* from regexec.c */ 5071 SAVEVPTR(PL_curcop); 5072 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */ 5073 SAVEVPTR(PL_reg_re); /* from regexec.c */ 5074 SAVEPPTR(PL_reg_ganch); /* from regexec.c */ 5075 SAVESPTR(PL_reg_sv); /* from regexec.c */ 5076 SAVEBOOL(PL_reg_match_utf8); /* from regexec.c */ 5077 SAVEVPTR(PL_reg_magic); /* from regexec.c */ 5078 SAVEI32(PL_reg_oldpos); /* from regexec.c */ 5079 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */ 5080 SAVEVPTR(PL_reg_curpm); /* from regexec.c */ 5081 SAVEPPTR(PL_reg_oldsaved); /* old saved substr during match */ 5082 PL_reg_oldsaved = Nullch; 5083 SAVEI32(PL_reg_oldsavedlen); /* old length of saved substr during match */ 5084 PL_reg_oldsavedlen = 0; 5085 SAVEI32(PL_reg_maxiter); /* max wait until caching pos */ 5086 PL_reg_maxiter = 0; 5087 SAVEI32(PL_reg_leftiter); /* wait until caching pos */ 5088 PL_reg_leftiter = 0; 5089 SAVEGENERICPV(PL_reg_poscache); /* cache of pos of WHILEM */ 5090 PL_reg_poscache = Nullch; 5091 SAVEI32(PL_reg_poscache_size); /* size of pos cache of WHILEM */ 5092 PL_reg_poscache_size = 0; 5093 SAVEPPTR(PL_regprecomp); /* uncompiled string. */ 5094 SAVEI32(PL_regnpar); /* () count. */ 5095 SAVEI32(PL_regsize); /* from regexec.c */ 5096 5097 { 5098 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */ 5099 REGEXP *rx; 5100 5101 if (PL_curpm && (rx = PM_GETRE(PL_curpm))) { 5102 U32 i; 5103 for (i = 1; i <= rx->nparens; i++) { 5104 GV *mgv; 5105 char digits[TYPE_CHARS(long)]; 5106 sprintf(digits, "%lu", (long)i); 5107 if ((mgv = gv_fetchpv(digits, FALSE, SVt_PV))) 5108 save_scalar(mgv); 5109 } 5110 } 5111 } 5112 5113 #ifdef DEBUGGING 5114 SAVEPPTR(PL_reg_starttry); /* from regexec.c */ 5115 #endif 5116 } 5117 5118 static void 5119 clear_re(pTHX_ void *r) 5120 { 5121 ReREFCNT_dec((regexp *)r); 5122 } 5123 5124 #ifdef DEBUGGING 5125 5126 STATIC void 5127 S_put_byte(pTHX_ SV *sv, int c) 5128 { 5129 if (isCNTRL(c) || c == 255 || !isPRINT(c)) 5130 Perl_sv_catpvf(aTHX_ sv, "\\%o", c); 5131 else if (c == '-' || c == ']' || c == '\\' || c == '^') 5132 Perl_sv_catpvf(aTHX_ sv, "\\%c", c); 5133 else 5134 Perl_sv_catpvf(aTHX_ sv, "%c", c); 5135 } 5136 5137 5138 STATIC regnode * 5139 S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l) 5140 { 5141 register U8 op = EXACT; /* Arbitrary non-END op. */ 5142 register regnode *next; 5143 5144 while (op != END && (!last || node < last)) { 5145 /* While that wasn't END last time... */ 5146 5147 NODE_ALIGN(node); 5148 op = OP(node); 5149 if (op == CLOSE) 5150 l--; 5151 next = regnext(node); 5152 /* Where, what. */ 5153 if (OP(node) == OPTIMIZED) 5154 goto after_print; 5155 regprop(sv, node); 5156 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start), 5157 (int)(2*l + 1), "", SvPVX_const(sv)); 5158 if (next == NULL) /* Next ptr. */ 5159 PerlIO_printf(Perl_debug_log, "(0)"); 5160 else 5161 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start)); 5162 (void)PerlIO_putc(Perl_debug_log, '\n'); 5163 after_print: 5164 if (PL_regkind[(U8)op] == BRANCHJ) { 5165 register regnode *nnode = (OP(next) == LONGJMP 5166 ? regnext(next) 5167 : next); 5168 if (last && nnode > last) 5169 nnode = last; 5170 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1); 5171 } 5172 else if (PL_regkind[(U8)op] == BRANCH) { 5173 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1); 5174 } 5175 else if ( op == CURLY) { /* "next" might be very big: optimizer */ 5176 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS, 5177 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1); 5178 } 5179 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) { 5180 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS, 5181 next, sv, l + 1); 5182 } 5183 else if ( op == PLUS || op == STAR) { 5184 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1); 5185 } 5186 else if (op == ANYOF) { 5187 /* arglen 1 + class block */ 5188 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE) 5189 ? ANYOF_CLASS_SKIP : ANYOF_SKIP); 5190 node = NEXTOPER(node); 5191 } 5192 else if (PL_regkind[(U8)op] == EXACT) { 5193 /* Literal string, where present. */ 5194 node += NODE_SZ_STR(node) - 1; 5195 node = NEXTOPER(node); 5196 } 5197 else { 5198 node = NEXTOPER(node); 5199 node += regarglen[(U8)op]; 5200 } 5201 if (op == CURLYX || op == OPEN) 5202 l++; 5203 else if (op == WHILEM) 5204 l--; 5205 } 5206 return node; 5207 } 5208 5209 #endif /* DEBUGGING */ 5210 5211 /* 5212 * Local variables: 5213 * c-indentation-style: bsd 5214 * c-basic-offset: 4 5215 * indent-tabs-mode: t 5216 * End: 5217 * 5218 * ex: set ts=8 sts=4 sw=4 noet: 5219 */ 5220