1 /* regcomp.c 2 */ 3 4 /* 5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee 6 */ 7 8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not 9 * confused with the original package (see point 3 below). Thanks, Henry! 10 */ 11 12 /* Additional note: this code is very heavily munged from Henry's version 13 * in places. In some spots I've traded clarity for efficiency, so don't 14 * blame Henry for some of the lack of readability. 15 */ 16 17 /* The names of the functions have been changed from regcomp and 18 * regexec to pregcomp and pregexec in order to avoid conflicts 19 * with the POSIX routines of the same names. 20 */ 21 22 #ifdef PERL_EXT_RE_BUILD 23 /* need to replace pregcomp et al, so enable that */ 24 # ifndef PERL_IN_XSUB_RE 25 # define PERL_IN_XSUB_RE 26 # endif 27 /* need access to debugger hooks */ 28 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING) 29 # define DEBUGGING 30 # endif 31 #endif 32 33 #ifdef PERL_IN_XSUB_RE 34 /* We *really* need to overwrite these symbols: */ 35 # define Perl_pregcomp my_regcomp 36 # define Perl_regdump my_regdump 37 # define Perl_regprop my_regprop 38 # define Perl_pregfree my_regfree 39 # define Perl_re_intuit_string my_re_intuit_string 40 /* *These* symbols are masked to allow static link. */ 41 # define Perl_regnext my_regnext 42 # define Perl_save_re_context my_save_re_context 43 # define Perl_reginitcolors my_reginitcolors 44 45 # define PERL_NO_GET_CONTEXT 46 #endif 47 48 /*SUPPRESS 112*/ 49 /* 50 * pregcomp and pregexec -- regsub and regerror are not used in perl 51 * 52 * Copyright (c) 1986 by University of Toronto. 53 * Written by Henry Spencer. Not derived from licensed software. 54 * 55 * Permission is granted to anyone to use this software for any 56 * purpose on any computer system, and to redistribute it freely, 57 * subject to the following restrictions: 58 * 59 * 1. The author is not responsible for the consequences of use of 60 * this software, no matter how awful, even if they arise 61 * from defects in it. 62 * 63 * 2. The origin of this software must not be misrepresented, either 64 * by explicit claim or by omission. 65 * 66 * 3. Altered versions must be plainly marked as such, and must not 67 * be misrepresented as being the original software. 68 * 69 * 70 **** Alterations to Henry's code are... 71 **** 72 **** Copyright (c) 1991-2002, Larry Wall 73 **** 74 **** You may distribute under the terms of either the GNU General Public 75 **** License or the Artistic License, as specified in the README file. 76 77 * 78 * Beware that some of this code is subtly aware of the way operator 79 * precedence is structured in regular expressions. Serious changes in 80 * regular-expression syntax might require a total rethink. 81 */ 82 #include "EXTERN.h" 83 #define PERL_IN_REGCOMP_C 84 #include "perl.h" 85 86 #ifndef PERL_IN_XSUB_RE 87 # include "INTERN.h" 88 #endif 89 90 #define REG_COMP_C 91 #include "regcomp.h" 92 93 #ifdef op 94 #undef op 95 #endif /* op */ 96 97 #ifdef MSDOS 98 # if defined(BUGGY_MSC6) 99 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */ 100 # pragma optimize("a",off) 101 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/ 102 # pragma optimize("w",on ) 103 # endif /* BUGGY_MSC6 */ 104 #endif /* MSDOS */ 105 106 #ifndef STATIC 107 #define STATIC static 108 #endif 109 110 typedef struct RExC_state_t { 111 U32 flags; /* are we folding, multilining? */ 112 char *precomp; /* uncompiled string. */ 113 regexp *rx; 114 char *start; /* Start of input for compile */ 115 char *end; /* End of input for compile */ 116 char *parse; /* Input-scan pointer. */ 117 I32 whilem_seen; /* number of WHILEM in this expr */ 118 regnode *emit_start; /* Start of emitted-code area */ 119 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */ 120 I32 naughty; /* How bad is this pattern? */ 121 I32 sawback; /* Did we see \1, ...? */ 122 U32 seen; 123 I32 size; /* Code size. */ 124 I32 npar; /* () count. */ 125 I32 extralen; 126 I32 seen_zerolen; 127 I32 seen_evals; 128 I32 utf8; 129 #if ADD_TO_REGEXEC 130 char *starttry; /* -Dr: where regtry was called. */ 131 #define RExC_starttry (pRExC_state->starttry) 132 #endif 133 } RExC_state_t; 134 135 #define RExC_flags (pRExC_state->flags) 136 #define RExC_precomp (pRExC_state->precomp) 137 #define RExC_rx (pRExC_state->rx) 138 #define RExC_start (pRExC_state->start) 139 #define RExC_end (pRExC_state->end) 140 #define RExC_parse (pRExC_state->parse) 141 #define RExC_whilem_seen (pRExC_state->whilem_seen) 142 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */ 143 #define RExC_emit (pRExC_state->emit) 144 #define RExC_emit_start (pRExC_state->emit_start) 145 #define RExC_naughty (pRExC_state->naughty) 146 #define RExC_sawback (pRExC_state->sawback) 147 #define RExC_seen (pRExC_state->seen) 148 #define RExC_size (pRExC_state->size) 149 #define RExC_npar (pRExC_state->npar) 150 #define RExC_extralen (pRExC_state->extralen) 151 #define RExC_seen_zerolen (pRExC_state->seen_zerolen) 152 #define RExC_seen_evals (pRExC_state->seen_evals) 153 #define RExC_utf8 (pRExC_state->utf8) 154 155 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?') 156 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \ 157 ((*s) == '{' && regcurly(s))) 158 159 #ifdef SPSTART 160 #undef SPSTART /* dratted cpp namespace... */ 161 #endif 162 /* 163 * Flags to be passed up and down. 164 */ 165 #define WORST 0 /* Worst case. */ 166 #define HASWIDTH 0x1 /* Known to match non-null strings. */ 167 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */ 168 #define SPSTART 0x4 /* Starts with * or +. */ 169 #define TRYAGAIN 0x8 /* Weeded out a declaration. */ 170 171 /* Length of a variant. */ 172 173 typedef struct scan_data_t { 174 I32 len_min; 175 I32 len_delta; 176 I32 pos_min; 177 I32 pos_delta; 178 SV *last_found; 179 I32 last_end; /* min value, <0 unless valid. */ 180 I32 last_start_min; 181 I32 last_start_max; 182 SV **longest; /* Either &l_fixed, or &l_float. */ 183 SV *longest_fixed; 184 I32 offset_fixed; 185 SV *longest_float; 186 I32 offset_float_min; 187 I32 offset_float_max; 188 I32 flags; 189 I32 whilem_c; 190 I32 *last_closep; 191 struct regnode_charclass_class *start_class; 192 } scan_data_t; 193 194 /* 195 * Forward declarations for pregcomp()'s friends. 196 */ 197 198 static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 199 0, 0, 0, 0, 0, 0}; 200 201 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL) 202 #define SF_BEFORE_SEOL 0x1 203 #define SF_BEFORE_MEOL 0x2 204 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL) 205 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL) 206 207 #ifdef NO_UNARY_PLUS 208 # define SF_FIX_SHIFT_EOL (0+2) 209 # define SF_FL_SHIFT_EOL (0+4) 210 #else 211 # define SF_FIX_SHIFT_EOL (+2) 212 # define SF_FL_SHIFT_EOL (+4) 213 #endif 214 215 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL) 216 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL) 217 218 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL) 219 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */ 220 #define SF_IS_INF 0x40 221 #define SF_HAS_PAR 0x80 222 #define SF_IN_PAR 0x100 223 #define SF_HAS_EVAL 0x200 224 #define SCF_DO_SUBSTR 0x400 225 #define SCF_DO_STCLASS_AND 0x0800 226 #define SCF_DO_STCLASS_OR 0x1000 227 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR) 228 #define SCF_WHILEM_VISITED_POS 0x2000 229 230 #define UTF (RExC_utf8 != 0) 231 #define LOC ((RExC_flags & PMf_LOCALE) != 0) 232 #define FOLD ((RExC_flags & PMf_FOLD) != 0) 233 234 #define OOB_UNICODE 12345678 235 #define OOB_NAMEDCLASS -1 236 237 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv)) 238 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b) 239 240 241 /* length of regex to show in messages that don't mark a position within */ 242 #define RegexLengthToShowInErrorMessages 127 243 244 /* 245 * If MARKER[12] are adjusted, be sure to adjust the constants at the top 246 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in 247 * op/pragma/warn/regcomp. 248 */ 249 #define MARKER1 "<-- HERE" /* marker as it appears in the description */ 250 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */ 251 252 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/" 253 254 /* 255 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given 256 * arg. Show regex, up to a maximum length. If it's too long, chop and add 257 * "...". 258 */ 259 #define FAIL(msg) \ 260 STMT_START { \ 261 char *ellipses = ""; \ 262 IV len = RExC_end - RExC_precomp; \ 263 \ 264 if (!SIZE_ONLY) \ 265 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \ 266 \ 267 if (len > RegexLengthToShowInErrorMessages) { \ 268 /* chop 10 shorter than the max, to ensure meaning of "..." */ \ 269 len = RegexLengthToShowInErrorMessages - 10; \ 270 ellipses = "..."; \ 271 } \ 272 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \ 273 msg, (int)len, RExC_precomp, ellipses); \ 274 } STMT_END 275 276 /* 277 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given 278 * args. Show regex, up to a maximum length. If it's too long, chop and add 279 * "...". 280 */ 281 #define FAIL2(pat,msg) \ 282 STMT_START { \ 283 char *ellipses = ""; \ 284 IV len = RExC_end - RExC_precomp; \ 285 \ 286 if (!SIZE_ONLY) \ 287 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \ 288 \ 289 if (len > RegexLengthToShowInErrorMessages) { \ 290 /* chop 10 shorter than the max, to ensure meaning of "..." */ \ 291 len = RegexLengthToShowInErrorMessages - 10; \ 292 ellipses = "..."; \ 293 } \ 294 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \ 295 msg, (int)len, RExC_precomp, ellipses); \ 296 } STMT_END 297 298 299 /* 300 * Simple_vFAIL -- like FAIL, but marks the current location in the scan 301 */ 302 #define Simple_vFAIL(m) \ 303 STMT_START { \ 304 IV offset = RExC_parse - RExC_precomp; \ 305 \ 306 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \ 307 m, (int)offset, RExC_precomp, RExC_precomp + offset); \ 308 } STMT_END 309 310 /* 311 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL() 312 */ 313 #define vFAIL(m) \ 314 STMT_START { \ 315 if (!SIZE_ONLY) \ 316 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \ 317 Simple_vFAIL(m); \ 318 } STMT_END 319 320 /* 321 * Like Simple_vFAIL(), but accepts two arguments. 322 */ 323 #define Simple_vFAIL2(m,a1) \ 324 STMT_START { \ 325 IV offset = RExC_parse - RExC_precomp; \ 326 \ 327 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \ 328 (int)offset, RExC_precomp, RExC_precomp + offset); \ 329 } STMT_END 330 331 /* 332 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2(). 333 */ 334 #define vFAIL2(m,a1) \ 335 STMT_START { \ 336 if (!SIZE_ONLY) \ 337 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \ 338 Simple_vFAIL2(m, a1); \ 339 } STMT_END 340 341 342 /* 343 * Like Simple_vFAIL(), but accepts three arguments. 344 */ 345 #define Simple_vFAIL3(m, a1, a2) \ 346 STMT_START { \ 347 IV offset = RExC_parse - RExC_precomp; \ 348 \ 349 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \ 350 (int)offset, RExC_precomp, RExC_precomp + offset); \ 351 } STMT_END 352 353 /* 354 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3(). 355 */ 356 #define vFAIL3(m,a1,a2) \ 357 STMT_START { \ 358 if (!SIZE_ONLY) \ 359 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \ 360 Simple_vFAIL3(m, a1, a2); \ 361 } STMT_END 362 363 /* 364 * Like Simple_vFAIL(), but accepts four arguments. 365 */ 366 #define Simple_vFAIL4(m, a1, a2, a3) \ 367 STMT_START { \ 368 IV offset = RExC_parse - RExC_precomp; \ 369 \ 370 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,\ 371 (int)offset, RExC_precomp, RExC_precomp + offset); \ 372 } STMT_END 373 374 /* 375 * Like Simple_vFAIL(), but accepts five arguments. 376 */ 377 #define Simple_vFAIL5(m, a1, a2, a3, a4) \ 378 STMT_START { \ 379 IV offset = RExC_parse - RExC_precomp; \ 380 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,\ 381 (int)offset, RExC_precomp, RExC_precomp + offset); \ 382 } STMT_END 383 384 385 #define vWARN(loc,m) \ 386 STMT_START { \ 387 IV offset = loc - RExC_precomp; \ 388 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,\ 389 m, (int)offset, RExC_precomp, RExC_precomp + offset); \ 390 } STMT_END \ 391 392 #define vWARNdep(loc,m) \ 393 STMT_START { \ 394 IV offset = loc - RExC_precomp; \ 395 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), "%s" REPORT_LOCATION,\ 396 m, (int)offset, RExC_precomp, RExC_precomp + offset); \ 397 } STMT_END \ 398 399 400 #define vWARN2(loc, m, a1) \ 401 STMT_START { \ 402 IV offset = loc - RExC_precomp; \ 403 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,\ 404 a1, \ 405 (int)offset, RExC_precomp, RExC_precomp + offset); \ 406 } STMT_END 407 408 #define vWARN3(loc, m, a1, a2) \ 409 STMT_START { \ 410 IV offset = loc - RExC_precomp; \ 411 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ 412 a1, a2, \ 413 (int)offset, RExC_precomp, RExC_precomp + offset); \ 414 } STMT_END 415 416 #define vWARN4(loc, m, a1, a2, a3) \ 417 STMT_START { \ 418 IV offset = loc - RExC_precomp; \ 419 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,\ 420 a1, a2, a3, \ 421 (int)offset, RExC_precomp, RExC_precomp + offset); \ 422 } STMT_END 423 424 /* used for the parse_flags section for (?c) -- japhy */ 425 #define vWARN5(loc, m, a1, a2, a3, a4) \ 426 STMT_START { \ 427 IV offset = loc - RExC_precomp; \ 428 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ 429 a1, a2, a3, a4, \ 430 (int)offset, RExC_precomp, RExC_precomp + offset); \ 431 } STMT_END 432 433 434 /* Allow for side effects in s */ 435 #define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (void)(s);} STMT_END 436 437 /* Macros for recording node offsets. 20001227 mjd@plover.com 438 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in 439 * element 2*n-1 of the array. Element #2n holds the byte length node #n. 440 * Element 0 holds the number n. 441 */ 442 443 #define MJD_OFFSET_DEBUG(x) 444 /* #define MJD_OFFSET_DEBUG(x) fprintf x */ 445 446 447 # define Set_Node_Offset_To_R(node,byte) \ 448 STMT_START { \ 449 if (! SIZE_ONLY) { \ 450 if((node) < 0) { \ 451 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \ 452 } else { \ 453 RExC_offsets[2*(node)-1] = (byte); \ 454 } \ 455 } \ 456 } STMT_END 457 458 # define Set_Node_Offset(node,byte) Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start) 459 # define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse) 460 461 # define Set_Node_Length_To_R(node,len) \ 462 STMT_START { \ 463 if (! SIZE_ONLY) { \ 464 MJD_OFFSET_DEBUG((stderr, "** (%d) size of node %d is %d.\n", __LINE__, (node), (len))); \ 465 if((node) < 0) { \ 466 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \ 467 } else { \ 468 RExC_offsets[2*(node)] = (len); \ 469 } \ 470 } \ 471 } STMT_END 472 473 # define Set_Node_Length(node,len) Set_Node_Length_To_R((node)-RExC_emit_start, len) 474 # define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len) 475 # define Set_Node_Cur_Length(node) Set_Node_Length(node, RExC_parse - parse_start) 476 477 /* Get offsets and lengths */ 478 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1]) 479 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)]) 480 481 static void clear_re(pTHX_ void *r); 482 483 /* Mark that we cannot extend a found fixed substring at this point. 484 Updata the longest found anchored substring and the longest found 485 floating substrings if needed. */ 486 487 STATIC void 488 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data) 489 { 490 STRLEN l = CHR_SVLEN(data->last_found); 491 STRLEN old_l = CHR_SVLEN(*data->longest); 492 493 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) { 494 sv_setsv(*data->longest, data->last_found); 495 if (*data->longest == data->longest_fixed) { 496 data->offset_fixed = l ? data->last_start_min : data->pos_min; 497 if (data->flags & SF_BEFORE_EOL) 498 data->flags 499 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL); 500 else 501 data->flags &= ~SF_FIX_BEFORE_EOL; 502 } 503 else { 504 data->offset_float_min = l ? data->last_start_min : data->pos_min; 505 data->offset_float_max = (l 506 ? data->last_start_max 507 : data->pos_min + data->pos_delta); 508 if ((U32)data->offset_float_max > (U32)I32_MAX) 509 data->offset_float_max = I32_MAX; 510 if (data->flags & SF_BEFORE_EOL) 511 data->flags 512 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL); 513 else 514 data->flags &= ~SF_FL_BEFORE_EOL; 515 } 516 } 517 SvCUR_set(data->last_found, 0); 518 data->last_end = -1; 519 data->flags &= ~SF_BEFORE_EOL; 520 } 521 522 /* Can match anything (initialization) */ 523 STATIC void 524 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) 525 { 526 ANYOF_CLASS_ZERO(cl); 527 ANYOF_BITMAP_SETALL(cl); 528 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL; 529 if (LOC) 530 cl->flags |= ANYOF_LOCALE; 531 } 532 533 /* Can match anything (initialization) */ 534 STATIC int 535 S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl) 536 { 537 int value; 538 539 for (value = 0; value <= ANYOF_MAX; value += 2) 540 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1)) 541 return 1; 542 if (!(cl->flags & ANYOF_UNICODE_ALL)) 543 return 0; 544 if (!ANYOF_BITMAP_TESTALLSET(cl)) 545 return 0; 546 return 1; 547 } 548 549 /* Can match anything (initialization) */ 550 STATIC void 551 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) 552 { 553 Zero(cl, 1, struct regnode_charclass_class); 554 cl->type = ANYOF; 555 cl_anything(pRExC_state, cl); 556 } 557 558 STATIC void 559 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) 560 { 561 Zero(cl, 1, struct regnode_charclass_class); 562 cl->type = ANYOF; 563 cl_anything(pRExC_state, cl); 564 if (LOC) 565 cl->flags |= ANYOF_LOCALE; 566 } 567 568 /* 'And' a given class with another one. Can create false positives */ 569 /* We assume that cl is not inverted */ 570 STATIC void 571 S_cl_and(pTHX_ struct regnode_charclass_class *cl, 572 struct regnode_charclass_class *and_with) 573 { 574 if (!(and_with->flags & ANYOF_CLASS) 575 && !(cl->flags & ANYOF_CLASS) 576 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) 577 && !(and_with->flags & ANYOF_FOLD) 578 && !(cl->flags & ANYOF_FOLD)) { 579 int i; 580 581 if (and_with->flags & ANYOF_INVERT) 582 for (i = 0; i < ANYOF_BITMAP_SIZE; i++) 583 cl->bitmap[i] &= ~and_with->bitmap[i]; 584 else 585 for (i = 0; i < ANYOF_BITMAP_SIZE; i++) 586 cl->bitmap[i] &= and_with->bitmap[i]; 587 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */ 588 if (!(and_with->flags & ANYOF_EOS)) 589 cl->flags &= ~ANYOF_EOS; 590 591 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) { 592 cl->flags &= ~ANYOF_UNICODE_ALL; 593 cl->flags |= ANYOF_UNICODE; 594 ARG_SET(cl, ARG(and_with)); 595 } 596 if (!(and_with->flags & ANYOF_UNICODE_ALL)) 597 cl->flags &= ~ANYOF_UNICODE_ALL; 598 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL))) 599 cl->flags &= ~ANYOF_UNICODE; 600 } 601 602 /* 'OR' a given class with another one. Can create false positives */ 603 /* We assume that cl is not inverted */ 604 STATIC void 605 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with) 606 { 607 if (or_with->flags & ANYOF_INVERT) { 608 /* We do not use 609 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2)) 610 * <= (B1 | !B2) | (CL1 | !CL2) 611 * which is wasteful if CL2 is small, but we ignore CL2: 612 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1 613 * XXXX Can we handle case-fold? Unclear: 614 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) = 615 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i')) 616 */ 617 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) 618 && !(or_with->flags & ANYOF_FOLD) 619 && !(cl->flags & ANYOF_FOLD) ) { 620 int i; 621 622 for (i = 0; i < ANYOF_BITMAP_SIZE; i++) 623 cl->bitmap[i] |= ~or_with->bitmap[i]; 624 } /* XXXX: logic is complicated otherwise */ 625 else { 626 cl_anything(pRExC_state, cl); 627 } 628 } else { 629 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */ 630 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) 631 && (!(or_with->flags & ANYOF_FOLD) 632 || (cl->flags & ANYOF_FOLD)) ) { 633 int i; 634 635 /* OR char bitmap and class bitmap separately */ 636 for (i = 0; i < ANYOF_BITMAP_SIZE; i++) 637 cl->bitmap[i] |= or_with->bitmap[i]; 638 if (or_with->flags & ANYOF_CLASS) { 639 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++) 640 cl->classflags[i] |= or_with->classflags[i]; 641 cl->flags |= ANYOF_CLASS; 642 } 643 } 644 else { /* XXXX: logic is complicated, leave it along for a moment. */ 645 cl_anything(pRExC_state, cl); 646 } 647 } 648 if (or_with->flags & ANYOF_EOS) 649 cl->flags |= ANYOF_EOS; 650 651 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE && 652 ARG(cl) != ARG(or_with)) { 653 cl->flags |= ANYOF_UNICODE_ALL; 654 cl->flags &= ~ANYOF_UNICODE; 655 } 656 if (or_with->flags & ANYOF_UNICODE_ALL) { 657 cl->flags |= ANYOF_UNICODE_ALL; 658 cl->flags &= ~ANYOF_UNICODE; 659 } 660 } 661 662 /* 663 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2. 664 * These need to be revisited when a newer toolchain becomes available. 665 */ 666 #if defined(__sparc64__) && defined(__GNUC__) 667 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96) 668 # undef SPARC64_GCC_WORKAROUND 669 # define SPARC64_GCC_WORKAROUND 1 670 # endif 671 #endif 672 673 /* REx optimizer. Converts nodes into quickier variants "in place". 674 Finds fixed substrings. */ 675 676 /* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set 677 to the position after last scanned or to NULL. */ 678 679 STATIC I32 680 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags) 681 /* scanp: Start here (read-write). */ 682 /* deltap: Write maxlen-minlen here. */ 683 /* last: Stop before this one. */ 684 { 685 I32 min = 0, pars = 0, code; 686 regnode *scan = *scanp, *next; 687 I32 delta = 0; 688 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF); 689 int is_inf_internal = 0; /* The studied chunk is infinite */ 690 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0; 691 scan_data_t data_fake; 692 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */ 693 694 while (scan && OP(scan) != END && scan < last) { 695 /* Peephole optimizer: */ 696 697 if (PL_regkind[(U8)OP(scan)] == EXACT) { 698 /* Merge several consecutive EXACTish nodes into one. */ 699 regnode *n = regnext(scan); 700 U32 stringok = 1; 701 #ifdef DEBUGGING 702 regnode *stop = scan; 703 #endif 704 705 next = scan + NODE_SZ_STR(scan); 706 /* Skip NOTHING, merge EXACT*. */ 707 while (n && 708 ( PL_regkind[(U8)OP(n)] == NOTHING || 709 (stringok && (OP(n) == OP(scan)))) 710 && NEXT_OFF(n) 711 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) { 712 if (OP(n) == TAIL || n > next) 713 stringok = 0; 714 if (PL_regkind[(U8)OP(n)] == NOTHING) { 715 NEXT_OFF(scan) += NEXT_OFF(n); 716 next = n + NODE_STEP_REGNODE; 717 #ifdef DEBUGGING 718 if (stringok) 719 stop = n; 720 #endif 721 n = regnext(n); 722 } 723 else if (stringok) { 724 int oldl = STR_LEN(scan); 725 regnode *nnext = regnext(n); 726 727 if (oldl + STR_LEN(n) > U8_MAX) 728 break; 729 NEXT_OFF(scan) += NEXT_OFF(n); 730 STR_LEN(scan) += STR_LEN(n); 731 next = n + NODE_SZ_STR(n); 732 /* Now we can overwrite *n : */ 733 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char); 734 #ifdef DEBUGGING 735 stop = next - 1; 736 #endif 737 n = nnext; 738 } 739 } 740 741 if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) { 742 /* 743 Two problematic code points in Unicode casefolding of EXACT nodes: 744 745 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS 746 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS 747 748 which casefold to 749 750 Unicode UTF-8 751 752 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81 753 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81 754 755 This means that in case-insensitive matching (or "loose matching", 756 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte 757 length of the above casefolded versions) can match a target string 758 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0). 759 This would rather mess up the minimum length computation. 760 761 What we'll do is to look for the tail four bytes, and then peek 762 at the preceding two bytes to see whether we need to decrease 763 the minimum length by four (six minus two). 764 765 Thanks to the design of UTF-8, there cannot be false matches: 766 A sequence of valid UTF-8 bytes cannot be a subsequence of 767 another valid sequence of UTF-8 bytes. 768 769 */ 770 char *s0 = STRING(scan), *s, *t; 771 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4; 772 char *t0 = "\xcc\x88\xcc\x81"; 773 char *t1 = t0 + 3; 774 775 for (s = s0 + 2; 776 s < s2 && (t = ninstr(s, s1, t0, t1)); 777 s = t + 4) { 778 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) || 779 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF)) 780 min -= 4; 781 } 782 } 783 784 #ifdef DEBUGGING 785 /* Allow dumping */ 786 n = scan + NODE_SZ_STR(scan); 787 while (n <= stop) { 788 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) { 789 OP(n) = OPTIMIZED; 790 NEXT_OFF(n) = 0; 791 } 792 n++; 793 } 794 #endif 795 } 796 /* Follow the next-chain of the current node and optimize 797 away all the NOTHINGs from it. */ 798 if (OP(scan) != CURLYX) { 799 int max = (reg_off_by_arg[OP(scan)] 800 ? I32_MAX 801 /* I32 may be smaller than U16 on CRAYs! */ 802 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX)); 803 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan)); 804 int noff; 805 regnode *n = scan; 806 807 /* Skip NOTHING and LONGJMP. */ 808 while ((n = regnext(n)) 809 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n))) 810 || ((OP(n) == LONGJMP) && (noff = ARG(n)))) 811 && off + noff < max) 812 off += noff; 813 if (reg_off_by_arg[OP(scan)]) 814 ARG(scan) = off; 815 else 816 NEXT_OFF(scan) = off; 817 } 818 /* The principal pseudo-switch. Cannot be a switch, since we 819 look into several different things. */ 820 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ 821 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) { 822 next = regnext(scan); 823 code = OP(scan); 824 825 if (OP(next) == code || code == IFTHEN || code == SUSPEND) { 826 I32 max1 = 0, min1 = I32_MAX, num = 0; 827 struct regnode_charclass_class accum; 828 829 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */ 830 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */ 831 if (flags & SCF_DO_STCLASS) 832 cl_init_zero(pRExC_state, &accum); 833 while (OP(scan) == code) { 834 I32 deltanext, minnext, f = 0, fake; 835 struct regnode_charclass_class this_class; 836 837 num++; 838 data_fake.flags = 0; 839 if (data) { 840 data_fake.whilem_c = data->whilem_c; 841 data_fake.last_closep = data->last_closep; 842 } 843 else 844 data_fake.last_closep = &fake; 845 next = regnext(scan); 846 scan = NEXTOPER(scan); 847 if (code != BRANCH) 848 scan = NEXTOPER(scan); 849 if (flags & SCF_DO_STCLASS) { 850 cl_init(pRExC_state, &this_class); 851 data_fake.start_class = &this_class; 852 f = SCF_DO_STCLASS_AND; 853 } 854 if (flags & SCF_WHILEM_VISITED_POS) 855 f |= SCF_WHILEM_VISITED_POS; 856 /* we suppose the run is continuous, last=next...*/ 857 minnext = study_chunk(pRExC_state, &scan, &deltanext, 858 next, &data_fake, f); 859 if (min1 > minnext) 860 min1 = minnext; 861 if (max1 < minnext + deltanext) 862 max1 = minnext + deltanext; 863 if (deltanext == I32_MAX) 864 is_inf = is_inf_internal = 1; 865 scan = next; 866 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) 867 pars++; 868 if (data && (data_fake.flags & SF_HAS_EVAL)) 869 data->flags |= SF_HAS_EVAL; 870 if (data) 871 data->whilem_c = data_fake.whilem_c; 872 if (flags & SCF_DO_STCLASS) 873 cl_or(pRExC_state, &accum, &this_class); 874 if (code == SUSPEND) 875 break; 876 } 877 if (code == IFTHEN && num < 2) /* Empty ELSE branch */ 878 min1 = 0; 879 if (flags & SCF_DO_SUBSTR) { 880 data->pos_min += min1; 881 data->pos_delta += max1 - min1; 882 if (max1 != min1 || is_inf) 883 data->longest = &(data->longest_float); 884 } 885 min += min1; 886 delta += max1 - min1; 887 if (flags & SCF_DO_STCLASS_OR) { 888 cl_or(pRExC_state, data->start_class, &accum); 889 if (min1) { 890 cl_and(data->start_class, &and_with); 891 flags &= ~SCF_DO_STCLASS; 892 } 893 } 894 else if (flags & SCF_DO_STCLASS_AND) { 895 if (min1) { 896 cl_and(data->start_class, &accum); 897 flags &= ~SCF_DO_STCLASS; 898 } 899 else { 900 /* Switch to OR mode: cache the old value of 901 * data->start_class */ 902 StructCopy(data->start_class, &and_with, 903 struct regnode_charclass_class); 904 flags &= ~SCF_DO_STCLASS_AND; 905 StructCopy(&accum, data->start_class, 906 struct regnode_charclass_class); 907 flags |= SCF_DO_STCLASS_OR; 908 data->start_class->flags |= ANYOF_EOS; 909 } 910 } 911 } 912 else if (code == BRANCHJ) /* single branch is optimized. */ 913 scan = NEXTOPER(NEXTOPER(scan)); 914 else /* single branch is optimized. */ 915 scan = NEXTOPER(scan); 916 continue; 917 } 918 else if (OP(scan) == EXACT) { 919 I32 l = STR_LEN(scan); 920 UV uc = *((U8*)STRING(scan)); 921 if (UTF) { 922 U8 *s = (U8*)STRING(scan); 923 l = utf8_length(s, s + l); 924 uc = utf8_to_uvchr(s, NULL); 925 } 926 min += l; 927 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */ 928 /* The code below prefers earlier match for fixed 929 offset, later match for variable offset. */ 930 if (data->last_end == -1) { /* Update the start info. */ 931 data->last_start_min = data->pos_min; 932 data->last_start_max = is_inf 933 ? I32_MAX : data->pos_min + data->pos_delta; 934 } 935 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan)); 936 if (UTF) 937 SvUTF8_on(data->last_found); 938 data->last_end = data->pos_min + l; 939 data->pos_min += l; /* As in the first entry. */ 940 data->flags &= ~SF_BEFORE_EOL; 941 } 942 if (flags & SCF_DO_STCLASS_AND) { 943 /* Check whether it is compatible with what we know already! */ 944 int compat = 1; 945 946 if (uc >= 0x100 || 947 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE)) 948 && !ANYOF_BITMAP_TEST(data->start_class, uc) 949 && (!(data->start_class->flags & ANYOF_FOLD) 950 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc]))) 951 ) 952 compat = 0; 953 ANYOF_CLASS_ZERO(data->start_class); 954 ANYOF_BITMAP_ZERO(data->start_class); 955 if (compat) 956 ANYOF_BITMAP_SET(data->start_class, uc); 957 data->start_class->flags &= ~ANYOF_EOS; 958 if (uc < 0x100) 959 data->start_class->flags &= ~ANYOF_UNICODE_ALL; 960 } 961 else if (flags & SCF_DO_STCLASS_OR) { 962 /* false positive possible if the class is case-folded */ 963 if (uc < 0x100) 964 ANYOF_BITMAP_SET(data->start_class, uc); 965 else 966 data->start_class->flags |= ANYOF_UNICODE_ALL; 967 data->start_class->flags &= ~ANYOF_EOS; 968 cl_and(data->start_class, &and_with); 969 } 970 flags &= ~SCF_DO_STCLASS; 971 } 972 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */ 973 I32 l = STR_LEN(scan); 974 UV uc = *((U8*)STRING(scan)); 975 976 /* Search for fixed substrings supports EXACT only. */ 977 if (flags & SCF_DO_SUBSTR) 978 scan_commit(pRExC_state, data); 979 if (UTF) { 980 U8 *s = (U8 *)STRING(scan); 981 l = utf8_length(s, s + l); 982 uc = utf8_to_uvchr(s, NULL); 983 } 984 min += l; 985 if (data && (flags & SCF_DO_SUBSTR)) 986 data->pos_min += l; 987 if (flags & SCF_DO_STCLASS_AND) { 988 /* Check whether it is compatible with what we know already! */ 989 int compat = 1; 990 991 if (uc >= 0x100 || 992 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE)) 993 && !ANYOF_BITMAP_TEST(data->start_class, uc) 994 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc]))) 995 compat = 0; 996 ANYOF_CLASS_ZERO(data->start_class); 997 ANYOF_BITMAP_ZERO(data->start_class); 998 if (compat) { 999 ANYOF_BITMAP_SET(data->start_class, uc); 1000 data->start_class->flags &= ~ANYOF_EOS; 1001 data->start_class->flags |= ANYOF_FOLD; 1002 if (OP(scan) == EXACTFL) 1003 data->start_class->flags |= ANYOF_LOCALE; 1004 } 1005 } 1006 else if (flags & SCF_DO_STCLASS_OR) { 1007 if (data->start_class->flags & ANYOF_FOLD) { 1008 /* false positive possible if the class is case-folded. 1009 Assume that the locale settings are the same... */ 1010 if (uc < 0x100) 1011 ANYOF_BITMAP_SET(data->start_class, uc); 1012 data->start_class->flags &= ~ANYOF_EOS; 1013 } 1014 cl_and(data->start_class, &and_with); 1015 } 1016 flags &= ~SCF_DO_STCLASS; 1017 } 1018 else if (strchr((char*)PL_varies,OP(scan))) { 1019 I32 mincount, maxcount, minnext, deltanext, fl = 0; 1020 I32 f = flags, pos_before = 0; 1021 regnode *oscan = scan; 1022 struct regnode_charclass_class this_class; 1023 struct regnode_charclass_class *oclass = NULL; 1024 I32 next_is_eval = 0; 1025 1026 switch (PL_regkind[(U8)OP(scan)]) { 1027 case WHILEM: /* End of (?:...)* . */ 1028 scan = NEXTOPER(scan); 1029 goto finish; 1030 case PLUS: 1031 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) { 1032 next = NEXTOPER(scan); 1033 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) { 1034 mincount = 1; 1035 maxcount = REG_INFTY; 1036 next = regnext(scan); 1037 scan = NEXTOPER(scan); 1038 goto do_curly; 1039 } 1040 } 1041 if (flags & SCF_DO_SUBSTR) 1042 data->pos_min++; 1043 min++; 1044 /* Fall through. */ 1045 case STAR: 1046 if (flags & SCF_DO_STCLASS) { 1047 mincount = 0; 1048 maxcount = REG_INFTY; 1049 next = regnext(scan); 1050 scan = NEXTOPER(scan); 1051 goto do_curly; 1052 } 1053 is_inf = is_inf_internal = 1; 1054 scan = regnext(scan); 1055 if (flags & SCF_DO_SUBSTR) { 1056 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */ 1057 data->longest = &(data->longest_float); 1058 } 1059 goto optimize_curly_tail; 1060 case CURLY: 1061 mincount = ARG1(scan); 1062 maxcount = ARG2(scan); 1063 next = regnext(scan); 1064 if (OP(scan) == CURLYX) { 1065 I32 lp = (data ? *(data->last_closep) : 0); 1066 1067 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX); 1068 } 1069 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS; 1070 next_is_eval = (OP(scan) == EVAL); 1071 do_curly: 1072 if (flags & SCF_DO_SUBSTR) { 1073 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */ 1074 pos_before = data->pos_min; 1075 } 1076 if (data) { 1077 fl = data->flags; 1078 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL); 1079 if (is_inf) 1080 data->flags |= SF_IS_INF; 1081 } 1082 if (flags & SCF_DO_STCLASS) { 1083 cl_init(pRExC_state, &this_class); 1084 oclass = data->start_class; 1085 data->start_class = &this_class; 1086 f |= SCF_DO_STCLASS_AND; 1087 f &= ~SCF_DO_STCLASS_OR; 1088 } 1089 /* These are the cases when once a subexpression 1090 fails at a particular position, it cannot succeed 1091 even after backtracking at the enclosing scope. 1092 1093 XXXX what if minimal match and we are at the 1094 initial run of {n,m}? */ 1095 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY)) 1096 f &= ~SCF_WHILEM_VISITED_POS; 1097 1098 /* This will finish on WHILEM, setting scan, or on NULL: */ 1099 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data, 1100 mincount == 0 1101 ? (f & ~SCF_DO_SUBSTR) : f); 1102 1103 if (flags & SCF_DO_STCLASS) 1104 data->start_class = oclass; 1105 if (mincount == 0 || minnext == 0) { 1106 if (flags & SCF_DO_STCLASS_OR) { 1107 cl_or(pRExC_state, data->start_class, &this_class); 1108 } 1109 else if (flags & SCF_DO_STCLASS_AND) { 1110 /* Switch to OR mode: cache the old value of 1111 * data->start_class */ 1112 StructCopy(data->start_class, &and_with, 1113 struct regnode_charclass_class); 1114 flags &= ~SCF_DO_STCLASS_AND; 1115 StructCopy(&this_class, data->start_class, 1116 struct regnode_charclass_class); 1117 flags |= SCF_DO_STCLASS_OR; 1118 data->start_class->flags |= ANYOF_EOS; 1119 } 1120 } else { /* Non-zero len */ 1121 if (flags & SCF_DO_STCLASS_OR) { 1122 cl_or(pRExC_state, data->start_class, &this_class); 1123 cl_and(data->start_class, &and_with); 1124 } 1125 else if (flags & SCF_DO_STCLASS_AND) 1126 cl_and(data->start_class, &this_class); 1127 flags &= ~SCF_DO_STCLASS; 1128 } 1129 if (!scan) /* It was not CURLYX, but CURLY. */ 1130 scan = next; 1131 if (ckWARN(WARN_REGEXP) 1132 /* ? quantifier ok, except for (?{ ... }) */ 1133 && (next_is_eval || !(mincount == 0 && maxcount == 1)) 1134 && (minnext == 0) && (deltanext == 0) 1135 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR)) 1136 && maxcount <= REG_INFTY/3) /* Complement check for big count */ 1137 { 1138 vWARN(RExC_parse, 1139 "Quantifier unexpected on zero-length expression"); 1140 } 1141 1142 min += minnext * mincount; 1143 is_inf_internal |= ((maxcount == REG_INFTY 1144 && (minnext + deltanext) > 0) 1145 || deltanext == I32_MAX); 1146 is_inf |= is_inf_internal; 1147 delta += (minnext + deltanext) * maxcount - minnext * mincount; 1148 1149 /* Try powerful optimization CURLYX => CURLYN. */ 1150 if ( OP(oscan) == CURLYX && data 1151 && data->flags & SF_IN_PAR 1152 && !(data->flags & SF_HAS_EVAL) 1153 && !deltanext && minnext == 1 ) { 1154 /* Try to optimize to CURLYN. */ 1155 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; 1156 regnode *nxt1 = nxt; 1157 #ifdef DEBUGGING 1158 regnode *nxt2; 1159 #endif 1160 1161 /* Skip open. */ 1162 nxt = regnext(nxt); 1163 if (!strchr((char*)PL_simple,OP(nxt)) 1164 && !(PL_regkind[(U8)OP(nxt)] == EXACT 1165 && STR_LEN(nxt) == 1)) 1166 goto nogo; 1167 #ifdef DEBUGGING 1168 nxt2 = nxt; 1169 #endif 1170 nxt = regnext(nxt); 1171 if (OP(nxt) != CLOSE) 1172 goto nogo; 1173 /* Now we know that nxt2 is the only contents: */ 1174 oscan->flags = (U8)ARG(nxt); 1175 OP(oscan) = CURLYN; 1176 OP(nxt1) = NOTHING; /* was OPEN. */ 1177 #ifdef DEBUGGING 1178 OP(nxt1 + 1) = OPTIMIZED; /* was count. */ 1179 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */ 1180 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */ 1181 OP(nxt) = OPTIMIZED; /* was CLOSE. */ 1182 OP(nxt + 1) = OPTIMIZED; /* was count. */ 1183 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */ 1184 #endif 1185 } 1186 nogo: 1187 1188 /* Try optimization CURLYX => CURLYM. */ 1189 if ( OP(oscan) == CURLYX && data 1190 && !(data->flags & SF_HAS_PAR) 1191 && !(data->flags & SF_HAS_EVAL) 1192 && !deltanext ) { 1193 /* XXXX How to optimize if data == 0? */ 1194 /* Optimize to a simpler form. */ 1195 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */ 1196 regnode *nxt2; 1197 1198 OP(oscan) = CURLYM; 1199 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/ 1200 && (OP(nxt2) != WHILEM)) 1201 nxt = nxt2; 1202 OP(nxt2) = SUCCEED; /* Whas WHILEM */ 1203 /* Need to optimize away parenths. */ 1204 if (data->flags & SF_IN_PAR) { 1205 /* Set the parenth number. */ 1206 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/ 1207 1208 if (OP(nxt) != CLOSE) 1209 FAIL("Panic opt close"); 1210 oscan->flags = (U8)ARG(nxt); 1211 OP(nxt1) = OPTIMIZED; /* was OPEN. */ 1212 OP(nxt) = OPTIMIZED; /* was CLOSE. */ 1213 #ifdef DEBUGGING 1214 OP(nxt1 + 1) = OPTIMIZED; /* was count. */ 1215 OP(nxt + 1) = OPTIMIZED; /* was count. */ 1216 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */ 1217 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */ 1218 #endif 1219 #if 0 1220 while ( nxt1 && (OP(nxt1) != WHILEM)) { 1221 regnode *nnxt = regnext(nxt1); 1222 1223 if (nnxt == nxt) { 1224 if (reg_off_by_arg[OP(nxt1)]) 1225 ARG_SET(nxt1, nxt2 - nxt1); 1226 else if (nxt2 - nxt1 < U16_MAX) 1227 NEXT_OFF(nxt1) = nxt2 - nxt1; 1228 else 1229 OP(nxt) = NOTHING; /* Cannot beautify */ 1230 } 1231 nxt1 = nnxt; 1232 } 1233 #endif 1234 /* Optimize again: */ 1235 study_chunk(pRExC_state, &nxt1, &deltanext, nxt, 1236 NULL, 0); 1237 } 1238 else 1239 oscan->flags = 0; 1240 } 1241 else if ((OP(oscan) == CURLYX) 1242 && (flags & SCF_WHILEM_VISITED_POS) 1243 /* See the comment on a similar expression above. 1244 However, this time it not a subexpression 1245 we care about, but the expression itself. */ 1246 && (maxcount == REG_INFTY) 1247 && data && ++data->whilem_c < 16) { 1248 /* This stays as CURLYX, we can put the count/of pair. */ 1249 /* Find WHILEM (as in regexec.c) */ 1250 regnode *nxt = oscan + NEXT_OFF(oscan); 1251 1252 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */ 1253 nxt += ARG(nxt); 1254 PREVOPER(nxt)->flags = (U8)(data->whilem_c 1255 | (RExC_whilem_seen << 4)); /* On WHILEM */ 1256 } 1257 if (data && fl & (SF_HAS_PAR|SF_IN_PAR)) 1258 pars++; 1259 if (flags & SCF_DO_SUBSTR) { 1260 SV *last_str = Nullsv; 1261 int counted = mincount != 0; 1262 1263 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */ 1264 #if defined(SPARC64_GCC_WORKAROUND) 1265 I32 b = 0; 1266 STRLEN l = 0; 1267 char *s = NULL; 1268 I32 old = 0; 1269 1270 if (pos_before >= data->last_start_min) 1271 b = pos_before; 1272 else 1273 b = data->last_start_min; 1274 1275 l = 0; 1276 s = SvPV(data->last_found, l); 1277 old = b - data->last_start_min; 1278 1279 #else 1280 I32 b = pos_before >= data->last_start_min 1281 ? pos_before : data->last_start_min; 1282 STRLEN l; 1283 char *s = SvPV(data->last_found, l); 1284 I32 old = b - data->last_start_min; 1285 #endif 1286 1287 if (UTF) 1288 old = utf8_hop((U8*)s, old) - (U8*)s; 1289 1290 l -= old; 1291 /* Get the added string: */ 1292 last_str = newSVpvn(s + old, l); 1293 if (deltanext == 0 && pos_before == b) { 1294 /* What was added is a constant string */ 1295 if (mincount > 1) { 1296 SvGROW(last_str, (mincount * l) + 1); 1297 repeatcpy(SvPVX(last_str) + l, 1298 SvPVX(last_str), l, mincount - 1); 1299 SvCUR(last_str) *= mincount; 1300 /* Add additional parts. */ 1301 SvCUR_set(data->last_found, 1302 SvCUR(data->last_found) - l); 1303 sv_catsv(data->last_found, last_str); 1304 data->last_end += l * (mincount - 1); 1305 } 1306 } else { 1307 /* start offset must point into the last copy */ 1308 data->last_start_min += minnext * (mincount - 1); 1309 data->last_start_max += is_inf ? 0 : (maxcount - 1) 1310 * (minnext + data->pos_delta); 1311 } 1312 } 1313 /* It is counted once already... */ 1314 data->pos_min += minnext * (mincount - counted); 1315 data->pos_delta += - counted * deltanext + 1316 (minnext + deltanext) * maxcount - minnext * mincount; 1317 if (mincount != maxcount) { 1318 /* Cannot extend fixed substrings found inside 1319 the group. */ 1320 scan_commit(pRExC_state,data); 1321 if (mincount && last_str) { 1322 sv_setsv(data->last_found, last_str); 1323 data->last_end = data->pos_min; 1324 data->last_start_min = 1325 data->pos_min - CHR_SVLEN(last_str); 1326 data->last_start_max = is_inf 1327 ? I32_MAX 1328 : data->pos_min + data->pos_delta 1329 - CHR_SVLEN(last_str); 1330 } 1331 data->longest = &(data->longest_float); 1332 } 1333 SvREFCNT_dec(last_str); 1334 } 1335 if (data && (fl & SF_HAS_EVAL)) 1336 data->flags |= SF_HAS_EVAL; 1337 optimize_curly_tail: 1338 if (OP(oscan) != CURLYX) { 1339 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING 1340 && NEXT_OFF(next)) 1341 NEXT_OFF(oscan) += NEXT_OFF(next); 1342 } 1343 continue; 1344 default: /* REF and CLUMP only? */ 1345 if (flags & SCF_DO_SUBSTR) { 1346 scan_commit(pRExC_state,data); /* Cannot expect anything... */ 1347 data->longest = &(data->longest_float); 1348 } 1349 is_inf = is_inf_internal = 1; 1350 if (flags & SCF_DO_STCLASS_OR) 1351 cl_anything(pRExC_state, data->start_class); 1352 flags &= ~SCF_DO_STCLASS; 1353 break; 1354 } 1355 } 1356 else if (strchr((char*)PL_simple,OP(scan))) { 1357 int value = 0; 1358 1359 if (flags & SCF_DO_SUBSTR) { 1360 scan_commit(pRExC_state,data); 1361 data->pos_min++; 1362 } 1363 min++; 1364 if (flags & SCF_DO_STCLASS) { 1365 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */ 1366 1367 /* Some of the logic below assumes that switching 1368 locale on will only add false positives. */ 1369 switch (PL_regkind[(U8)OP(scan)]) { 1370 case SANY: 1371 default: 1372 do_default: 1373 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */ 1374 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ 1375 cl_anything(pRExC_state, data->start_class); 1376 break; 1377 case REG_ANY: 1378 if (OP(scan) == SANY) 1379 goto do_default; 1380 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */ 1381 value = (ANYOF_BITMAP_TEST(data->start_class,'\n') 1382 || (data->start_class->flags & ANYOF_CLASS)); 1383 cl_anything(pRExC_state, data->start_class); 1384 } 1385 if (flags & SCF_DO_STCLASS_AND || !value) 1386 ANYOF_BITMAP_CLEAR(data->start_class,'\n'); 1387 break; 1388 case ANYOF: 1389 if (flags & SCF_DO_STCLASS_AND) 1390 cl_and(data->start_class, 1391 (struct regnode_charclass_class*)scan); 1392 else 1393 cl_or(pRExC_state, data->start_class, 1394 (struct regnode_charclass_class*)scan); 1395 break; 1396 case ALNUM: 1397 if (flags & SCF_DO_STCLASS_AND) { 1398 if (!(data->start_class->flags & ANYOF_LOCALE)) { 1399 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM); 1400 for (value = 0; value < 256; value++) 1401 if (!isALNUM(value)) 1402 ANYOF_BITMAP_CLEAR(data->start_class, value); 1403 } 1404 } 1405 else { 1406 if (data->start_class->flags & ANYOF_LOCALE) 1407 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM); 1408 else { 1409 for (value = 0; value < 256; value++) 1410 if (isALNUM(value)) 1411 ANYOF_BITMAP_SET(data->start_class, value); 1412 } 1413 } 1414 break; 1415 case ALNUML: 1416 if (flags & SCF_DO_STCLASS_AND) { 1417 if (data->start_class->flags & ANYOF_LOCALE) 1418 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM); 1419 } 1420 else { 1421 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM); 1422 data->start_class->flags |= ANYOF_LOCALE; 1423 } 1424 break; 1425 case NALNUM: 1426 if (flags & SCF_DO_STCLASS_AND) { 1427 if (!(data->start_class->flags & ANYOF_LOCALE)) { 1428 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM); 1429 for (value = 0; value < 256; value++) 1430 if (isALNUM(value)) 1431 ANYOF_BITMAP_CLEAR(data->start_class, value); 1432 } 1433 } 1434 else { 1435 if (data->start_class->flags & ANYOF_LOCALE) 1436 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM); 1437 else { 1438 for (value = 0; value < 256; value++) 1439 if (!isALNUM(value)) 1440 ANYOF_BITMAP_SET(data->start_class, value); 1441 } 1442 } 1443 break; 1444 case NALNUML: 1445 if (flags & SCF_DO_STCLASS_AND) { 1446 if (data->start_class->flags & ANYOF_LOCALE) 1447 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM); 1448 } 1449 else { 1450 data->start_class->flags |= ANYOF_LOCALE; 1451 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM); 1452 } 1453 break; 1454 case SPACE: 1455 if (flags & SCF_DO_STCLASS_AND) { 1456 if (!(data->start_class->flags & ANYOF_LOCALE)) { 1457 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE); 1458 for (value = 0; value < 256; value++) 1459 if (!isSPACE(value)) 1460 ANYOF_BITMAP_CLEAR(data->start_class, value); 1461 } 1462 } 1463 else { 1464 if (data->start_class->flags & ANYOF_LOCALE) 1465 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE); 1466 else { 1467 for (value = 0; value < 256; value++) 1468 if (isSPACE(value)) 1469 ANYOF_BITMAP_SET(data->start_class, value); 1470 } 1471 } 1472 break; 1473 case SPACEL: 1474 if (flags & SCF_DO_STCLASS_AND) { 1475 if (data->start_class->flags & ANYOF_LOCALE) 1476 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE); 1477 } 1478 else { 1479 data->start_class->flags |= ANYOF_LOCALE; 1480 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE); 1481 } 1482 break; 1483 case NSPACE: 1484 if (flags & SCF_DO_STCLASS_AND) { 1485 if (!(data->start_class->flags & ANYOF_LOCALE)) { 1486 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE); 1487 for (value = 0; value < 256; value++) 1488 if (isSPACE(value)) 1489 ANYOF_BITMAP_CLEAR(data->start_class, value); 1490 } 1491 } 1492 else { 1493 if (data->start_class->flags & ANYOF_LOCALE) 1494 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE); 1495 else { 1496 for (value = 0; value < 256; value++) 1497 if (!isSPACE(value)) 1498 ANYOF_BITMAP_SET(data->start_class, value); 1499 } 1500 } 1501 break; 1502 case NSPACEL: 1503 if (flags & SCF_DO_STCLASS_AND) { 1504 if (data->start_class->flags & ANYOF_LOCALE) { 1505 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE); 1506 for (value = 0; value < 256; value++) 1507 if (!isSPACE(value)) 1508 ANYOF_BITMAP_CLEAR(data->start_class, value); 1509 } 1510 } 1511 else { 1512 data->start_class->flags |= ANYOF_LOCALE; 1513 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE); 1514 } 1515 break; 1516 case DIGIT: 1517 if (flags & SCF_DO_STCLASS_AND) { 1518 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT); 1519 for (value = 0; value < 256; value++) 1520 if (!isDIGIT(value)) 1521 ANYOF_BITMAP_CLEAR(data->start_class, value); 1522 } 1523 else { 1524 if (data->start_class->flags & ANYOF_LOCALE) 1525 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT); 1526 else { 1527 for (value = 0; value < 256; value++) 1528 if (isDIGIT(value)) 1529 ANYOF_BITMAP_SET(data->start_class, value); 1530 } 1531 } 1532 break; 1533 case NDIGIT: 1534 if (flags & SCF_DO_STCLASS_AND) { 1535 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT); 1536 for (value = 0; value < 256; value++) 1537 if (isDIGIT(value)) 1538 ANYOF_BITMAP_CLEAR(data->start_class, value); 1539 } 1540 else { 1541 if (data->start_class->flags & ANYOF_LOCALE) 1542 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT); 1543 else { 1544 for (value = 0; value < 256; value++) 1545 if (!isDIGIT(value)) 1546 ANYOF_BITMAP_SET(data->start_class, value); 1547 } 1548 } 1549 break; 1550 } 1551 if (flags & SCF_DO_STCLASS_OR) 1552 cl_and(data->start_class, &and_with); 1553 flags &= ~SCF_DO_STCLASS; 1554 } 1555 } 1556 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) { 1557 data->flags |= (OP(scan) == MEOL 1558 ? SF_BEFORE_MEOL 1559 : SF_BEFORE_SEOL); 1560 } 1561 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ 1562 /* Lookbehind, or need to calculate parens/evals/stclass: */ 1563 && (scan->flags || data || (flags & SCF_DO_STCLASS)) 1564 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) { 1565 /* Lookahead/lookbehind */ 1566 I32 deltanext, minnext, fake = 0; 1567 regnode *nscan; 1568 struct regnode_charclass_class intrnl; 1569 int f = 0; 1570 1571 data_fake.flags = 0; 1572 if (data) { 1573 data_fake.whilem_c = data->whilem_c; 1574 data_fake.last_closep = data->last_closep; 1575 } 1576 else 1577 data_fake.last_closep = &fake; 1578 if ( flags & SCF_DO_STCLASS && !scan->flags 1579 && OP(scan) == IFMATCH ) { /* Lookahead */ 1580 cl_init(pRExC_state, &intrnl); 1581 data_fake.start_class = &intrnl; 1582 f |= SCF_DO_STCLASS_AND; 1583 } 1584 if (flags & SCF_WHILEM_VISITED_POS) 1585 f |= SCF_WHILEM_VISITED_POS; 1586 next = regnext(scan); 1587 nscan = NEXTOPER(NEXTOPER(scan)); 1588 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f); 1589 if (scan->flags) { 1590 if (deltanext) { 1591 vFAIL("Variable length lookbehind not implemented"); 1592 } 1593 else if (minnext > U8_MAX) { 1594 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX); 1595 } 1596 scan->flags = (U8)minnext; 1597 } 1598 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) 1599 pars++; 1600 if (data && (data_fake.flags & SF_HAS_EVAL)) 1601 data->flags |= SF_HAS_EVAL; 1602 if (data) 1603 data->whilem_c = data_fake.whilem_c; 1604 if (f & SCF_DO_STCLASS_AND) { 1605 int was = (data->start_class->flags & ANYOF_EOS); 1606 1607 cl_and(data->start_class, &intrnl); 1608 if (was) 1609 data->start_class->flags |= ANYOF_EOS; 1610 } 1611 } 1612 else if (OP(scan) == OPEN) { 1613 pars++; 1614 } 1615 else if (OP(scan) == CLOSE) { 1616 if ((I32)ARG(scan) == is_par) { 1617 next = regnext(scan); 1618 1619 if ( next && (OP(next) != WHILEM) && next < last) 1620 is_par = 0; /* Disable optimization */ 1621 } 1622 if (data) 1623 *(data->last_closep) = ARG(scan); 1624 } 1625 else if (OP(scan) == EVAL) { 1626 if (data) 1627 data->flags |= SF_HAS_EVAL; 1628 } 1629 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */ 1630 if (flags & SCF_DO_SUBSTR) { 1631 scan_commit(pRExC_state,data); 1632 data->longest = &(data->longest_float); 1633 } 1634 is_inf = is_inf_internal = 1; 1635 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ 1636 cl_anything(pRExC_state, data->start_class); 1637 flags &= ~SCF_DO_STCLASS; 1638 } 1639 /* Else: zero-length, ignore. */ 1640 scan = regnext(scan); 1641 } 1642 1643 finish: 1644 *scanp = scan; 1645 *deltap = is_inf_internal ? I32_MAX : delta; 1646 if (flags & SCF_DO_SUBSTR && is_inf) 1647 data->pos_delta = I32_MAX - data->pos_min; 1648 if (is_par > U8_MAX) 1649 is_par = 0; 1650 if (is_par && pars==1 && data) { 1651 data->flags |= SF_IN_PAR; 1652 data->flags &= ~SF_HAS_PAR; 1653 } 1654 else if (pars && data) { 1655 data->flags |= SF_HAS_PAR; 1656 data->flags &= ~SF_IN_PAR; 1657 } 1658 if (flags & SCF_DO_STCLASS_OR) 1659 cl_and(data->start_class, &and_with); 1660 return min; 1661 } 1662 1663 STATIC I32 1664 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s) 1665 { 1666 if (RExC_rx->data) { 1667 Renewc(RExC_rx->data, 1668 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1), 1669 char, struct reg_data); 1670 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8); 1671 RExC_rx->data->count += n; 1672 } 1673 else { 1674 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1), 1675 char, struct reg_data); 1676 New(1208, RExC_rx->data->what, n, U8); 1677 RExC_rx->data->count = n; 1678 } 1679 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8); 1680 return RExC_rx->data->count - n; 1681 } 1682 1683 void 1684 Perl_reginitcolors(pTHX) 1685 { 1686 int i = 0; 1687 char *s = PerlEnv_getenv("PERL_RE_COLORS"); 1688 1689 if (s) { 1690 PL_colors[0] = s = savepv(s); 1691 while (++i < 6) { 1692 s = strchr(s, '\t'); 1693 if (s) { 1694 *s = '\0'; 1695 PL_colors[i] = ++s; 1696 } 1697 else 1698 PL_colors[i] = s = ""; 1699 } 1700 } else { 1701 while (i < 6) 1702 PL_colors[i++] = ""; 1703 } 1704 PL_colorset = 1; 1705 } 1706 1707 1708 /* 1709 - pregcomp - compile a regular expression into internal code 1710 * 1711 * We can't allocate space until we know how big the compiled form will be, 1712 * but we can't compile it (and thus know how big it is) until we've got a 1713 * place to put the code. So we cheat: we compile it twice, once with code 1714 * generation turned off and size counting turned on, and once "for real". 1715 * This also means that we don't allocate space until we are sure that the 1716 * thing really will compile successfully, and we never have to move the 1717 * code and thus invalidate pointers into it. (Note that it has to be in 1718 * one piece because free() must be able to free it all.) [NB: not true in perl] 1719 * 1720 * Beware that the optimization-preparation code in here knows about some 1721 * of the structure of the compiled regexp. [I'll say.] 1722 */ 1723 regexp * 1724 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) 1725 { 1726 register regexp *r; 1727 regnode *scan; 1728 regnode *first; 1729 I32 flags; 1730 I32 minlen = 0; 1731 I32 sawplus = 0; 1732 I32 sawopen = 0; 1733 scan_data_t data; 1734 RExC_state_t RExC_state; 1735 RExC_state_t *pRExC_state = &RExC_state; 1736 1737 if (exp == NULL) 1738 FAIL("NULL regexp argument"); 1739 1740 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8; 1741 1742 RExC_precomp = exp; 1743 DEBUG_r({ 1744 if (!PL_colorset) reginitcolors(); 1745 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n", 1746 PL_colors[4],PL_colors[5],PL_colors[0], 1747 (int)(xend - exp), RExC_precomp, PL_colors[1]); 1748 }); 1749 RExC_flags = pm->op_pmflags; 1750 RExC_sawback = 0; 1751 1752 RExC_seen = 0; 1753 RExC_seen_zerolen = *exp == '^' ? -1 : 0; 1754 RExC_seen_evals = 0; 1755 RExC_extralen = 0; 1756 1757 /* First pass: determine size, legality. */ 1758 RExC_parse = exp; 1759 RExC_start = exp; 1760 RExC_end = xend; 1761 RExC_naughty = 0; 1762 RExC_npar = 1; 1763 RExC_size = 0L; 1764 RExC_emit = &PL_regdummy; 1765 RExC_whilem_seen = 0; 1766 #if 0 /* REGC() is (currently) a NOP at the first pass. 1767 * Clever compilers notice this and complain. --jhi */ 1768 REGC((U8)REG_MAGIC, (char*)RExC_emit); 1769 #endif 1770 if (reg(pRExC_state, 0, &flags) == NULL) { 1771 RExC_precomp = Nullch; 1772 return(NULL); 1773 } 1774 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size)); 1775 1776 /* Small enough for pointer-storage convention? 1777 If extralen==0, this means that we will not need long jumps. */ 1778 if (RExC_size >= 0x10000L && RExC_extralen) 1779 RExC_size += RExC_extralen; 1780 else 1781 RExC_extralen = 0; 1782 if (RExC_whilem_seen > 15) 1783 RExC_whilem_seen = 15; 1784 1785 /* Allocate space and initialize. */ 1786 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), 1787 char, regexp); 1788 if (r == NULL) 1789 FAIL("Regexp out of space"); 1790 1791 #ifdef DEBUGGING 1792 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */ 1793 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char); 1794 #endif 1795 r->refcnt = 1; 1796 r->prelen = xend - exp; 1797 r->precomp = savepvn(RExC_precomp, r->prelen); 1798 r->subbeg = NULL; 1799 r->reganch = pm->op_pmflags & PMf_COMPILETIME; 1800 r->nparens = RExC_npar - 1; /* set early to validate backrefs */ 1801 1802 r->substrs = 0; /* Useful during FAIL. */ 1803 r->startp = 0; /* Useful during FAIL. */ 1804 r->endp = 0; /* Useful during FAIL. */ 1805 1806 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */ 1807 if (r->offsets) { 1808 r->offsets[0] = RExC_size; 1809 } 1810 DEBUG_r(PerlIO_printf(Perl_debug_log, 1811 "%s %"UVuf" bytes for offset annotations.\n", 1812 r->offsets ? "Got" : "Couldn't get", 1813 (UV)((2*RExC_size+1) * sizeof(U32)))); 1814 1815 RExC_rx = r; 1816 1817 /* Second pass: emit code. */ 1818 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */ 1819 RExC_parse = exp; 1820 RExC_end = xend; 1821 RExC_naughty = 0; 1822 RExC_npar = 1; 1823 RExC_emit_start = r->program; 1824 RExC_emit = r->program; 1825 /* Store the count of eval-groups for security checks: */ 1826 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals); 1827 REGC((U8)REG_MAGIC, (char*) RExC_emit++); 1828 r->data = 0; 1829 if (reg(pRExC_state, 0, &flags) == NULL) 1830 return(NULL); 1831 1832 /* Dig out information for optimizations. */ 1833 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */ 1834 pm->op_pmflags = RExC_flags; 1835 if (UTF) 1836 r->reganch |= ROPT_UTF8; /* Unicode in it? */ 1837 r->regstclass = NULL; 1838 if (RExC_naughty >= 10) /* Probably an expensive pattern. */ 1839 r->reganch |= ROPT_NAUGHTY; 1840 scan = r->program + 1; /* First BRANCH. */ 1841 1842 /* XXXX To minimize changes to RE engine we always allocate 1843 3-units-long substrs field. */ 1844 Newz(1004, r->substrs, 1, struct reg_substr_data); 1845 1846 StructCopy(&zero_scan_data, &data, scan_data_t); 1847 /* XXXX Should not we check for something else? Usually it is OPEN1... */ 1848 if (OP(scan) != BRANCH) { /* Only one top-level choice. */ 1849 I32 fake; 1850 STRLEN longest_float_length, longest_fixed_length; 1851 struct regnode_charclass_class ch_class; 1852 int stclass_flag; 1853 I32 last_close = 0; 1854 1855 first = scan; 1856 /* Skip introductions and multiplicators >= 1. */ 1857 while ((OP(first) == OPEN && (sawopen = 1)) || 1858 /* An OR of *one* alternative - should not happen now. */ 1859 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) || 1860 (OP(first) == PLUS) || 1861 (OP(first) == MINMOD) || 1862 /* An {n,m} with n>0 */ 1863 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) { 1864 if (OP(first) == PLUS) 1865 sawplus = 1; 1866 else 1867 first += regarglen[(U8)OP(first)]; 1868 first = NEXTOPER(first); 1869 } 1870 1871 /* Starting-point info. */ 1872 again: 1873 if (PL_regkind[(U8)OP(first)] == EXACT) { 1874 if (OP(first) == EXACT) 1875 ; /* Empty, get anchored substr later. */ 1876 else if ((OP(first) == EXACTF || OP(first) == EXACTFL)) 1877 r->regstclass = first; 1878 } 1879 else if (strchr((char*)PL_simple,OP(first))) 1880 r->regstclass = first; 1881 else if (PL_regkind[(U8)OP(first)] == BOUND || 1882 PL_regkind[(U8)OP(first)] == NBOUND) 1883 r->regstclass = first; 1884 else if (PL_regkind[(U8)OP(first)] == BOL) { 1885 r->reganch |= (OP(first) == MBOL 1886 ? ROPT_ANCH_MBOL 1887 : (OP(first) == SBOL 1888 ? ROPT_ANCH_SBOL 1889 : ROPT_ANCH_BOL)); 1890 first = NEXTOPER(first); 1891 goto again; 1892 } 1893 else if (OP(first) == GPOS) { 1894 r->reganch |= ROPT_ANCH_GPOS; 1895 first = NEXTOPER(first); 1896 goto again; 1897 } 1898 else if (!sawopen && (OP(first) == STAR && 1899 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) && 1900 !(r->reganch & ROPT_ANCH) ) 1901 { 1902 /* turn .* into ^.* with an implied $*=1 */ 1903 int type = OP(NEXTOPER(first)); 1904 1905 if (type == REG_ANY) 1906 type = ROPT_ANCH_MBOL; 1907 else 1908 type = ROPT_ANCH_SBOL; 1909 1910 r->reganch |= type | ROPT_IMPLICIT; 1911 first = NEXTOPER(first); 1912 goto again; 1913 } 1914 if (sawplus && (!sawopen || !RExC_sawback) 1915 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */ 1916 /* x+ must match at the 1st pos of run of x's */ 1917 r->reganch |= ROPT_SKIP; 1918 1919 /* Scan is after the zeroth branch, first is atomic matcher. */ 1920 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n", 1921 (IV)(first - scan + 1))); 1922 /* 1923 * If there's something expensive in the r.e., find the 1924 * longest literal string that must appear and make it the 1925 * regmust. Resolve ties in favor of later strings, since 1926 * the regstart check works with the beginning of the r.e. 1927 * and avoiding duplication strengthens checking. Not a 1928 * strong reason, but sufficient in the absence of others. 1929 * [Now we resolve ties in favor of the earlier string if 1930 * it happens that c_offset_min has been invalidated, since the 1931 * earlier string may buy us something the later one won't.] 1932 */ 1933 minlen = 0; 1934 1935 data.longest_fixed = newSVpvn("",0); 1936 data.longest_float = newSVpvn("",0); 1937 data.last_found = newSVpvn("",0); 1938 data.longest = &(data.longest_fixed); 1939 first = scan; 1940 if (!r->regstclass) { 1941 cl_init(pRExC_state, &ch_class); 1942 data.start_class = &ch_class; 1943 stclass_flag = SCF_DO_STCLASS_AND; 1944 } else /* XXXX Check for BOUND? */ 1945 stclass_flag = 0; 1946 data.last_closep = &last_close; 1947 1948 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */ 1949 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag); 1950 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed) 1951 && data.last_start_min == 0 && data.last_end > 0 1952 && !RExC_seen_zerolen 1953 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS))) 1954 r->reganch |= ROPT_CHECK_ALL; 1955 scan_commit(pRExC_state, &data); 1956 SvREFCNT_dec(data.last_found); 1957 1958 longest_float_length = CHR_SVLEN(data.longest_float); 1959 if (longest_float_length 1960 || (data.flags & SF_FL_BEFORE_EOL 1961 && (!(data.flags & SF_FL_BEFORE_MEOL) 1962 || (RExC_flags & PMf_MULTILINE)))) { 1963 int t; 1964 1965 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */ 1966 && data.offset_fixed == data.offset_float_min 1967 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)) 1968 goto remove_float; /* As in (a)+. */ 1969 1970 if (SvUTF8(data.longest_float)) { 1971 r->float_utf8 = data.longest_float; 1972 r->float_substr = Nullsv; 1973 } else { 1974 r->float_substr = data.longest_float; 1975 r->float_utf8 = Nullsv; 1976 } 1977 r->float_min_offset = data.offset_float_min; 1978 r->float_max_offset = data.offset_float_max; 1979 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */ 1980 && (!(data.flags & SF_FL_BEFORE_MEOL) 1981 || (RExC_flags & PMf_MULTILINE))); 1982 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0); 1983 } 1984 else { 1985 remove_float: 1986 r->float_substr = r->float_utf8 = Nullsv; 1987 SvREFCNT_dec(data.longest_float); 1988 longest_float_length = 0; 1989 } 1990 1991 longest_fixed_length = CHR_SVLEN(data.longest_fixed); 1992 if (longest_fixed_length 1993 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */ 1994 && (!(data.flags & SF_FIX_BEFORE_MEOL) 1995 || (RExC_flags & PMf_MULTILINE)))) { 1996 int t; 1997 1998 if (SvUTF8(data.longest_fixed)) { 1999 r->anchored_utf8 = data.longest_fixed; 2000 r->anchored_substr = Nullsv; 2001 } else { 2002 r->anchored_substr = data.longest_fixed; 2003 r->anchored_utf8 = Nullsv; 2004 } 2005 r->anchored_offset = data.offset_fixed; 2006 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */ 2007 && (!(data.flags & SF_FIX_BEFORE_MEOL) 2008 || (RExC_flags & PMf_MULTILINE))); 2009 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0); 2010 } 2011 else { 2012 r->anchored_substr = r->anchored_utf8 = Nullsv; 2013 SvREFCNT_dec(data.longest_fixed); 2014 longest_fixed_length = 0; 2015 } 2016 if (r->regstclass 2017 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY)) 2018 r->regstclass = NULL; 2019 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset) 2020 && stclass_flag 2021 && !(data.start_class->flags & ANYOF_EOS) 2022 && !cl_is_anything(data.start_class)) 2023 { 2024 I32 n = add_data(pRExC_state, 1, "f"); 2025 2026 New(1006, RExC_rx->data->data[n], 1, 2027 struct regnode_charclass_class); 2028 StructCopy(data.start_class, 2029 (struct regnode_charclass_class*)RExC_rx->data->data[n], 2030 struct regnode_charclass_class); 2031 r->regstclass = (regnode*)RExC_rx->data->data[n]; 2032 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */ 2033 PL_regdata = r->data; /* for regprop() */ 2034 DEBUG_r({ SV *sv = sv_newmortal(); 2035 regprop(sv, (regnode*)data.start_class); 2036 PerlIO_printf(Perl_debug_log, 2037 "synthetic stclass `%s'.\n", 2038 SvPVX(sv));}); 2039 } 2040 2041 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */ 2042 if (longest_fixed_length > longest_float_length) { 2043 r->check_substr = r->anchored_substr; 2044 r->check_utf8 = r->anchored_utf8; 2045 r->check_offset_min = r->check_offset_max = r->anchored_offset; 2046 if (r->reganch & ROPT_ANCH_SINGLE) 2047 r->reganch |= ROPT_NOSCAN; 2048 } 2049 else { 2050 r->check_substr = r->float_substr; 2051 r->check_utf8 = r->float_utf8; 2052 r->check_offset_min = data.offset_float_min; 2053 r->check_offset_max = data.offset_float_max; 2054 } 2055 /* XXXX Currently intuiting is not compatible with ANCH_GPOS. 2056 This should be changed ASAP! */ 2057 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) { 2058 r->reganch |= RE_USE_INTUIT; 2059 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8)) 2060 r->reganch |= RE_INTUIT_TAIL; 2061 } 2062 } 2063 else { 2064 /* Several toplevels. Best we can is to set minlen. */ 2065 I32 fake; 2066 struct regnode_charclass_class ch_class; 2067 I32 last_close = 0; 2068 2069 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n")); 2070 scan = r->program + 1; 2071 cl_init(pRExC_state, &ch_class); 2072 data.start_class = &ch_class; 2073 data.last_closep = &last_close; 2074 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS); 2075 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8 2076 = r->float_substr = r->float_utf8 = Nullsv; 2077 if (!(data.start_class->flags & ANYOF_EOS) 2078 && !cl_is_anything(data.start_class)) 2079 { 2080 I32 n = add_data(pRExC_state, 1, "f"); 2081 2082 New(1006, RExC_rx->data->data[n], 1, 2083 struct regnode_charclass_class); 2084 StructCopy(data.start_class, 2085 (struct regnode_charclass_class*)RExC_rx->data->data[n], 2086 struct regnode_charclass_class); 2087 r->regstclass = (regnode*)RExC_rx->data->data[n]; 2088 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */ 2089 DEBUG_r({ SV* sv = sv_newmortal(); 2090 regprop(sv, (regnode*)data.start_class); 2091 PerlIO_printf(Perl_debug_log, 2092 "synthetic stclass `%s'.\n", 2093 SvPVX(sv));}); 2094 } 2095 } 2096 2097 r->minlen = minlen; 2098 if (RExC_seen & REG_SEEN_GPOS) 2099 r->reganch |= ROPT_GPOS_SEEN; 2100 if (RExC_seen & REG_SEEN_LOOKBEHIND) 2101 r->reganch |= ROPT_LOOKBEHIND_SEEN; 2102 if (RExC_seen & REG_SEEN_EVAL) 2103 r->reganch |= ROPT_EVAL_SEEN; 2104 if (RExC_seen & REG_SEEN_CANY) 2105 r->reganch |= ROPT_CANY_SEEN; 2106 Newz(1002, r->startp, RExC_npar, I32); 2107 Newz(1002, r->endp, RExC_npar, I32); 2108 PL_regdata = r->data; /* for regprop() */ 2109 DEBUG_r(regdump(r)); 2110 return(r); 2111 } 2112 2113 /* 2114 - reg - regular expression, i.e. main body or parenthesized thing 2115 * 2116 * Caller must absorb opening parenthesis. 2117 * 2118 * Combining parenthesis handling with the base level of regular expression 2119 * is a trifle forced, but the need to tie the tails of the branches to what 2120 * follows makes it hard to avoid. 2121 */ 2122 STATIC regnode * 2123 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp) 2124 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */ 2125 { 2126 register regnode *ret; /* Will be the head of the group. */ 2127 register regnode *br; 2128 register regnode *lastbr; 2129 register regnode *ender = 0; 2130 register I32 parno = 0; 2131 I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0; 2132 2133 /* for (?g), (?gc), and (?o) warnings; warning 2134 about (?c) will warn about (?g) -- japhy */ 2135 2136 I32 wastedflags = 0x00, 2137 wasted_o = 0x01, 2138 wasted_g = 0x02, 2139 wasted_gc = 0x02 | 0x04, 2140 wasted_c = 0x04; 2141 2142 char * parse_start = RExC_parse; /* MJD */ 2143 char *oregcomp_parse = RExC_parse; 2144 char c; 2145 2146 *flagp = 0; /* Tentatively. */ 2147 2148 2149 /* Make an OPEN node, if parenthesized. */ 2150 if (paren) { 2151 if (*RExC_parse == '?') { /* (?...) */ 2152 U32 posflags = 0, negflags = 0; 2153 U32 *flagsp = &posflags; 2154 int logical = 0; 2155 char *seqstart = RExC_parse; 2156 2157 RExC_parse++; 2158 paren = *RExC_parse++; 2159 ret = NULL; /* For look-ahead/behind. */ 2160 switch (paren) { 2161 case '<': /* (?<...) */ 2162 RExC_seen |= REG_SEEN_LOOKBEHIND; 2163 if (*RExC_parse == '!') 2164 paren = ','; 2165 if (*RExC_parse != '=' && *RExC_parse != '!') 2166 goto unknown; 2167 RExC_parse++; 2168 case '=': /* (?=...) */ 2169 case '!': /* (?!...) */ 2170 RExC_seen_zerolen++; 2171 case ':': /* (?:...) */ 2172 case '>': /* (?>...) */ 2173 break; 2174 case '$': /* (?$...) */ 2175 case '@': /* (?@...) */ 2176 vFAIL2("Sequence (?%c...) not implemented", (int)paren); 2177 break; 2178 case '#': /* (?#...) */ 2179 while (*RExC_parse && *RExC_parse != ')') 2180 RExC_parse++; 2181 if (*RExC_parse != ')') 2182 FAIL("Sequence (?#... not terminated"); 2183 nextchar(pRExC_state); 2184 *flagp = TRYAGAIN; 2185 return NULL; 2186 case 'p': /* (?p...) */ 2187 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP)) 2188 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})"); 2189 /* FALL THROUGH*/ 2190 case '?': /* (??...) */ 2191 logical = 1; 2192 if (*RExC_parse != '{') 2193 goto unknown; 2194 paren = *RExC_parse++; 2195 /* FALL THROUGH */ 2196 case '{': /* (?{...}) */ 2197 { 2198 I32 count = 1, n = 0; 2199 char c; 2200 char *s = RExC_parse; 2201 SV *sv; 2202 OP_4tree *sop, *rop; 2203 2204 RExC_seen_zerolen++; 2205 RExC_seen |= REG_SEEN_EVAL; 2206 while (count && (c = *RExC_parse)) { 2207 if (c == '\\' && RExC_parse[1]) 2208 RExC_parse++; 2209 else if (c == '{') 2210 count++; 2211 else if (c == '}') 2212 count--; 2213 RExC_parse++; 2214 } 2215 if (*RExC_parse != ')') 2216 { 2217 RExC_parse = s; 2218 vFAIL("Sequence (?{...}) not terminated or not {}-balanced"); 2219 } 2220 if (!SIZE_ONLY) { 2221 AV *av; 2222 2223 if (RExC_parse - 1 - s) 2224 sv = newSVpvn(s, RExC_parse - 1 - s); 2225 else 2226 sv = newSVpvn("", 0); 2227 2228 ENTER; 2229 Perl_save_re_context(aTHX); 2230 rop = sv_compile_2op(sv, &sop, "re", &av); 2231 sop->op_private |= OPpREFCOUNTED; 2232 /* re_dup will OpREFCNT_inc */ 2233 OpREFCNT_set(sop, 1); 2234 LEAVE; 2235 2236 n = add_data(pRExC_state, 3, "nop"); 2237 RExC_rx->data->data[n] = (void*)rop; 2238 RExC_rx->data->data[n+1] = (void*)sop; 2239 RExC_rx->data->data[n+2] = (void*)av; 2240 SvREFCNT_dec(sv); 2241 } 2242 else { /* First pass */ 2243 if (PL_reginterp_cnt < ++RExC_seen_evals 2244 && PL_curcop != &PL_compiling) 2245 /* No compiled RE interpolated, has runtime 2246 components ===> unsafe. */ 2247 FAIL("Eval-group not allowed at runtime, use re 'eval'"); 2248 if (PL_tainting && PL_tainted) 2249 FAIL("Eval-group in insecure regular expression"); 2250 } 2251 2252 nextchar(pRExC_state); 2253 if (logical) { 2254 ret = reg_node(pRExC_state, LOGICAL); 2255 if (!SIZE_ONLY) 2256 ret->flags = 2; 2257 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n)); 2258 /* deal with the length of this later - MJD */ 2259 return ret; 2260 } 2261 return reganode(pRExC_state, EVAL, n); 2262 } 2263 case '(': /* (?(?{...})...) and (?(?=...)...) */ 2264 { 2265 if (RExC_parse[0] == '?') { /* (?(?...)) */ 2266 if (RExC_parse[1] == '=' || RExC_parse[1] == '!' 2267 || RExC_parse[1] == '<' 2268 || RExC_parse[1] == '{') { /* Lookahead or eval. */ 2269 I32 flag; 2270 2271 ret = reg_node(pRExC_state, LOGICAL); 2272 if (!SIZE_ONLY) 2273 ret->flags = 1; 2274 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag)); 2275 goto insert_if; 2276 } 2277 } 2278 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) { 2279 /* (?(1)...) */ 2280 parno = atoi(RExC_parse++); 2281 2282 while (isDIGIT(*RExC_parse)) 2283 RExC_parse++; 2284 ret = reganode(pRExC_state, GROUPP, parno); 2285 2286 if ((c = *nextchar(pRExC_state)) != ')') 2287 vFAIL("Switch condition not recognized"); 2288 insert_if: 2289 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0)); 2290 br = regbranch(pRExC_state, &flags, 1); 2291 if (br == NULL) 2292 br = reganode(pRExC_state, LONGJMP, 0); 2293 else 2294 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0)); 2295 c = *nextchar(pRExC_state); 2296 if (flags&HASWIDTH) 2297 *flagp |= HASWIDTH; 2298 if (c == '|') { 2299 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */ 2300 regbranch(pRExC_state, &flags, 1); 2301 regtail(pRExC_state, ret, lastbr); 2302 if (flags&HASWIDTH) 2303 *flagp |= HASWIDTH; 2304 c = *nextchar(pRExC_state); 2305 } 2306 else 2307 lastbr = NULL; 2308 if (c != ')') 2309 vFAIL("Switch (?(condition)... contains too many branches"); 2310 ender = reg_node(pRExC_state, TAIL); 2311 regtail(pRExC_state, br, ender); 2312 if (lastbr) { 2313 regtail(pRExC_state, lastbr, ender); 2314 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); 2315 } 2316 else 2317 regtail(pRExC_state, ret, ender); 2318 return ret; 2319 } 2320 else { 2321 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse); 2322 } 2323 } 2324 case 0: 2325 RExC_parse--; /* for vFAIL to print correctly */ 2326 vFAIL("Sequence (? incomplete"); 2327 break; 2328 default: 2329 --RExC_parse; 2330 parse_flags: /* (?i) */ 2331 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) { 2332 /* (?g), (?gc) and (?o) are useless here 2333 and must be globally applied -- japhy */ 2334 2335 if (*RExC_parse == 'o' || *RExC_parse == 'g') { 2336 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { 2337 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g; 2338 if (! (wastedflags & wflagbit) ) { 2339 wastedflags |= wflagbit; 2340 vWARN5( 2341 RExC_parse + 1, 2342 "Useless (%s%c) - %suse /%c modifier", 2343 flagsp == &negflags ? "?-" : "?", 2344 *RExC_parse, 2345 flagsp == &negflags ? "don't " : "", 2346 *RExC_parse 2347 ); 2348 } 2349 } 2350 } 2351 else if (*RExC_parse == 'c') { 2352 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { 2353 if (! (wastedflags & wasted_c) ) { 2354 wastedflags |= wasted_gc; 2355 vWARN3( 2356 RExC_parse + 1, 2357 "Useless (%sc) - %suse /gc modifier", 2358 flagsp == &negflags ? "?-" : "?", 2359 flagsp == &negflags ? "don't " : "" 2360 ); 2361 } 2362 } 2363 } 2364 else { pmflag(flagsp, *RExC_parse); } 2365 2366 ++RExC_parse; 2367 } 2368 if (*RExC_parse == '-') { 2369 flagsp = &negflags; 2370 wastedflags = 0; /* reset so (?g-c) warns twice */ 2371 ++RExC_parse; 2372 goto parse_flags; 2373 } 2374 RExC_flags |= posflags; 2375 RExC_flags &= ~negflags; 2376 if (*RExC_parse == ':') { 2377 RExC_parse++; 2378 paren = ':'; 2379 break; 2380 } 2381 unknown: 2382 if (*RExC_parse != ')') { 2383 RExC_parse++; 2384 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); 2385 } 2386 nextchar(pRExC_state); 2387 *flagp = TRYAGAIN; 2388 return NULL; 2389 } 2390 } 2391 else { /* (...) */ 2392 parno = RExC_npar; 2393 RExC_npar++; 2394 ret = reganode(pRExC_state, OPEN, parno); 2395 Set_Node_Length(ret, 1); /* MJD */ 2396 Set_Node_Offset(ret, RExC_parse); /* MJD */ 2397 open = 1; 2398 } 2399 } 2400 else /* ! paren */ 2401 ret = NULL; 2402 2403 /* Pick up the branches, linking them together. */ 2404 parse_start = RExC_parse; /* MJD */ 2405 br = regbranch(pRExC_state, &flags, 1); 2406 /* branch_len = (paren != 0); */ 2407 2408 if (br == NULL) 2409 return(NULL); 2410 if (*RExC_parse == '|') { 2411 if (!SIZE_ONLY && RExC_extralen) { 2412 reginsert(pRExC_state, BRANCHJ, br); 2413 } 2414 else { /* MJD */ 2415 reginsert(pRExC_state, BRANCH, br); 2416 Set_Node_Length(br, paren != 0); 2417 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start); 2418 } 2419 have_branch = 1; 2420 if (SIZE_ONLY) 2421 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */ 2422 } 2423 else if (paren == ':') { 2424 *flagp |= flags&SIMPLE; 2425 } 2426 if (open) { /* Starts with OPEN. */ 2427 regtail(pRExC_state, ret, br); /* OPEN -> first. */ 2428 } 2429 else if (paren != '?') /* Not Conditional */ 2430 ret = br; 2431 *flagp |= flags & (SPSTART | HASWIDTH); 2432 lastbr = br; 2433 while (*RExC_parse == '|') { 2434 if (!SIZE_ONLY && RExC_extralen) { 2435 ender = reganode(pRExC_state, LONGJMP,0); 2436 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */ 2437 } 2438 if (SIZE_ONLY) 2439 RExC_extralen += 2; /* Account for LONGJMP. */ 2440 nextchar(pRExC_state); 2441 br = regbranch(pRExC_state, &flags, 0); 2442 2443 if (br == NULL) 2444 return(NULL); 2445 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */ 2446 lastbr = br; 2447 if (flags&HASWIDTH) 2448 *flagp |= HASWIDTH; 2449 *flagp |= flags&SPSTART; 2450 } 2451 2452 if (have_branch || paren != ':') { 2453 /* Make a closing node, and hook it on the end. */ 2454 switch (paren) { 2455 case ':': 2456 ender = reg_node(pRExC_state, TAIL); 2457 break; 2458 case 1: 2459 ender = reganode(pRExC_state, CLOSE, parno); 2460 Set_Node_Offset(ender,RExC_parse+1); /* MJD */ 2461 Set_Node_Length(ender,1); /* MJD */ 2462 break; 2463 case '<': 2464 case ',': 2465 case '=': 2466 case '!': 2467 *flagp &= ~HASWIDTH; 2468 /* FALL THROUGH */ 2469 case '>': 2470 ender = reg_node(pRExC_state, SUCCEED); 2471 break; 2472 case 0: 2473 ender = reg_node(pRExC_state, END); 2474 break; 2475 } 2476 regtail(pRExC_state, lastbr, ender); 2477 2478 if (have_branch) { 2479 /* Hook the tails of the branches to the closing node. */ 2480 for (br = ret; br != NULL; br = regnext(br)) { 2481 regoptail(pRExC_state, br, ender); 2482 } 2483 } 2484 } 2485 2486 { 2487 char *p; 2488 static char parens[] = "=!<,>"; 2489 2490 if (paren && (p = strchr(parens, paren))) { 2491 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH; 2492 int flag = (p - parens) > 1; 2493 2494 if (paren == '>') 2495 node = SUSPEND, flag = 0; 2496 reginsert(pRExC_state, node,ret); 2497 ret->flags = flag; 2498 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL)); 2499 } 2500 } 2501 2502 /* Check for proper termination. */ 2503 if (paren) { 2504 RExC_flags = oregflags; 2505 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') { 2506 RExC_parse = oregcomp_parse; 2507 vFAIL("Unmatched ("); 2508 } 2509 } 2510 else if (!paren && RExC_parse < RExC_end) { 2511 if (*RExC_parse == ')') { 2512 RExC_parse++; 2513 vFAIL("Unmatched )"); 2514 } 2515 else 2516 FAIL("Junk on end of regexp"); /* "Can't happen". */ 2517 /* NOTREACHED */ 2518 } 2519 2520 return(ret); 2521 } 2522 2523 /* 2524 - regbranch - one alternative of an | operator 2525 * 2526 * Implements the concatenation operator. 2527 */ 2528 STATIC regnode * 2529 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first) 2530 { 2531 register regnode *ret; 2532 register regnode *chain = NULL; 2533 register regnode *latest; 2534 I32 flags = 0, c = 0; 2535 2536 if (first) 2537 ret = NULL; 2538 else { 2539 if (!SIZE_ONLY && RExC_extralen) 2540 ret = reganode(pRExC_state, BRANCHJ,0); 2541 else { 2542 ret = reg_node(pRExC_state, BRANCH); 2543 Set_Node_Length(ret, 1); 2544 } 2545 } 2546 2547 if (!first && SIZE_ONLY) 2548 RExC_extralen += 1; /* BRANCHJ */ 2549 2550 *flagp = WORST; /* Tentatively. */ 2551 2552 RExC_parse--; 2553 nextchar(pRExC_state); 2554 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') { 2555 flags &= ~TRYAGAIN; 2556 latest = regpiece(pRExC_state, &flags); 2557 if (latest == NULL) { 2558 if (flags & TRYAGAIN) 2559 continue; 2560 return(NULL); 2561 } 2562 else if (ret == NULL) 2563 ret = latest; 2564 *flagp |= flags&HASWIDTH; 2565 if (chain == NULL) /* First piece. */ 2566 *flagp |= flags&SPSTART; 2567 else { 2568 RExC_naughty++; 2569 regtail(pRExC_state, chain, latest); 2570 } 2571 chain = latest; 2572 c++; 2573 } 2574 if (chain == NULL) { /* Loop ran zero times. */ 2575 chain = reg_node(pRExC_state, NOTHING); 2576 if (ret == NULL) 2577 ret = chain; 2578 } 2579 if (c == 1) { 2580 *flagp |= flags&SIMPLE; 2581 } 2582 2583 return(ret); 2584 } 2585 2586 /* 2587 - regpiece - something followed by possible [*+?] 2588 * 2589 * Note that the branching code sequences used for ? and the general cases 2590 * of * and + are somewhat optimized: they use the same NOTHING node as 2591 * both the endmarker for their branch list and the body of the last branch. 2592 * It might seem that this node could be dispensed with entirely, but the 2593 * endmarker role is not redundant. 2594 */ 2595 STATIC regnode * 2596 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp) 2597 { 2598 register regnode *ret; 2599 register char op; 2600 register char *next; 2601 I32 flags; 2602 char *origparse = RExC_parse; 2603 char *maxpos; 2604 I32 min; 2605 I32 max = REG_INFTY; 2606 char *parse_start; 2607 2608 ret = regatom(pRExC_state, &flags); 2609 if (ret == NULL) { 2610 if (flags & TRYAGAIN) 2611 *flagp |= TRYAGAIN; 2612 return(NULL); 2613 } 2614 2615 op = *RExC_parse; 2616 2617 if (op == '{' && regcurly(RExC_parse)) { 2618 parse_start = RExC_parse; /* MJD */ 2619 next = RExC_parse + 1; 2620 maxpos = Nullch; 2621 while (isDIGIT(*next) || *next == ',') { 2622 if (*next == ',') { 2623 if (maxpos) 2624 break; 2625 else 2626 maxpos = next; 2627 } 2628 next++; 2629 } 2630 if (*next == '}') { /* got one */ 2631 if (!maxpos) 2632 maxpos = next; 2633 RExC_parse++; 2634 min = atoi(RExC_parse); 2635 if (*maxpos == ',') 2636 maxpos++; 2637 else 2638 maxpos = RExC_parse; 2639 max = atoi(maxpos); 2640 if (!max && *maxpos != '0') 2641 max = REG_INFTY; /* meaning "infinity" */ 2642 else if (max >= REG_INFTY) 2643 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1); 2644 RExC_parse = next; 2645 nextchar(pRExC_state); 2646 2647 do_curly: 2648 if ((flags&SIMPLE)) { 2649 RExC_naughty += 2 + RExC_naughty / 2; 2650 reginsert(pRExC_state, CURLY, ret); 2651 Set_Node_Offset(ret, parse_start+1); /* MJD */ 2652 Set_Node_Cur_Length(ret); 2653 } 2654 else { 2655 regnode *w = reg_node(pRExC_state, WHILEM); 2656 2657 w->flags = 0; 2658 regtail(pRExC_state, ret, w); 2659 if (!SIZE_ONLY && RExC_extralen) { 2660 reginsert(pRExC_state, LONGJMP,ret); 2661 reginsert(pRExC_state, NOTHING,ret); 2662 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */ 2663 } 2664 reginsert(pRExC_state, CURLYX,ret); 2665 /* MJD hk */ 2666 Set_Node_Offset(ret, parse_start+1); 2667 Set_Node_Length(ret, 2668 op == '{' ? (RExC_parse - parse_start) : 1); 2669 2670 if (!SIZE_ONLY && RExC_extralen) 2671 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */ 2672 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING)); 2673 if (SIZE_ONLY) 2674 RExC_whilem_seen++, RExC_extralen += 3; 2675 RExC_naughty += 4 + RExC_naughty; /* compound interest */ 2676 } 2677 ret->flags = 0; 2678 2679 if (min > 0) 2680 *flagp = WORST; 2681 if (max > 0) 2682 *flagp |= HASWIDTH; 2683 if (max && max < min) 2684 vFAIL("Can't do {n,m} with n > m"); 2685 if (!SIZE_ONLY) { 2686 ARG1_SET(ret, (U16)min); 2687 ARG2_SET(ret, (U16)max); 2688 } 2689 2690 goto nest_check; 2691 } 2692 } 2693 2694 if (!ISMULT1(op)) { 2695 *flagp = flags; 2696 return(ret); 2697 } 2698 2699 #if 0 /* Now runtime fix should be reliable. */ 2700 2701 /* if this is reinstated, don't forget to put this back into perldiag: 2702 2703 =item Regexp *+ operand could be empty at {#} in regex m/%s/ 2704 2705 (F) The part of the regexp subject to either the * or + quantifier 2706 could match an empty string. The {#} shows in the regular 2707 expression about where the problem was discovered. 2708 2709 */ 2710 2711 if (!(flags&HASWIDTH) && op != '?') 2712 vFAIL("Regexp *+ operand could be empty"); 2713 #endif 2714 2715 parse_start = RExC_parse; 2716 nextchar(pRExC_state); 2717 2718 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH); 2719 2720 if (op == '*' && (flags&SIMPLE)) { 2721 reginsert(pRExC_state, STAR, ret); 2722 ret->flags = 0; 2723 RExC_naughty += 4; 2724 } 2725 else if (op == '*') { 2726 min = 0; 2727 goto do_curly; 2728 } 2729 else if (op == '+' && (flags&SIMPLE)) { 2730 reginsert(pRExC_state, PLUS, ret); 2731 ret->flags = 0; 2732 RExC_naughty += 3; 2733 } 2734 else if (op == '+') { 2735 min = 1; 2736 goto do_curly; 2737 } 2738 else if (op == '?') { 2739 min = 0; max = 1; 2740 goto do_curly; 2741 } 2742 nest_check: 2743 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) { 2744 vWARN3(RExC_parse, 2745 "%.*s matches null string many times", 2746 RExC_parse - origparse, 2747 origparse); 2748 } 2749 2750 if (*RExC_parse == '?') { 2751 nextchar(pRExC_state); 2752 reginsert(pRExC_state, MINMOD, ret); 2753 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE); 2754 } 2755 if (ISMULT2(RExC_parse)) { 2756 RExC_parse++; 2757 vFAIL("Nested quantifiers"); 2758 } 2759 2760 return(ret); 2761 } 2762 2763 /* 2764 - regatom - the lowest level 2765 * 2766 * Optimization: gobbles an entire sequence of ordinary characters so that 2767 * it can turn them into a single node, which is smaller to store and 2768 * faster to run. Backslashed characters are exceptions, each becoming a 2769 * separate node; the code is simpler that way and it's not worth fixing. 2770 * 2771 * [Yes, it is worth fixing, some scripts can run twice the speed.] */ 2772 STATIC regnode * 2773 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp) 2774 { 2775 register regnode *ret = 0; 2776 I32 flags; 2777 char *parse_start = 0; 2778 2779 *flagp = WORST; /* Tentatively. */ 2780 2781 tryagain: 2782 switch (*RExC_parse) { 2783 case '^': 2784 RExC_seen_zerolen++; 2785 nextchar(pRExC_state); 2786 if (RExC_flags & PMf_MULTILINE) 2787 ret = reg_node(pRExC_state, MBOL); 2788 else if (RExC_flags & PMf_SINGLELINE) 2789 ret = reg_node(pRExC_state, SBOL); 2790 else 2791 ret = reg_node(pRExC_state, BOL); 2792 Set_Node_Length(ret, 1); /* MJD */ 2793 break; 2794 case '$': 2795 nextchar(pRExC_state); 2796 if (*RExC_parse) 2797 RExC_seen_zerolen++; 2798 if (RExC_flags & PMf_MULTILINE) 2799 ret = reg_node(pRExC_state, MEOL); 2800 else if (RExC_flags & PMf_SINGLELINE) 2801 ret = reg_node(pRExC_state, SEOL); 2802 else 2803 ret = reg_node(pRExC_state, EOL); 2804 Set_Node_Length(ret, 1); /* MJD */ 2805 break; 2806 case '.': 2807 nextchar(pRExC_state); 2808 if (RExC_flags & PMf_SINGLELINE) 2809 ret = reg_node(pRExC_state, SANY); 2810 else 2811 ret = reg_node(pRExC_state, REG_ANY); 2812 *flagp |= HASWIDTH|SIMPLE; 2813 RExC_naughty++; 2814 Set_Node_Length(ret, 1); /* MJD */ 2815 break; 2816 case '[': 2817 { 2818 char *oregcomp_parse = ++RExC_parse; 2819 ret = regclass(pRExC_state); 2820 if (*RExC_parse != ']') { 2821 RExC_parse = oregcomp_parse; 2822 vFAIL("Unmatched ["); 2823 } 2824 nextchar(pRExC_state); 2825 *flagp |= HASWIDTH|SIMPLE; 2826 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */ 2827 break; 2828 } 2829 case '(': 2830 nextchar(pRExC_state); 2831 ret = reg(pRExC_state, 1, &flags); 2832 if (ret == NULL) { 2833 if (flags & TRYAGAIN) { 2834 if (RExC_parse == RExC_end) { 2835 /* Make parent create an empty node if needed. */ 2836 *flagp |= TRYAGAIN; 2837 return(NULL); 2838 } 2839 goto tryagain; 2840 } 2841 return(NULL); 2842 } 2843 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE); 2844 break; 2845 case '|': 2846 case ')': 2847 if (flags & TRYAGAIN) { 2848 *flagp |= TRYAGAIN; 2849 return NULL; 2850 } 2851 vFAIL("Internal urp"); 2852 /* Supposed to be caught earlier. */ 2853 break; 2854 case '{': 2855 if (!regcurly(RExC_parse)) { 2856 RExC_parse++; 2857 goto defchar; 2858 } 2859 /* FALL THROUGH */ 2860 case '?': 2861 case '+': 2862 case '*': 2863 RExC_parse++; 2864 vFAIL("Quantifier follows nothing"); 2865 break; 2866 case '\\': 2867 switch (*++RExC_parse) { 2868 case 'A': 2869 RExC_seen_zerolen++; 2870 ret = reg_node(pRExC_state, SBOL); 2871 *flagp |= SIMPLE; 2872 nextchar(pRExC_state); 2873 Set_Node_Length(ret, 2); /* MJD */ 2874 break; 2875 case 'G': 2876 ret = reg_node(pRExC_state, GPOS); 2877 RExC_seen |= REG_SEEN_GPOS; 2878 *flagp |= SIMPLE; 2879 nextchar(pRExC_state); 2880 Set_Node_Length(ret, 2); /* MJD */ 2881 break; 2882 case 'Z': 2883 ret = reg_node(pRExC_state, SEOL); 2884 *flagp |= SIMPLE; 2885 RExC_seen_zerolen++; /* Do not optimize RE away */ 2886 nextchar(pRExC_state); 2887 break; 2888 case 'z': 2889 ret = reg_node(pRExC_state, EOS); 2890 *flagp |= SIMPLE; 2891 RExC_seen_zerolen++; /* Do not optimize RE away */ 2892 nextchar(pRExC_state); 2893 Set_Node_Length(ret, 2); /* MJD */ 2894 break; 2895 case 'C': 2896 ret = reg_node(pRExC_state, CANY); 2897 RExC_seen |= REG_SEEN_CANY; 2898 *flagp |= HASWIDTH|SIMPLE; 2899 nextchar(pRExC_state); 2900 Set_Node_Length(ret, 2); /* MJD */ 2901 break; 2902 case 'X': 2903 ret = reg_node(pRExC_state, CLUMP); 2904 *flagp |= HASWIDTH; 2905 nextchar(pRExC_state); 2906 Set_Node_Length(ret, 2); /* MJD */ 2907 break; 2908 case 'w': 2909 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM)); 2910 *flagp |= HASWIDTH|SIMPLE; 2911 nextchar(pRExC_state); 2912 Set_Node_Length(ret, 2); /* MJD */ 2913 break; 2914 case 'W': 2915 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM)); 2916 *flagp |= HASWIDTH|SIMPLE; 2917 nextchar(pRExC_state); 2918 Set_Node_Length(ret, 2); /* MJD */ 2919 break; 2920 case 'b': 2921 RExC_seen_zerolen++; 2922 RExC_seen |= REG_SEEN_LOOKBEHIND; 2923 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND)); 2924 *flagp |= SIMPLE; 2925 nextchar(pRExC_state); 2926 Set_Node_Length(ret, 2); /* MJD */ 2927 break; 2928 case 'B': 2929 RExC_seen_zerolen++; 2930 RExC_seen |= REG_SEEN_LOOKBEHIND; 2931 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND)); 2932 *flagp |= SIMPLE; 2933 nextchar(pRExC_state); 2934 Set_Node_Length(ret, 2); /* MJD */ 2935 break; 2936 case 's': 2937 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE)); 2938 *flagp |= HASWIDTH|SIMPLE; 2939 nextchar(pRExC_state); 2940 Set_Node_Length(ret, 2); /* MJD */ 2941 break; 2942 case 'S': 2943 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE)); 2944 *flagp |= HASWIDTH|SIMPLE; 2945 nextchar(pRExC_state); 2946 Set_Node_Length(ret, 2); /* MJD */ 2947 break; 2948 case 'd': 2949 ret = reg_node(pRExC_state, DIGIT); 2950 *flagp |= HASWIDTH|SIMPLE; 2951 nextchar(pRExC_state); 2952 Set_Node_Length(ret, 2); /* MJD */ 2953 break; 2954 case 'D': 2955 ret = reg_node(pRExC_state, NDIGIT); 2956 *flagp |= HASWIDTH|SIMPLE; 2957 nextchar(pRExC_state); 2958 Set_Node_Length(ret, 2); /* MJD */ 2959 break; 2960 case 'p': 2961 case 'P': 2962 { 2963 char* oldregxend = RExC_end; 2964 char* parse_start = RExC_parse; 2965 2966 if (RExC_parse[1] == '{') { 2967 /* a lovely hack--pretend we saw [\pX] instead */ 2968 RExC_end = strchr(RExC_parse, '}'); 2969 if (!RExC_end) { 2970 U8 c = (U8)*RExC_parse; 2971 RExC_parse += 2; 2972 RExC_end = oldregxend; 2973 vFAIL2("Missing right brace on \\%c{}", c); 2974 } 2975 RExC_end++; 2976 } 2977 else { 2978 RExC_end = RExC_parse + 2; 2979 if (RExC_end > oldregxend) 2980 RExC_end = oldregxend; 2981 } 2982 RExC_parse--; 2983 2984 ret = regclass(pRExC_state); 2985 2986 RExC_end = oldregxend; 2987 RExC_parse--; 2988 Set_Node_Cur_Length(ret); /* MJD */ 2989 nextchar(pRExC_state); 2990 *flagp |= HASWIDTH|SIMPLE; 2991 } 2992 break; 2993 case 'n': 2994 case 'r': 2995 case 't': 2996 case 'f': 2997 case 'e': 2998 case 'a': 2999 case 'x': 3000 case 'c': 3001 case '0': 3002 goto defchar; 3003 case '1': case '2': case '3': case '4': 3004 case '5': case '6': case '7': case '8': case '9': 3005 { 3006 I32 num = atoi(RExC_parse); 3007 3008 if (num > 9 && num >= RExC_npar) 3009 goto defchar; 3010 else { 3011 char * parse_start = RExC_parse - 1; /* MJD */ 3012 while (isDIGIT(*RExC_parse)) 3013 RExC_parse++; 3014 3015 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens) 3016 vFAIL("Reference to nonexistent group"); 3017 RExC_sawback = 1; 3018 ret = reganode(pRExC_state, 3019 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF), 3020 num); 3021 *flagp |= HASWIDTH; 3022 3023 /* override incorrect value set in reganode MJD */ 3024 Set_Node_Offset(ret, parse_start+1); 3025 Set_Node_Cur_Length(ret); /* MJD */ 3026 RExC_parse--; 3027 nextchar(pRExC_state); 3028 } 3029 } 3030 break; 3031 case '\0': 3032 if (RExC_parse >= RExC_end) 3033 FAIL("Trailing \\"); 3034 /* FALL THROUGH */ 3035 default: 3036 /* Do not generate `unrecognized' warnings here, we fall 3037 back into the quick-grab loop below */ 3038 goto defchar; 3039 } 3040 break; 3041 3042 case '#': 3043 if (RExC_flags & PMf_EXTENDED) { 3044 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++; 3045 if (RExC_parse < RExC_end) 3046 goto tryagain; 3047 } 3048 /* FALL THROUGH */ 3049 3050 default: { 3051 register STRLEN len; 3052 register UV ender; 3053 register char *p; 3054 char *oldp, *s; 3055 STRLEN numlen; 3056 STRLEN foldlen; 3057 U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf; 3058 3059 parse_start = RExC_parse - 1; 3060 3061 RExC_parse++; 3062 3063 defchar: 3064 ender = 0; 3065 ret = reg_node(pRExC_state, 3066 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT)); 3067 s = STRING(ret); 3068 for (len = 0, p = RExC_parse - 1; 3069 len < 127 && p < RExC_end; 3070 len++) 3071 { 3072 oldp = p; 3073 3074 if (RExC_flags & PMf_EXTENDED) 3075 p = regwhite(p, RExC_end); 3076 switch (*p) { 3077 case '^': 3078 case '$': 3079 case '.': 3080 case '[': 3081 case '(': 3082 case ')': 3083 case '|': 3084 goto loopdone; 3085 case '\\': 3086 switch (*++p) { 3087 case 'A': 3088 case 'C': 3089 case 'X': 3090 case 'G': 3091 case 'Z': 3092 case 'z': 3093 case 'w': 3094 case 'W': 3095 case 'b': 3096 case 'B': 3097 case 's': 3098 case 'S': 3099 case 'd': 3100 case 'D': 3101 case 'p': 3102 case 'P': 3103 --p; 3104 goto loopdone; 3105 case 'n': 3106 ender = '\n'; 3107 p++; 3108 break; 3109 case 'r': 3110 ender = '\r'; 3111 p++; 3112 break; 3113 case 't': 3114 ender = '\t'; 3115 p++; 3116 break; 3117 case 'f': 3118 ender = '\f'; 3119 p++; 3120 break; 3121 case 'e': 3122 ender = ASCII_TO_NATIVE('\033'); 3123 p++; 3124 break; 3125 case 'a': 3126 ender = ASCII_TO_NATIVE('\007'); 3127 p++; 3128 break; 3129 case 'x': 3130 if (*++p == '{') { 3131 char* e = strchr(p, '}'); 3132 3133 if (!e) { 3134 RExC_parse = p + 1; 3135 vFAIL("Missing right brace on \\x{}"); 3136 } 3137 else { 3138 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES 3139 | PERL_SCAN_DISALLOW_PREFIX; 3140 numlen = e - p - 1; 3141 ender = grok_hex(p + 1, &numlen, &flags, NULL); 3142 if (ender > 0xff) 3143 RExC_utf8 = 1; 3144 /* numlen is generous */ 3145 if (numlen + len >= 127) { 3146 p--; 3147 goto loopdone; 3148 } 3149 p = e + 1; 3150 } 3151 } 3152 else { 3153 I32 flags = PERL_SCAN_DISALLOW_PREFIX; 3154 numlen = 2; 3155 ender = grok_hex(p, &numlen, &flags, NULL); 3156 p += numlen; 3157 } 3158 break; 3159 case 'c': 3160 p++; 3161 ender = UCHARAT(p++); 3162 ender = toCTRL(ender); 3163 break; 3164 case '0': case '1': case '2': case '3':case '4': 3165 case '5': case '6': case '7': case '8':case '9': 3166 if (*p == '0' || 3167 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) { 3168 I32 flags = 0; 3169 numlen = 3; 3170 ender = grok_oct(p, &numlen, &flags, NULL); 3171 p += numlen; 3172 } 3173 else { 3174 --p; 3175 goto loopdone; 3176 } 3177 break; 3178 case '\0': 3179 if (p >= RExC_end) 3180 FAIL("Trailing \\"); 3181 /* FALL THROUGH */ 3182 default: 3183 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p)) 3184 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p)); 3185 goto normal_default; 3186 } 3187 break; 3188 default: 3189 normal_default: 3190 if (UTF8_IS_START(*p) && UTF) { 3191 ender = utf8n_to_uvchr((U8*)p, RExC_end - p, 3192 &numlen, 0); 3193 p += numlen; 3194 } 3195 else 3196 ender = *p++; 3197 break; 3198 } 3199 if (RExC_flags & PMf_EXTENDED) 3200 p = regwhite(p, RExC_end); 3201 if (UTF && FOLD) { 3202 /* Prime the casefolded buffer. */ 3203 ender = toFOLD_uni(ender, tmpbuf, &foldlen); 3204 } 3205 if (ISMULT2(p)) { /* Back off on ?+*. */ 3206 if (len) 3207 p = oldp; 3208 else if (UTF) { 3209 STRLEN unilen; 3210 3211 if (FOLD) { 3212 /* Emit all the Unicode characters. */ 3213 for (foldbuf = tmpbuf; 3214 foldlen; 3215 foldlen -= numlen) { 3216 ender = utf8_to_uvchr(foldbuf, &numlen); 3217 if (numlen > 0) { 3218 reguni(pRExC_state, ender, s, &unilen); 3219 s += unilen; 3220 len += unilen; 3221 /* In EBCDIC the numlen 3222 * and unilen can differ. */ 3223 foldbuf += numlen; 3224 if (numlen >= foldlen) 3225 break; 3226 } 3227 else 3228 break; /* "Can't happen." */ 3229 } 3230 } 3231 else { 3232 reguni(pRExC_state, ender, s, &unilen); 3233 if (unilen > 0) { 3234 s += unilen; 3235 len += unilen; 3236 } 3237 } 3238 } 3239 else { 3240 len++; 3241 REGC((char)ender, s++); 3242 } 3243 break; 3244 } 3245 if (UTF) { 3246 STRLEN unilen; 3247 3248 if (FOLD) { 3249 /* Emit all the Unicode characters. */ 3250 for (foldbuf = tmpbuf; 3251 foldlen; 3252 foldlen -= numlen) { 3253 ender = utf8_to_uvchr(foldbuf, &numlen); 3254 if (numlen > 0) { 3255 reguni(pRExC_state, ender, s, &unilen); 3256 len += unilen; 3257 s += unilen; 3258 /* In EBCDIC the numlen 3259 * and unilen can differ. */ 3260 foldbuf += numlen; 3261 if (numlen >= foldlen) 3262 break; 3263 } 3264 else 3265 break; 3266 } 3267 } 3268 else { 3269 reguni(pRExC_state, ender, s, &unilen); 3270 if (unilen > 0) { 3271 s += unilen; 3272 len += unilen; 3273 } 3274 } 3275 len--; 3276 } 3277 else 3278 REGC((char)ender, s++); 3279 } 3280 loopdone: 3281 RExC_parse = p - 1; 3282 Set_Node_Cur_Length(ret); /* MJD */ 3283 nextchar(pRExC_state); 3284 { 3285 /* len is STRLEN which is unsigned, need to copy to signed */ 3286 IV iv = len; 3287 if (iv < 0) 3288 vFAIL("Internal disaster"); 3289 } 3290 if (len > 0) 3291 *flagp |= HASWIDTH; 3292 if (len == 1) 3293 *flagp |= SIMPLE; 3294 if (!SIZE_ONLY) 3295 STR_LEN(ret) = len; 3296 if (SIZE_ONLY) 3297 RExC_size += STR_SZ(len); 3298 else 3299 RExC_emit += STR_SZ(len); 3300 } 3301 break; 3302 } 3303 3304 /* If the encoding pragma is in effect recode the text of 3305 * any EXACT-kind nodes. */ 3306 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) { 3307 STRLEN oldlen = STR_LEN(ret); 3308 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen)); 3309 3310 if (RExC_utf8) 3311 SvUTF8_on(sv); 3312 if (sv_utf8_downgrade(sv, TRUE)) { 3313 char *s = sv_recode_to_utf8(sv, PL_encoding); 3314 STRLEN newlen = SvCUR(sv); 3315 3316 if (!SIZE_ONLY) { 3317 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n", 3318 (int)oldlen, STRING(ret), 3319 (int)newlen, s)); 3320 Copy(s, STRING(ret), newlen, char); 3321 STR_LEN(ret) += newlen - oldlen; 3322 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen); 3323 } else 3324 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen); 3325 } 3326 } 3327 3328 return(ret); 3329 } 3330 3331 STATIC char * 3332 S_regwhite(pTHX_ char *p, char *e) 3333 { 3334 while (p < e) { 3335 if (isSPACE(*p)) 3336 ++p; 3337 else if (*p == '#') { 3338 do { 3339 p++; 3340 } while (p < e && *p != '\n'); 3341 } 3342 else 3343 break; 3344 } 3345 return p; 3346 } 3347 3348 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]]. 3349 Character classes ([:foo:]) can also be negated ([:^foo:]). 3350 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise. 3351 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed, 3352 but trigger failures because they are currently unimplemented. */ 3353 3354 #define POSIXCC_DONE(c) ((c) == ':') 3355 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.') 3356 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c)) 3357 3358 STATIC I32 3359 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value) 3360 { 3361 char *posixcc = 0; 3362 I32 namedclass = OOB_NAMEDCLASS; 3363 3364 if (value == '[' && RExC_parse + 1 < RExC_end && 3365 /* I smell either [: or [= or [. -- POSIX has been here, right? */ 3366 POSIXCC(UCHARAT(RExC_parse))) { 3367 char c = UCHARAT(RExC_parse); 3368 char* s = RExC_parse++; 3369 3370 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c) 3371 RExC_parse++; 3372 if (RExC_parse == RExC_end) 3373 /* Grandfather lone [:, [=, [. */ 3374 RExC_parse = s; 3375 else { 3376 char* t = RExC_parse++; /* skip over the c */ 3377 3378 if (UCHARAT(RExC_parse) == ']') { 3379 RExC_parse++; /* skip over the ending ] */ 3380 posixcc = s + 1; 3381 if (*s == ':') { 3382 I32 complement = *posixcc == '^' ? *posixcc++ : 0; 3383 I32 skip = 5; /* the most common skip */ 3384 3385 switch (*posixcc) { 3386 case 'a': 3387 if (strnEQ(posixcc, "alnum", 5)) 3388 namedclass = 3389 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC; 3390 else if (strnEQ(posixcc, "alpha", 5)) 3391 namedclass = 3392 complement ? ANYOF_NALPHA : ANYOF_ALPHA; 3393 else if (strnEQ(posixcc, "ascii", 5)) 3394 namedclass = 3395 complement ? ANYOF_NASCII : ANYOF_ASCII; 3396 break; 3397 case 'b': 3398 if (strnEQ(posixcc, "blank", 5)) 3399 namedclass = 3400 complement ? ANYOF_NBLANK : ANYOF_BLANK; 3401 break; 3402 case 'c': 3403 if (strnEQ(posixcc, "cntrl", 5)) 3404 namedclass = 3405 complement ? ANYOF_NCNTRL : ANYOF_CNTRL; 3406 break; 3407 case 'd': 3408 if (strnEQ(posixcc, "digit", 5)) 3409 namedclass = 3410 complement ? ANYOF_NDIGIT : ANYOF_DIGIT; 3411 break; 3412 case 'g': 3413 if (strnEQ(posixcc, "graph", 5)) 3414 namedclass = 3415 complement ? ANYOF_NGRAPH : ANYOF_GRAPH; 3416 break; 3417 case 'l': 3418 if (strnEQ(posixcc, "lower", 5)) 3419 namedclass = 3420 complement ? ANYOF_NLOWER : ANYOF_LOWER; 3421 break; 3422 case 'p': 3423 if (strnEQ(posixcc, "print", 5)) 3424 namedclass = 3425 complement ? ANYOF_NPRINT : ANYOF_PRINT; 3426 else if (strnEQ(posixcc, "punct", 5)) 3427 namedclass = 3428 complement ? ANYOF_NPUNCT : ANYOF_PUNCT; 3429 break; 3430 case 's': 3431 if (strnEQ(posixcc, "space", 5)) 3432 namedclass = 3433 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC; 3434 break; 3435 case 'u': 3436 if (strnEQ(posixcc, "upper", 5)) 3437 namedclass = 3438 complement ? ANYOF_NUPPER : ANYOF_UPPER; 3439 break; 3440 case 'w': /* this is not POSIX, this is the Perl \w */ 3441 if (strnEQ(posixcc, "word", 4)) { 3442 namedclass = 3443 complement ? ANYOF_NALNUM : ANYOF_ALNUM; 3444 skip = 4; 3445 } 3446 break; 3447 case 'x': 3448 if (strnEQ(posixcc, "xdigit", 6)) { 3449 namedclass = 3450 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT; 3451 skip = 6; 3452 } 3453 break; 3454 } 3455 if (namedclass == OOB_NAMEDCLASS || 3456 posixcc[skip] != ':' || 3457 posixcc[skip+1] != ']') 3458 { 3459 Simple_vFAIL3("POSIX class [:%.*s:] unknown", 3460 t - s - 1, s + 1); 3461 } 3462 } else if (!SIZE_ONLY) { 3463 /* [[=foo=]] and [[.foo.]] are still future. */ 3464 3465 /* adjust RExC_parse so the warning shows after 3466 the class closes */ 3467 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']') 3468 RExC_parse++; 3469 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c); 3470 } 3471 } else { 3472 /* Maternal grandfather: 3473 * "[:" ending in ":" but not in ":]" */ 3474 RExC_parse = s; 3475 } 3476 } 3477 } 3478 3479 return namedclass; 3480 } 3481 3482 STATIC void 3483 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state) 3484 { 3485 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) { 3486 char *s = RExC_parse; 3487 char c = *s++; 3488 3489 while(*s && isALNUM(*s)) 3490 s++; 3491 if (*s && c == *s && s[1] == ']') { 3492 if (ckWARN(WARN_REGEXP)) 3493 vWARN3(s+2, 3494 "POSIX syntax [%c %c] belongs inside character classes", 3495 c, c); 3496 3497 /* [[=foo=]] and [[.foo.]] are still future. */ 3498 if (POSIXCC_NOTYET(c)) { 3499 /* adjust RExC_parse so the error shows after 3500 the class closes */ 3501 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']') 3502 ; 3503 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c); 3504 } 3505 } 3506 } 3507 } 3508 3509 STATIC regnode * 3510 S_regclass(pTHX_ RExC_state_t *pRExC_state) 3511 { 3512 register UV value; 3513 register UV nextvalue; 3514 register IV prevvalue = OOB_UNICODE; 3515 register IV range = 0; 3516 register regnode *ret; 3517 STRLEN numlen; 3518 IV namedclass; 3519 char *rangebegin = 0; 3520 bool need_class = 0; 3521 SV *listsv = Nullsv; 3522 register char *e; 3523 UV n; 3524 bool optimize_invert = TRUE; 3525 AV* unicode_alternate = 0; 3526 #ifdef EBCDIC 3527 UV literal_endpoint = 0; 3528 #endif 3529 3530 ret = reganode(pRExC_state, ANYOF, 0); 3531 3532 if (!SIZE_ONLY) 3533 ANYOF_FLAGS(ret) = 0; 3534 3535 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */ 3536 RExC_naughty++; 3537 RExC_parse++; 3538 if (!SIZE_ONLY) 3539 ANYOF_FLAGS(ret) |= ANYOF_INVERT; 3540 } 3541 3542 if (SIZE_ONLY) 3543 RExC_size += ANYOF_SKIP; 3544 else { 3545 RExC_emit += ANYOF_SKIP; 3546 if (FOLD) 3547 ANYOF_FLAGS(ret) |= ANYOF_FOLD; 3548 if (LOC) 3549 ANYOF_FLAGS(ret) |= ANYOF_LOCALE; 3550 ANYOF_BITMAP_ZERO(ret); 3551 listsv = newSVpvn("# comment\n", 10); 3552 } 3553 3554 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0; 3555 3556 if (!SIZE_ONLY && POSIXCC(nextvalue)) 3557 checkposixcc(pRExC_state); 3558 3559 /* allow 1st char to be ] (allowing it to be - is dealt with later) */ 3560 if (UCHARAT(RExC_parse) == ']') 3561 goto charclassloop; 3562 3563 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') { 3564 3565 charclassloop: 3566 3567 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */ 3568 3569 if (!range) 3570 rangebegin = RExC_parse; 3571 if (UTF) { 3572 value = utf8n_to_uvchr((U8*)RExC_parse, 3573 RExC_end - RExC_parse, 3574 &numlen, 0); 3575 RExC_parse += numlen; 3576 } 3577 else 3578 value = UCHARAT(RExC_parse++); 3579 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0; 3580 if (value == '[' && POSIXCC(nextvalue)) 3581 namedclass = regpposixcc(pRExC_state, value); 3582 else if (value == '\\') { 3583 if (UTF) { 3584 value = utf8n_to_uvchr((U8*)RExC_parse, 3585 RExC_end - RExC_parse, 3586 &numlen, 0); 3587 RExC_parse += numlen; 3588 } 3589 else 3590 value = UCHARAT(RExC_parse++); 3591 /* Some compilers cannot handle switching on 64-bit integer 3592 * values, therefore value cannot be an UV. Yes, this will 3593 * be a problem later if we want switch on Unicode. 3594 * A similar issue a little bit later when switching on 3595 * namedclass. --jhi */ 3596 switch ((I32)value) { 3597 case 'w': namedclass = ANYOF_ALNUM; break; 3598 case 'W': namedclass = ANYOF_NALNUM; break; 3599 case 's': namedclass = ANYOF_SPACE; break; 3600 case 'S': namedclass = ANYOF_NSPACE; break; 3601 case 'd': namedclass = ANYOF_DIGIT; break; 3602 case 'D': namedclass = ANYOF_NDIGIT; break; 3603 case 'p': 3604 case 'P': 3605 if (RExC_parse >= RExC_end) 3606 vFAIL2("Empty \\%c{}", (U8)value); 3607 if (*RExC_parse == '{') { 3608 U8 c = (U8)value; 3609 e = strchr(RExC_parse++, '}'); 3610 if (!e) 3611 vFAIL2("Missing right brace on \\%c{}", c); 3612 while (isSPACE(UCHARAT(RExC_parse))) 3613 RExC_parse++; 3614 if (e == RExC_parse) 3615 vFAIL2("Empty \\%c{}", c); 3616 n = e - RExC_parse; 3617 while (isSPACE(UCHARAT(RExC_parse + n - 1))) 3618 n--; 3619 } 3620 else { 3621 e = RExC_parse; 3622 n = 1; 3623 } 3624 if (!SIZE_ONLY) { 3625 if (UCHARAT(RExC_parse) == '^') { 3626 RExC_parse++; 3627 n--; 3628 value = value == 'p' ? 'P' : 'p'; /* toggle */ 3629 while (isSPACE(UCHARAT(RExC_parse))) { 3630 RExC_parse++; 3631 n--; 3632 } 3633 } 3634 if (value == 'p') 3635 Perl_sv_catpvf(aTHX_ listsv, 3636 "+utf8::%.*s\n", (int)n, RExC_parse); 3637 else 3638 Perl_sv_catpvf(aTHX_ listsv, 3639 "!utf8::%.*s\n", (int)n, RExC_parse); 3640 } 3641 RExC_parse = e + 1; 3642 ANYOF_FLAGS(ret) |= ANYOF_UNICODE; 3643 continue; 3644 case 'n': value = '\n'; break; 3645 case 'r': value = '\r'; break; 3646 case 't': value = '\t'; break; 3647 case 'f': value = '\f'; break; 3648 case 'b': value = '\b'; break; 3649 case 'e': value = ASCII_TO_NATIVE('\033');break; 3650 case 'a': value = ASCII_TO_NATIVE('\007');break; 3651 case 'x': 3652 if (*RExC_parse == '{') { 3653 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES 3654 | PERL_SCAN_DISALLOW_PREFIX; 3655 e = strchr(RExC_parse++, '}'); 3656 if (!e) 3657 vFAIL("Missing right brace on \\x{}"); 3658 3659 numlen = e - RExC_parse; 3660 value = grok_hex(RExC_parse, &numlen, &flags, NULL); 3661 RExC_parse = e + 1; 3662 } 3663 else { 3664 I32 flags = PERL_SCAN_DISALLOW_PREFIX; 3665 numlen = 2; 3666 value = grok_hex(RExC_parse, &numlen, &flags, NULL); 3667 RExC_parse += numlen; 3668 } 3669 break; 3670 case 'c': 3671 value = UCHARAT(RExC_parse++); 3672 value = toCTRL(value); 3673 break; 3674 case '0': case '1': case '2': case '3': case '4': 3675 case '5': case '6': case '7': case '8': case '9': 3676 { 3677 I32 flags = 0; 3678 numlen = 3; 3679 value = grok_oct(--RExC_parse, &numlen, &flags, NULL); 3680 RExC_parse += numlen; 3681 break; 3682 } 3683 default: 3684 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value)) 3685 vWARN2(RExC_parse, 3686 "Unrecognized escape \\%c in character class passed through", 3687 (int)value); 3688 break; 3689 } 3690 } /* end of \blah */ 3691 #ifdef EBCDIC 3692 else 3693 literal_endpoint++; 3694 #endif 3695 3696 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */ 3697 3698 if (!SIZE_ONLY && !need_class) 3699 ANYOF_CLASS_ZERO(ret); 3700 3701 need_class = 1; 3702 3703 /* a bad range like a-\d, a-[:digit:] ? */ 3704 if (range) { 3705 if (!SIZE_ONLY) { 3706 if (ckWARN(WARN_REGEXP)) 3707 vWARN4(RExC_parse, 3708 "False [] range \"%*.*s\"", 3709 RExC_parse - rangebegin, 3710 RExC_parse - rangebegin, 3711 rangebegin); 3712 if (prevvalue < 256) { 3713 ANYOF_BITMAP_SET(ret, prevvalue); 3714 ANYOF_BITMAP_SET(ret, '-'); 3715 } 3716 else { 3717 ANYOF_FLAGS(ret) |= ANYOF_UNICODE; 3718 Perl_sv_catpvf(aTHX_ listsv, 3719 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-'); 3720 } 3721 } 3722 3723 range = 0; /* this was not a true range */ 3724 } 3725 3726 if (!SIZE_ONLY) { 3727 if (namedclass > OOB_NAMEDCLASS) 3728 optimize_invert = FALSE; 3729 /* Possible truncation here but in some 64-bit environments 3730 * the compiler gets heartburn about switch on 64-bit values. 3731 * A similar issue a little earlier when switching on value. 3732 * --jhi */ 3733 switch ((I32)namedclass) { 3734 case ANYOF_ALNUM: 3735 if (LOC) 3736 ANYOF_CLASS_SET(ret, ANYOF_ALNUM); 3737 else { 3738 for (value = 0; value < 256; value++) 3739 if (isALNUM(value)) 3740 ANYOF_BITMAP_SET(ret, value); 3741 } 3742 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n"); 3743 break; 3744 case ANYOF_NALNUM: 3745 if (LOC) 3746 ANYOF_CLASS_SET(ret, ANYOF_NALNUM); 3747 else { 3748 for (value = 0; value < 256; value++) 3749 if (!isALNUM(value)) 3750 ANYOF_BITMAP_SET(ret, value); 3751 } 3752 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n"); 3753 break; 3754 case ANYOF_ALNUMC: 3755 if (LOC) 3756 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC); 3757 else { 3758 for (value = 0; value < 256; value++) 3759 if (isALNUMC(value)) 3760 ANYOF_BITMAP_SET(ret, value); 3761 } 3762 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n"); 3763 break; 3764 case ANYOF_NALNUMC: 3765 if (LOC) 3766 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC); 3767 else { 3768 for (value = 0; value < 256; value++) 3769 if (!isALNUMC(value)) 3770 ANYOF_BITMAP_SET(ret, value); 3771 } 3772 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n"); 3773 break; 3774 case ANYOF_ALPHA: 3775 if (LOC) 3776 ANYOF_CLASS_SET(ret, ANYOF_ALPHA); 3777 else { 3778 for (value = 0; value < 256; value++) 3779 if (isALPHA(value)) 3780 ANYOF_BITMAP_SET(ret, value); 3781 } 3782 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n"); 3783 break; 3784 case ANYOF_NALPHA: 3785 if (LOC) 3786 ANYOF_CLASS_SET(ret, ANYOF_NALPHA); 3787 else { 3788 for (value = 0; value < 256; value++) 3789 if (!isALPHA(value)) 3790 ANYOF_BITMAP_SET(ret, value); 3791 } 3792 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n"); 3793 break; 3794 case ANYOF_ASCII: 3795 if (LOC) 3796 ANYOF_CLASS_SET(ret, ANYOF_ASCII); 3797 else { 3798 #ifndef EBCDIC 3799 for (value = 0; value < 128; value++) 3800 ANYOF_BITMAP_SET(ret, value); 3801 #else /* EBCDIC */ 3802 for (value = 0; value < 256; value++) { 3803 if (isASCII(value)) 3804 ANYOF_BITMAP_SET(ret, value); 3805 } 3806 #endif /* EBCDIC */ 3807 } 3808 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n"); 3809 break; 3810 case ANYOF_NASCII: 3811 if (LOC) 3812 ANYOF_CLASS_SET(ret, ANYOF_NASCII); 3813 else { 3814 #ifndef EBCDIC 3815 for (value = 128; value < 256; value++) 3816 ANYOF_BITMAP_SET(ret, value); 3817 #else /* EBCDIC */ 3818 for (value = 0; value < 256; value++) { 3819 if (!isASCII(value)) 3820 ANYOF_BITMAP_SET(ret, value); 3821 } 3822 #endif /* EBCDIC */ 3823 } 3824 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n"); 3825 break; 3826 case ANYOF_BLANK: 3827 if (LOC) 3828 ANYOF_CLASS_SET(ret, ANYOF_BLANK); 3829 else { 3830 for (value = 0; value < 256; value++) 3831 if (isBLANK(value)) 3832 ANYOF_BITMAP_SET(ret, value); 3833 } 3834 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n"); 3835 break; 3836 case ANYOF_NBLANK: 3837 if (LOC) 3838 ANYOF_CLASS_SET(ret, ANYOF_NBLANK); 3839 else { 3840 for (value = 0; value < 256; value++) 3841 if (!isBLANK(value)) 3842 ANYOF_BITMAP_SET(ret, value); 3843 } 3844 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n"); 3845 break; 3846 case ANYOF_CNTRL: 3847 if (LOC) 3848 ANYOF_CLASS_SET(ret, ANYOF_CNTRL); 3849 else { 3850 for (value = 0; value < 256; value++) 3851 if (isCNTRL(value)) 3852 ANYOF_BITMAP_SET(ret, value); 3853 } 3854 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n"); 3855 break; 3856 case ANYOF_NCNTRL: 3857 if (LOC) 3858 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL); 3859 else { 3860 for (value = 0; value < 256; value++) 3861 if (!isCNTRL(value)) 3862 ANYOF_BITMAP_SET(ret, value); 3863 } 3864 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n"); 3865 break; 3866 case ANYOF_DIGIT: 3867 if (LOC) 3868 ANYOF_CLASS_SET(ret, ANYOF_DIGIT); 3869 else { 3870 /* consecutive digits assumed */ 3871 for (value = '0'; value <= '9'; value++) 3872 ANYOF_BITMAP_SET(ret, value); 3873 } 3874 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n"); 3875 break; 3876 case ANYOF_NDIGIT: 3877 if (LOC) 3878 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT); 3879 else { 3880 /* consecutive digits assumed */ 3881 for (value = 0; value < '0'; value++) 3882 ANYOF_BITMAP_SET(ret, value); 3883 for (value = '9' + 1; value < 256; value++) 3884 ANYOF_BITMAP_SET(ret, value); 3885 } 3886 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n"); 3887 break; 3888 case ANYOF_GRAPH: 3889 if (LOC) 3890 ANYOF_CLASS_SET(ret, ANYOF_GRAPH); 3891 else { 3892 for (value = 0; value < 256; value++) 3893 if (isGRAPH(value)) 3894 ANYOF_BITMAP_SET(ret, value); 3895 } 3896 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n"); 3897 break; 3898 case ANYOF_NGRAPH: 3899 if (LOC) 3900 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH); 3901 else { 3902 for (value = 0; value < 256; value++) 3903 if (!isGRAPH(value)) 3904 ANYOF_BITMAP_SET(ret, value); 3905 } 3906 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n"); 3907 break; 3908 case ANYOF_LOWER: 3909 if (LOC) 3910 ANYOF_CLASS_SET(ret, ANYOF_LOWER); 3911 else { 3912 for (value = 0; value < 256; value++) 3913 if (isLOWER(value)) 3914 ANYOF_BITMAP_SET(ret, value); 3915 } 3916 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n"); 3917 break; 3918 case ANYOF_NLOWER: 3919 if (LOC) 3920 ANYOF_CLASS_SET(ret, ANYOF_NLOWER); 3921 else { 3922 for (value = 0; value < 256; value++) 3923 if (!isLOWER(value)) 3924 ANYOF_BITMAP_SET(ret, value); 3925 } 3926 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n"); 3927 break; 3928 case ANYOF_PRINT: 3929 if (LOC) 3930 ANYOF_CLASS_SET(ret, ANYOF_PRINT); 3931 else { 3932 for (value = 0; value < 256; value++) 3933 if (isPRINT(value)) 3934 ANYOF_BITMAP_SET(ret, value); 3935 } 3936 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n"); 3937 break; 3938 case ANYOF_NPRINT: 3939 if (LOC) 3940 ANYOF_CLASS_SET(ret, ANYOF_NPRINT); 3941 else { 3942 for (value = 0; value < 256; value++) 3943 if (!isPRINT(value)) 3944 ANYOF_BITMAP_SET(ret, value); 3945 } 3946 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n"); 3947 break; 3948 case ANYOF_PSXSPC: 3949 if (LOC) 3950 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC); 3951 else { 3952 for (value = 0; value < 256; value++) 3953 if (isPSXSPC(value)) 3954 ANYOF_BITMAP_SET(ret, value); 3955 } 3956 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n"); 3957 break; 3958 case ANYOF_NPSXSPC: 3959 if (LOC) 3960 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC); 3961 else { 3962 for (value = 0; value < 256; value++) 3963 if (!isPSXSPC(value)) 3964 ANYOF_BITMAP_SET(ret, value); 3965 } 3966 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n"); 3967 break; 3968 case ANYOF_PUNCT: 3969 if (LOC) 3970 ANYOF_CLASS_SET(ret, ANYOF_PUNCT); 3971 else { 3972 for (value = 0; value < 256; value++) 3973 if (isPUNCT(value)) 3974 ANYOF_BITMAP_SET(ret, value); 3975 } 3976 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n"); 3977 break; 3978 case ANYOF_NPUNCT: 3979 if (LOC) 3980 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT); 3981 else { 3982 for (value = 0; value < 256; value++) 3983 if (!isPUNCT(value)) 3984 ANYOF_BITMAP_SET(ret, value); 3985 } 3986 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n"); 3987 break; 3988 case ANYOF_SPACE: 3989 if (LOC) 3990 ANYOF_CLASS_SET(ret, ANYOF_SPACE); 3991 else { 3992 for (value = 0; value < 256; value++) 3993 if (isSPACE(value)) 3994 ANYOF_BITMAP_SET(ret, value); 3995 } 3996 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n"); 3997 break; 3998 case ANYOF_NSPACE: 3999 if (LOC) 4000 ANYOF_CLASS_SET(ret, ANYOF_NSPACE); 4001 else { 4002 for (value = 0; value < 256; value++) 4003 if (!isSPACE(value)) 4004 ANYOF_BITMAP_SET(ret, value); 4005 } 4006 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n"); 4007 break; 4008 case ANYOF_UPPER: 4009 if (LOC) 4010 ANYOF_CLASS_SET(ret, ANYOF_UPPER); 4011 else { 4012 for (value = 0; value < 256; value++) 4013 if (isUPPER(value)) 4014 ANYOF_BITMAP_SET(ret, value); 4015 } 4016 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n"); 4017 break; 4018 case ANYOF_NUPPER: 4019 if (LOC) 4020 ANYOF_CLASS_SET(ret, ANYOF_NUPPER); 4021 else { 4022 for (value = 0; value < 256; value++) 4023 if (!isUPPER(value)) 4024 ANYOF_BITMAP_SET(ret, value); 4025 } 4026 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n"); 4027 break; 4028 case ANYOF_XDIGIT: 4029 if (LOC) 4030 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT); 4031 else { 4032 for (value = 0; value < 256; value++) 4033 if (isXDIGIT(value)) 4034 ANYOF_BITMAP_SET(ret, value); 4035 } 4036 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n"); 4037 break; 4038 case ANYOF_NXDIGIT: 4039 if (LOC) 4040 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT); 4041 else { 4042 for (value = 0; value < 256; value++) 4043 if (!isXDIGIT(value)) 4044 ANYOF_BITMAP_SET(ret, value); 4045 } 4046 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n"); 4047 break; 4048 default: 4049 vFAIL("Invalid [::] class"); 4050 break; 4051 } 4052 if (LOC) 4053 ANYOF_FLAGS(ret) |= ANYOF_CLASS; 4054 continue; 4055 } 4056 } /* end of namedclass \blah */ 4057 4058 if (range) { 4059 if (prevvalue > (IV)value) /* b-a */ { 4060 Simple_vFAIL4("Invalid [] range \"%*.*s\"", 4061 RExC_parse - rangebegin, 4062 RExC_parse - rangebegin, 4063 rangebegin); 4064 range = 0; /* not a valid range */ 4065 } 4066 } 4067 else { 4068 prevvalue = value; /* save the beginning of the range */ 4069 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end && 4070 RExC_parse[1] != ']') { 4071 RExC_parse++; 4072 4073 /* a bad range like \w-, [:word:]- ? */ 4074 if (namedclass > OOB_NAMEDCLASS) { 4075 if (ckWARN(WARN_REGEXP)) 4076 vWARN4(RExC_parse, 4077 "False [] range \"%*.*s\"", 4078 RExC_parse - rangebegin, 4079 RExC_parse - rangebegin, 4080 rangebegin); 4081 if (!SIZE_ONLY) 4082 ANYOF_BITMAP_SET(ret, '-'); 4083 } else 4084 range = 1; /* yeah, it's a range! */ 4085 continue; /* but do it the next time */ 4086 } 4087 } 4088 4089 /* now is the next time */ 4090 if (!SIZE_ONLY) { 4091 IV i; 4092 4093 if (prevvalue < 256) { 4094 IV ceilvalue = value < 256 ? value : 255; 4095 4096 #ifdef EBCDIC 4097 /* In EBCDIC [\x89-\x91] should include 4098 * the \x8e but [i-j] should not. */ 4099 if (literal_endpoint == 2 && 4100 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) || 4101 (isUPPER(prevvalue) && isUPPER(ceilvalue)))) 4102 { 4103 if (isLOWER(prevvalue)) { 4104 for (i = prevvalue; i <= ceilvalue; i++) 4105 if (isLOWER(i)) 4106 ANYOF_BITMAP_SET(ret, i); 4107 } else { 4108 for (i = prevvalue; i <= ceilvalue; i++) 4109 if (isUPPER(i)) 4110 ANYOF_BITMAP_SET(ret, i); 4111 } 4112 } 4113 else 4114 #endif 4115 for (i = prevvalue; i <= ceilvalue; i++) 4116 ANYOF_BITMAP_SET(ret, i); 4117 } 4118 if (value > 255 || UTF) { 4119 UV prevnatvalue = NATIVE_TO_UNI(prevvalue); 4120 UV natvalue = NATIVE_TO_UNI(value); 4121 4122 ANYOF_FLAGS(ret) |= ANYOF_UNICODE; 4123 if (prevnatvalue < natvalue) { /* what about > ? */ 4124 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n", 4125 prevnatvalue, natvalue); 4126 } 4127 else if (prevnatvalue == natvalue) { 4128 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue); 4129 if (FOLD) { 4130 U8 foldbuf[UTF8_MAXLEN_FOLD+1]; 4131 STRLEN foldlen; 4132 UV f = to_uni_fold(natvalue, foldbuf, &foldlen); 4133 4134 /* If folding and foldable and a single 4135 * character, insert also the folded version 4136 * to the charclass. */ 4137 if (f != value) { 4138 if (foldlen == (STRLEN)UNISKIP(f)) 4139 Perl_sv_catpvf(aTHX_ listsv, 4140 "%04"UVxf"\n", f); 4141 else { 4142 /* Any multicharacter foldings 4143 * require the following transform: 4144 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst) 4145 * where E folds into "pq" and F folds 4146 * into "rst", all other characters 4147 * fold to single characters. We save 4148 * away these multicharacter foldings, 4149 * to be later saved as part of the 4150 * additional "s" data. */ 4151 SV *sv; 4152 4153 if (!unicode_alternate) 4154 unicode_alternate = newAV(); 4155 sv = newSVpvn((char*)foldbuf, foldlen); 4156 SvUTF8_on(sv); 4157 av_push(unicode_alternate, sv); 4158 } 4159 } 4160 4161 /* If folding and the value is one of the Greek 4162 * sigmas insert a few more sigmas to make the 4163 * folding rules of the sigmas to work right. 4164 * Note that not all the possible combinations 4165 * are handled here: some of them are handled 4166 * by the standard folding rules, and some of 4167 * them (literal or EXACTF cases) are handled 4168 * during runtime in regexec.c:S_find_byclass(). */ 4169 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) { 4170 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", 4171 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA); 4172 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", 4173 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA); 4174 } 4175 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA) 4176 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", 4177 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA); 4178 } 4179 } 4180 } 4181 #ifdef EBCDIC 4182 literal_endpoint = 0; 4183 #endif 4184 } 4185 4186 range = 0; /* this range (if it was one) is done now */ 4187 } 4188 4189 if (need_class) { 4190 ANYOF_FLAGS(ret) |= ANYOF_LARGE; 4191 if (SIZE_ONLY) 4192 RExC_size += ANYOF_CLASS_ADD_SKIP; 4193 else 4194 RExC_emit += ANYOF_CLASS_ADD_SKIP; 4195 } 4196 4197 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */ 4198 if (!SIZE_ONLY && 4199 /* If the only flag is folding (plus possibly inversion). */ 4200 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD) 4201 ) { 4202 for (value = 0; value < 256; ++value) { 4203 if (ANYOF_BITMAP_TEST(ret, value)) { 4204 UV fold = PL_fold[value]; 4205 4206 if (fold != value) 4207 ANYOF_BITMAP_SET(ret, fold); 4208 } 4209 } 4210 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD; 4211 } 4212 4213 /* optimize inverted simple patterns (e.g. [^a-z]) */ 4214 if (!SIZE_ONLY && optimize_invert && 4215 /* If the only flag is inversion. */ 4216 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) { 4217 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value) 4218 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL; 4219 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL; 4220 } 4221 4222 if (!SIZE_ONLY) { 4223 AV *av = newAV(); 4224 SV *rv; 4225 4226 /* The 0th element stores the character class description 4227 * in its textual form: used later (regexec.c:Perl_regclass_swatch()) 4228 * to initialize the appropriate swash (which gets stored in 4229 * the 1st element), and also useful for dumping the regnode. 4230 * The 2nd element stores the multicharacter foldings, 4231 * used later (regexec.c:s_reginclasslen()). */ 4232 av_store(av, 0, listsv); 4233 av_store(av, 1, NULL); 4234 av_store(av, 2, (SV*)unicode_alternate); 4235 rv = newRV_noinc((SV*)av); 4236 n = add_data(pRExC_state, 1, "s"); 4237 RExC_rx->data->data[n] = (void*)rv; 4238 ARG_SET(ret, n); 4239 } 4240 4241 return ret; 4242 } 4243 4244 STATIC char* 4245 S_nextchar(pTHX_ RExC_state_t *pRExC_state) 4246 { 4247 char* retval = RExC_parse++; 4248 4249 for (;;) { 4250 if (*RExC_parse == '(' && RExC_parse[1] == '?' && 4251 RExC_parse[2] == '#') { 4252 while (*RExC_parse && *RExC_parse != ')') 4253 RExC_parse++; 4254 RExC_parse++; 4255 continue; 4256 } 4257 if (RExC_flags & PMf_EXTENDED) { 4258 if (isSPACE(*RExC_parse)) { 4259 RExC_parse++; 4260 continue; 4261 } 4262 else if (*RExC_parse == '#') { 4263 while (*RExC_parse && *RExC_parse != '\n') 4264 RExC_parse++; 4265 RExC_parse++; 4266 continue; 4267 } 4268 } 4269 return retval; 4270 } 4271 } 4272 4273 /* 4274 - reg_node - emit a node 4275 */ 4276 STATIC regnode * /* Location. */ 4277 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) 4278 { 4279 register regnode *ret; 4280 register regnode *ptr; 4281 4282 ret = RExC_emit; 4283 if (SIZE_ONLY) { 4284 SIZE_ALIGN(RExC_size); 4285 RExC_size += 1; 4286 return(ret); 4287 } 4288 4289 NODE_ALIGN_FILL(ret); 4290 ptr = ret; 4291 FILL_ADVANCE_NODE(ptr, op); 4292 if (RExC_offsets) { /* MJD */ 4293 MJD_OFFSET_DEBUG((stderr, "%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n", 4294 "reg_node", __LINE__, 4295 reg_name[op], 4296 RExC_emit - RExC_emit_start > RExC_offsets[0] 4297 ? "Overwriting end of array!\n" : "OK", 4298 RExC_emit - RExC_emit_start, 4299 RExC_parse - RExC_start, 4300 RExC_offsets[0])); 4301 Set_Node_Offset(RExC_emit, RExC_parse + (op == END)); 4302 } 4303 4304 RExC_emit = ptr; 4305 4306 return(ret); 4307 } 4308 4309 /* 4310 - reganode - emit a node with an argument 4311 */ 4312 STATIC regnode * /* Location. */ 4313 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) 4314 { 4315 register regnode *ret; 4316 register regnode *ptr; 4317 4318 ret = RExC_emit; 4319 if (SIZE_ONLY) { 4320 SIZE_ALIGN(RExC_size); 4321 RExC_size += 2; 4322 return(ret); 4323 } 4324 4325 NODE_ALIGN_FILL(ret); 4326 ptr = ret; 4327 FILL_ADVANCE_NODE_ARG(ptr, op, arg); 4328 if (RExC_offsets) { /* MJD */ 4329 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n", 4330 "reganode", 4331 RExC_emit - RExC_emit_start > RExC_offsets[0] ? 4332 "Overwriting end of array!\n" : "OK", 4333 RExC_emit - RExC_emit_start, 4334 RExC_parse - RExC_start, 4335 RExC_offsets[0])); 4336 Set_Cur_Node_Offset; 4337 } 4338 4339 RExC_emit = ptr; 4340 4341 return(ret); 4342 } 4343 4344 /* 4345 - reguni - emit (if appropriate) a Unicode character 4346 */ 4347 STATIC void 4348 S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp) 4349 { 4350 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s); 4351 } 4352 4353 /* 4354 - reginsert - insert an operator in front of already-emitted operand 4355 * 4356 * Means relocating the operand. 4357 */ 4358 STATIC void 4359 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd) 4360 { 4361 register regnode *src; 4362 register regnode *dst; 4363 register regnode *place; 4364 register int offset = regarglen[(U8)op]; 4365 4366 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */ 4367 4368 if (SIZE_ONLY) { 4369 RExC_size += NODE_STEP_REGNODE + offset; 4370 return; 4371 } 4372 4373 src = RExC_emit; 4374 RExC_emit += NODE_STEP_REGNODE + offset; 4375 dst = RExC_emit; 4376 while (src > opnd) { 4377 StructCopy(--src, --dst, regnode); 4378 if (RExC_offsets) { /* MJD 20010112 */ 4379 MJD_OFFSET_DEBUG((stderr, "%s: %s copy %u -> %u (max %u).\n", 4380 "reg_insert", 4381 dst - RExC_emit_start > RExC_offsets[0] 4382 ? "Overwriting end of array!\n" : "OK", 4383 src - RExC_emit_start, 4384 dst - RExC_emit_start, 4385 RExC_offsets[0])); 4386 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src)); 4387 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src)); 4388 } 4389 } 4390 4391 4392 place = opnd; /* Op node, where operand used to be. */ 4393 if (RExC_offsets) { /* MJD */ 4394 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n", 4395 "reginsert", 4396 place - RExC_emit_start > RExC_offsets[0] 4397 ? "Overwriting end of array!\n" : "OK", 4398 place - RExC_emit_start, 4399 RExC_parse - RExC_start, 4400 RExC_offsets[0])); 4401 Set_Node_Offset(place, RExC_parse); 4402 } 4403 src = NEXTOPER(place); 4404 FILL_ADVANCE_NODE(place, op); 4405 Zero(src, offset, regnode); 4406 } 4407 4408 /* 4409 - regtail - set the next-pointer at the end of a node chain of p to val. 4410 */ 4411 STATIC void 4412 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val) 4413 { 4414 register regnode *scan; 4415 register regnode *temp; 4416 4417 if (SIZE_ONLY) 4418 return; 4419 4420 /* Find last node. */ 4421 scan = p; 4422 for (;;) { 4423 temp = regnext(scan); 4424 if (temp == NULL) 4425 break; 4426 scan = temp; 4427 } 4428 4429 if (reg_off_by_arg[OP(scan)]) { 4430 ARG_SET(scan, val - scan); 4431 } 4432 else { 4433 NEXT_OFF(scan) = val - scan; 4434 } 4435 } 4436 4437 /* 4438 - regoptail - regtail on operand of first argument; nop if operandless 4439 */ 4440 STATIC void 4441 S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val) 4442 { 4443 /* "Operandless" and "op != BRANCH" are synonymous in practice. */ 4444 if (p == NULL || SIZE_ONLY) 4445 return; 4446 if (PL_regkind[(U8)OP(p)] == BRANCH) { 4447 regtail(pRExC_state, NEXTOPER(p), val); 4448 } 4449 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) { 4450 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val); 4451 } 4452 else 4453 return; 4454 } 4455 4456 /* 4457 - regcurly - a little FSA that accepts {\d+,?\d*} 4458 */ 4459 STATIC I32 4460 S_regcurly(pTHX_ register char *s) 4461 { 4462 if (*s++ != '{') 4463 return FALSE; 4464 if (!isDIGIT(*s)) 4465 return FALSE; 4466 while (isDIGIT(*s)) 4467 s++; 4468 if (*s == ',') 4469 s++; 4470 while (isDIGIT(*s)) 4471 s++; 4472 if (*s != '}') 4473 return FALSE; 4474 return TRUE; 4475 } 4476 4477 4478 #ifdef DEBUGGING 4479 4480 STATIC regnode * 4481 S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l) 4482 { 4483 register U8 op = EXACT; /* Arbitrary non-END op. */ 4484 register regnode *next; 4485 4486 while (op != END && (!last || node < last)) { 4487 /* While that wasn't END last time... */ 4488 4489 NODE_ALIGN(node); 4490 op = OP(node); 4491 if (op == CLOSE) 4492 l--; 4493 next = regnext(node); 4494 /* Where, what. */ 4495 if (OP(node) == OPTIMIZED) 4496 goto after_print; 4497 regprop(sv, node); 4498 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start), 4499 (int)(2*l + 1), "", SvPVX(sv)); 4500 if (next == NULL) /* Next ptr. */ 4501 PerlIO_printf(Perl_debug_log, "(0)"); 4502 else 4503 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start)); 4504 (void)PerlIO_putc(Perl_debug_log, '\n'); 4505 after_print: 4506 if (PL_regkind[(U8)op] == BRANCHJ) { 4507 register regnode *nnode = (OP(next) == LONGJMP 4508 ? regnext(next) 4509 : next); 4510 if (last && nnode > last) 4511 nnode = last; 4512 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1); 4513 } 4514 else if (PL_regkind[(U8)op] == BRANCH) { 4515 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1); 4516 } 4517 else if ( op == CURLY) { /* `next' might be very big: optimizer */ 4518 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS, 4519 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1); 4520 } 4521 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) { 4522 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS, 4523 next, sv, l + 1); 4524 } 4525 else if ( op == PLUS || op == STAR) { 4526 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1); 4527 } 4528 else if (op == ANYOF) { 4529 /* arglen 1 + class block */ 4530 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE) 4531 ? ANYOF_CLASS_SKIP : ANYOF_SKIP); 4532 node = NEXTOPER(node); 4533 } 4534 else if (PL_regkind[(U8)op] == EXACT) { 4535 /* Literal string, where present. */ 4536 node += NODE_SZ_STR(node) - 1; 4537 node = NEXTOPER(node); 4538 } 4539 else { 4540 node = NEXTOPER(node); 4541 node += regarglen[(U8)op]; 4542 } 4543 if (op == CURLYX || op == OPEN) 4544 l++; 4545 else if (op == WHILEM) 4546 l--; 4547 } 4548 return node; 4549 } 4550 4551 #endif /* DEBUGGING */ 4552 4553 /* 4554 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form 4555 */ 4556 void 4557 Perl_regdump(pTHX_ regexp *r) 4558 { 4559 #ifdef DEBUGGING 4560 SV *sv = sv_newmortal(); 4561 4562 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0); 4563 4564 /* Header fields of interest. */ 4565 if (r->anchored_substr) 4566 PerlIO_printf(Perl_debug_log, 4567 "anchored `%s%.*s%s'%s at %"IVdf" ", 4568 PL_colors[0], 4569 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)), 4570 SvPVX(r->anchored_substr), 4571 PL_colors[1], 4572 SvTAIL(r->anchored_substr) ? "$" : "", 4573 (IV)r->anchored_offset); 4574 else if (r->anchored_utf8) 4575 PerlIO_printf(Perl_debug_log, 4576 "anchored utf8 `%s%.*s%s'%s at %"IVdf" ", 4577 PL_colors[0], 4578 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)), 4579 SvPVX(r->anchored_utf8), 4580 PL_colors[1], 4581 SvTAIL(r->anchored_utf8) ? "$" : "", 4582 (IV)r->anchored_offset); 4583 if (r->float_substr) 4584 PerlIO_printf(Perl_debug_log, 4585 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ", 4586 PL_colors[0], 4587 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)), 4588 SvPVX(r->float_substr), 4589 PL_colors[1], 4590 SvTAIL(r->float_substr) ? "$" : "", 4591 (IV)r->float_min_offset, (UV)r->float_max_offset); 4592 else if (r->float_utf8) 4593 PerlIO_printf(Perl_debug_log, 4594 "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ", 4595 PL_colors[0], 4596 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)), 4597 SvPVX(r->float_utf8), 4598 PL_colors[1], 4599 SvTAIL(r->float_utf8) ? "$" : "", 4600 (IV)r->float_min_offset, (UV)r->float_max_offset); 4601 if (r->check_substr || r->check_utf8) 4602 PerlIO_printf(Perl_debug_log, 4603 r->check_substr == r->float_substr 4604 && r->check_utf8 == r->float_utf8 4605 ? "(checking floating" : "(checking anchored"); 4606 if (r->reganch & ROPT_NOSCAN) 4607 PerlIO_printf(Perl_debug_log, " noscan"); 4608 if (r->reganch & ROPT_CHECK_ALL) 4609 PerlIO_printf(Perl_debug_log, " isall"); 4610 if (r->check_substr || r->check_utf8) 4611 PerlIO_printf(Perl_debug_log, ") "); 4612 4613 if (r->regstclass) { 4614 regprop(sv, r->regstclass); 4615 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv)); 4616 } 4617 if (r->reganch & ROPT_ANCH) { 4618 PerlIO_printf(Perl_debug_log, "anchored"); 4619 if (r->reganch & ROPT_ANCH_BOL) 4620 PerlIO_printf(Perl_debug_log, "(BOL)"); 4621 if (r->reganch & ROPT_ANCH_MBOL) 4622 PerlIO_printf(Perl_debug_log, "(MBOL)"); 4623 if (r->reganch & ROPT_ANCH_SBOL) 4624 PerlIO_printf(Perl_debug_log, "(SBOL)"); 4625 if (r->reganch & ROPT_ANCH_GPOS) 4626 PerlIO_printf(Perl_debug_log, "(GPOS)"); 4627 PerlIO_putc(Perl_debug_log, ' '); 4628 } 4629 if (r->reganch & ROPT_GPOS_SEEN) 4630 PerlIO_printf(Perl_debug_log, "GPOS "); 4631 if (r->reganch & ROPT_SKIP) 4632 PerlIO_printf(Perl_debug_log, "plus "); 4633 if (r->reganch & ROPT_IMPLICIT) 4634 PerlIO_printf(Perl_debug_log, "implicit "); 4635 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen); 4636 if (r->reganch & ROPT_EVAL_SEEN) 4637 PerlIO_printf(Perl_debug_log, "with eval "); 4638 PerlIO_printf(Perl_debug_log, "\n"); 4639 if (r->offsets) { 4640 U32 i; 4641 U32 len = r->offsets[0]; 4642 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]); 4643 for (i = 1; i <= len; i++) 4644 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ", 4645 (UV)r->offsets[i*2-1], 4646 (UV)r->offsets[i*2]); 4647 PerlIO_printf(Perl_debug_log, "\n"); 4648 } 4649 #endif /* DEBUGGING */ 4650 } 4651 4652 #ifdef DEBUGGING 4653 4654 STATIC void 4655 S_put_byte(pTHX_ SV *sv, int c) 4656 { 4657 if (isCNTRL(c) || c == 255 || !isPRINT(c)) 4658 Perl_sv_catpvf(aTHX_ sv, "\\%o", c); 4659 else if (c == '-' || c == ']' || c == '\\' || c == '^') 4660 Perl_sv_catpvf(aTHX_ sv, "\\%c", c); 4661 else 4662 Perl_sv_catpvf(aTHX_ sv, "%c", c); 4663 } 4664 4665 #endif /* DEBUGGING */ 4666 4667 /* 4668 - regprop - printable representation of opcode 4669 */ 4670 void 4671 Perl_regprop(pTHX_ SV *sv, regnode *o) 4672 { 4673 #ifdef DEBUGGING 4674 register int k; 4675 4676 sv_setpvn(sv, "", 0); 4677 if (OP(o) >= reg_num) /* regnode.type is unsigned */ 4678 /* It would be nice to FAIL() here, but this may be called from 4679 regexec.c, and it would be hard to supply pRExC_state. */ 4680 Perl_croak(aTHX_ "Corrupted regexp opcode"); 4681 sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */ 4682 4683 k = PL_regkind[(U8)OP(o)]; 4684 4685 if (k == EXACT) { 4686 SV *dsv = sv_2mortal(newSVpvn("", 0)); 4687 /* Using is_utf8_string() is a crude hack but it may 4688 * be the best for now since we have no flag "this EXACTish 4689 * node was UTF-8" --jhi */ 4690 bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o)); 4691 char *s = do_utf8 ? 4692 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60, 4693 UNI_DISPLAY_REGEX) : 4694 STRING(o); 4695 int len = do_utf8 ? 4696 strlen(s) : 4697 STR_LEN(o); 4698 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>", 4699 PL_colors[0], 4700 len, s, 4701 PL_colors[1]); 4702 } 4703 else if (k == CURLY) { 4704 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX) 4705 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */ 4706 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o)); 4707 } 4708 else if (k == WHILEM && o->flags) /* Ordinal/of */ 4709 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4); 4710 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP ) 4711 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */ 4712 else if (k == LOGICAL) 4713 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */ 4714 else if (k == ANYOF) { 4715 int i, rangestart = -1; 4716 U8 flags = ANYOF_FLAGS(o); 4717 const char * const anyofs[] = { /* Should be synchronized with 4718 * ANYOF_ #xdefines in regcomp.h */ 4719 "\\w", 4720 "\\W", 4721 "\\s", 4722 "\\S", 4723 "\\d", 4724 "\\D", 4725 "[:alnum:]", 4726 "[:^alnum:]", 4727 "[:alpha:]", 4728 "[:^alpha:]", 4729 "[:ascii:]", 4730 "[:^ascii:]", 4731 "[:ctrl:]", 4732 "[:^ctrl:]", 4733 "[:graph:]", 4734 "[:^graph:]", 4735 "[:lower:]", 4736 "[:^lower:]", 4737 "[:print:]", 4738 "[:^print:]", 4739 "[:punct:]", 4740 "[:^punct:]", 4741 "[:upper:]", 4742 "[:^upper:]", 4743 "[:xdigit:]", 4744 "[:^xdigit:]", 4745 "[:space:]", 4746 "[:^space:]", 4747 "[:blank:]", 4748 "[:^blank:]" 4749 }; 4750 4751 if (flags & ANYOF_LOCALE) 4752 sv_catpv(sv, "{loc}"); 4753 if (flags & ANYOF_FOLD) 4754 sv_catpv(sv, "{i}"); 4755 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); 4756 if (flags & ANYOF_INVERT) 4757 sv_catpv(sv, "^"); 4758 for (i = 0; i <= 256; i++) { 4759 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) { 4760 if (rangestart == -1) 4761 rangestart = i; 4762 } else if (rangestart != -1) { 4763 if (i <= rangestart + 3) 4764 for (; rangestart < i; rangestart++) 4765 put_byte(sv, rangestart); 4766 else { 4767 put_byte(sv, rangestart); 4768 sv_catpv(sv, "-"); 4769 put_byte(sv, i - 1); 4770 } 4771 rangestart = -1; 4772 } 4773 } 4774 4775 if (o->flags & ANYOF_CLASS) 4776 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++) 4777 if (ANYOF_CLASS_TEST(o,i)) 4778 sv_catpv(sv, anyofs[i]); 4779 4780 if (flags & ANYOF_UNICODE) 4781 sv_catpv(sv, "{unicode}"); 4782 else if (flags & ANYOF_UNICODE_ALL) 4783 sv_catpv(sv, "{unicode_all}"); 4784 4785 { 4786 SV *lv; 4787 SV *sw = regclass_swash(o, FALSE, &lv, 0); 4788 4789 if (lv) { 4790 if (sw) { 4791 U8 s[UTF8_MAXLEN+1]; 4792 4793 for (i = 0; i <= 256; i++) { /* just the first 256 */ 4794 U8 *e = uvchr_to_utf8(s, i); 4795 4796 if (i < 256 && swash_fetch(sw, s, TRUE)) { 4797 if (rangestart == -1) 4798 rangestart = i; 4799 } else if (rangestart != -1) { 4800 U8 *p; 4801 4802 if (i <= rangestart + 3) 4803 for (; rangestart < i; rangestart++) { 4804 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++) 4805 put_byte(sv, *p); 4806 } 4807 else { 4808 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++) 4809 put_byte(sv, *p); 4810 sv_catpv(sv, "-"); 4811 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++) 4812 put_byte(sv, *p); 4813 } 4814 rangestart = -1; 4815 } 4816 } 4817 4818 sv_catpv(sv, "..."); /* et cetera */ 4819 } 4820 4821 { 4822 char *s = savepv(SvPVX(lv)); 4823 char *origs = s; 4824 4825 while(*s && *s != '\n') s++; 4826 4827 if (*s == '\n') { 4828 char *t = ++s; 4829 4830 while (*s) { 4831 if (*s == '\n') 4832 *s = ' '; 4833 s++; 4834 } 4835 if (s[-1] == ' ') 4836 s[-1] = 0; 4837 4838 sv_catpv(sv, t); 4839 } 4840 4841 Safefree(origs); 4842 } 4843 } 4844 } 4845 4846 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]); 4847 } 4848 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) 4849 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags); 4850 #endif /* DEBUGGING */ 4851 } 4852 4853 SV * 4854 Perl_re_intuit_string(pTHX_ regexp *prog) 4855 { /* Assume that RE_INTUIT is set */ 4856 DEBUG_r( 4857 { STRLEN n_a; 4858 char *s = SvPV(prog->check_substr 4859 ? prog->check_substr : prog->check_utf8, n_a); 4860 4861 if (!PL_colorset) reginitcolors(); 4862 PerlIO_printf(Perl_debug_log, 4863 "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n", 4864 PL_colors[4], 4865 prog->check_substr ? "" : "utf8 ", 4866 PL_colors[5],PL_colors[0], 4867 s, 4868 PL_colors[1], 4869 (strlen(s) > 60 ? "..." : "")); 4870 } ); 4871 4872 return prog->check_substr ? prog->check_substr : prog->check_utf8; 4873 } 4874 4875 void 4876 Perl_pregfree(pTHX_ struct regexp *r) 4877 { 4878 #ifdef DEBUGGING 4879 SV *dsv = PERL_DEBUG_PAD_ZERO(0); 4880 #endif 4881 4882 if (!r || (--r->refcnt > 0)) 4883 return; 4884 DEBUG_r({ 4885 int len; 4886 char *s; 4887 4888 s = (r->reganch & ROPT_UTF8) ? pv_uni_display(dsv, (U8*)r->precomp, 4889 r->prelen, 60, UNI_DISPLAY_REGEX) 4890 : pv_display(dsv, r->precomp, r->prelen, 0, 60); 4891 len = SvCUR(dsv); 4892 if (!PL_colorset) 4893 reginitcolors(); 4894 PerlIO_printf(Perl_debug_log, 4895 "%sFreeing REx:%s `%s%*.*s%s%s'\n", 4896 PL_colors[4],PL_colors[5],PL_colors[0], 4897 len, len, s, 4898 PL_colors[1], 4899 len > 60 ? "..." : ""); 4900 }); 4901 4902 if (r->precomp) 4903 Safefree(r->precomp); 4904 if (r->offsets) /* 20010421 MJD */ 4905 Safefree(r->offsets); 4906 if (RX_MATCH_COPIED(r)) 4907 Safefree(r->subbeg); 4908 if (r->substrs) { 4909 if (r->anchored_substr) 4910 SvREFCNT_dec(r->anchored_substr); 4911 if (r->anchored_utf8) 4912 SvREFCNT_dec(r->anchored_utf8); 4913 if (r->float_substr) 4914 SvREFCNT_dec(r->float_substr); 4915 if (r->float_utf8) 4916 SvREFCNT_dec(r->float_utf8); 4917 Safefree(r->substrs); 4918 } 4919 if (r->data) { 4920 int n = r->data->count; 4921 AV* new_comppad = NULL; 4922 AV* old_comppad; 4923 SV** old_curpad; 4924 4925 while (--n >= 0) { 4926 /* If you add a ->what type here, update the comment in regcomp.h */ 4927 switch (r->data->what[n]) { 4928 case 's': 4929 SvREFCNT_dec((SV*)r->data->data[n]); 4930 break; 4931 case 'f': 4932 Safefree(r->data->data[n]); 4933 break; 4934 case 'p': 4935 new_comppad = (AV*)r->data->data[n]; 4936 break; 4937 case 'o': 4938 if (new_comppad == NULL) 4939 Perl_croak(aTHX_ "panic: pregfree comppad"); 4940 old_comppad = PL_comppad; 4941 old_curpad = PL_curpad; 4942 /* Watch out for global destruction's random ordering. */ 4943 if (SvTYPE(new_comppad) == SVt_PVAV) { 4944 PL_comppad = new_comppad; 4945 PL_curpad = AvARRAY(new_comppad); 4946 } 4947 else 4948 PL_curpad = NULL; 4949 4950 if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) { 4951 op_free((OP_4tree*)r->data->data[n]); 4952 } 4953 4954 PL_comppad = old_comppad; 4955 PL_curpad = old_curpad; 4956 SvREFCNT_dec((SV*)new_comppad); 4957 new_comppad = NULL; 4958 break; 4959 case 'n': 4960 break; 4961 default: 4962 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]); 4963 } 4964 } 4965 Safefree(r->data->what); 4966 Safefree(r->data); 4967 } 4968 Safefree(r->startp); 4969 Safefree(r->endp); 4970 Safefree(r); 4971 } 4972 4973 /* 4974 - regnext - dig the "next" pointer out of a node 4975 * 4976 * [Note, when REGALIGN is defined there are two places in regmatch() 4977 * that bypass this code for speed.] 4978 */ 4979 regnode * 4980 Perl_regnext(pTHX_ register regnode *p) 4981 { 4982 register I32 offset; 4983 4984 if (p == &PL_regdummy) 4985 return(NULL); 4986 4987 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p)); 4988 if (offset == 0) 4989 return(NULL); 4990 4991 return(p+offset); 4992 } 4993 4994 STATIC void 4995 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...) 4996 { 4997 va_list args; 4998 STRLEN l1 = strlen(pat1); 4999 STRLEN l2 = strlen(pat2); 5000 char buf[512]; 5001 SV *msv; 5002 char *message; 5003 5004 if (l1 > 510) 5005 l1 = 510; 5006 if (l1 + l2 > 510) 5007 l2 = 510 - l1; 5008 Copy(pat1, buf, l1 , char); 5009 Copy(pat2, buf + l1, l2 , char); 5010 buf[l1 + l2] = '\n'; 5011 buf[l1 + l2 + 1] = '\0'; 5012 #ifdef I_STDARG 5013 /* ANSI variant takes additional second argument */ 5014 va_start(args, pat2); 5015 #else 5016 va_start(args); 5017 #endif 5018 msv = vmess(buf, &args); 5019 va_end(args); 5020 message = SvPV(msv,l1); 5021 if (l1 > 512) 5022 l1 = 512; 5023 Copy(message, buf, l1 , char); 5024 buf[l1] = '\0'; /* Overwrite \n */ 5025 Perl_croak(aTHX_ "%s", buf); 5026 } 5027 5028 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */ 5029 5030 void 5031 Perl_save_re_context(pTHX) 5032 { 5033 #if 0 5034 SAVEPPTR(RExC_precomp); /* uncompiled string. */ 5035 SAVEI32(RExC_npar); /* () count. */ 5036 SAVEI32(RExC_size); /* Code size. */ 5037 SAVEI32(RExC_flags); /* are we folding, multilining? */ 5038 SAVEVPTR(RExC_rx); /* from regcomp.c */ 5039 SAVEI32(RExC_seen); /* from regcomp.c */ 5040 SAVEI32(RExC_sawback); /* Did we see \1, ...? */ 5041 SAVEI32(RExC_naughty); /* How bad is this pattern? */ 5042 SAVEVPTR(RExC_emit); /* Code-emit pointer; ®dummy = don't */ 5043 SAVEPPTR(RExC_end); /* End of input for compile */ 5044 SAVEPPTR(RExC_parse); /* Input-scan pointer. */ 5045 #endif 5046 5047 SAVEI32(PL_reg_flags); /* from regexec.c */ 5048 SAVEPPTR(PL_bostr); 5049 SAVEPPTR(PL_reginput); /* String-input pointer. */ 5050 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */ 5051 SAVEPPTR(PL_regeol); /* End of input, for $ check. */ 5052 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */ 5053 SAVEVPTR(PL_regendp); /* Ditto for endp. */ 5054 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */ 5055 SAVEPPTR(PL_regtill); /* How far we are required to go. */ 5056 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */ 5057 PL_reg_start_tmp = 0; 5058 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */ 5059 PL_reg_start_tmpl = 0; 5060 SAVEVPTR(PL_regdata); 5061 SAVEI32(PL_reg_eval_set); /* from regexec.c */ 5062 SAVEI32(PL_regnarrate); /* from regexec.c */ 5063 SAVEVPTR(PL_regprogram); /* from regexec.c */ 5064 SAVEINT(PL_regindent); /* from regexec.c */ 5065 SAVEVPTR(PL_regcc); /* from regexec.c */ 5066 SAVEVPTR(PL_curcop); 5067 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */ 5068 SAVEVPTR(PL_reg_re); /* from regexec.c */ 5069 SAVEPPTR(PL_reg_ganch); /* from regexec.c */ 5070 SAVESPTR(PL_reg_sv); /* from regexec.c */ 5071 SAVEI8(PL_reg_match_utf8); /* from regexec.c */ 5072 SAVEVPTR(PL_reg_magic); /* from regexec.c */ 5073 SAVEI32(PL_reg_oldpos); /* from regexec.c */ 5074 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */ 5075 SAVEVPTR(PL_reg_curpm); /* from regexec.c */ 5076 SAVEI32(PL_regnpar); /* () count. */ 5077 SAVEI32(PL_regsize); /* from regexec.c */ 5078 #ifdef DEBUGGING 5079 SAVEPPTR(PL_reg_starttry); /* from regexec.c */ 5080 #endif 5081 } 5082 5083 static void 5084 clear_re(pTHX_ void *r) 5085 { 5086 ReREFCNT_dec((regexp *)r); 5087 } 5088 5089